XNU内核（八）BSD系统调用过程代码简单分析

（一）首先，系统调用有两种方式：

　　　　中断向量方式：通过0x80、0x81、0x82三个中断号触发；
　　　　专用指令方式（至少分Intel架构和ARM架构），比如Intel架构的SYSENTER/SYSCALL。

（二）话分两头，先说中断向量方式

　　这是中断向量定义的部分代码：

/*
 * Excerpt of the interrupt vector definitions. Vectors 0x80, 0x81 and 0x82
 * are declared with USER_TRAP_SPC and name idt64_unix_scall,
 * idt64_mach_scall and idt64_mdep_scall as their entry points; 0x7f is a
 * USER_TRAP for dtrace, and the remaining vectors shown use INTERRUPT().
 */
INTERRUPT(0x7d)
INTERRUPT(0x7e)
USER_TRAP(0x7f, idt64_dtrace_ret) /* Required by dtrace "fasttrap" */

USER_TRAP_SPC(0x80,idt64_unix_scall)
USER_TRAP_SPC(0x81,idt64_mach_scall)
USER_TRAP_SPC(0x82,idt64_mdep_scall)

INTERRUPT(0x83)
INTERRUPT(0x84)
INTERRUPT(0x85)
INTERRUPT(0x86)

　　（BSD风格的系统调用，中断号就是0x80）

　　触发中断以及后面的逻辑，都在汇编文件idt64.s中实现，下面简单看看：

/*
 * System call handlers.
 * These are entered via a syscall interrupt. The system call number in %rax
 * is saved to the error code slot in the stack frame. We then branch to the
 * common state saving code.
 */

#ifndef UNIX_INT
#error NO UNIX INT!!!
#endif
/*
 * idt64_unix_scall: entry point named by USER_TRAP_SPC(0x80, ...) in the
 * vector definitions. It builds the top of the trap frame by pushing, in
 * order: the system call number from %rax, the address of HNDL_UNIX_SCALL,
 * and the UNIX_INT constant.
 * NOTE(review): this excerpt ends after the last push; the branch to the
 * common state saving code is not shown here.
 */
Entry(idt64_unix_scall)
    swapgs                /* switch to kernel gs (cpu_data) */
    pushq    %rax            /* save system call number */
    /* push the address of the handler; PUSH_FUNCTION leaves %rax unchanged */
    PUSH_FUNCTION(HNDL_UNIX_SCALL)
    /* push the UNIX_INT constant (presumably the trap number slot -- TODO confirm) */
    pushq    $(UNIX_INT)

　　接下来执行PUSH_FUNCTION(HNDL_UNIX_SCALL)，先展开PUSH_FUNCTION看看：

/*
 * PUSH_FUNCTION(func): leave the address of `func` on top of the stack.
 *
 * Active (#if 1) variant:
 *   1. sub  $8, %rsp          reserve an 8-byte slot for the address
 *   2. push %rax              save %rax (the reserved slot is now at 8(%rsp))
 *   3. leaq func(%rip), %rax  compute func's address RIP-relatively
 *   4. movq %rax, 8(%rsp)     store the address into the reserved slot
 *   5. pop  %rax              restore %rax; the slot is now the stack top
 * The net effect is one 8-byte push of func's address with %rax unchanged.
 *
 * Disabled (#else) variant: a single `pushq func`.
 */
#if 1
#define PUSH_FUNCTION(func)                  sub    $8, %rsp            ;\
    push    %rax                ;\
    leaq    func(%rip), %rax        ;\
    movq    %rax, 8(%rsp)            ;\
    pop    %rax
#else
#define PUSH_FUNCTION(func) pushq func
#endif

　　系统调用号，在寄存器RAX，接下来看看HNDL_UNIX_SCALL：

/*
 * hndl_unix_scall: handler for BSD (unix) system calls that arrive through
 * the interrupt-vector path. It loads the current thread and task, bumps the
 * thread's unix syscall counter, checks the task's vtimers, enables
 * interrupts and calls the C function unix_syscall with %r15 as its only
 * argument.
 * NOTE(review): presumably HNDL_UNIX_SCALL, pushed by idt64_unix_scall, is
 * an alias for this label -- confirm against the macro definitions.
 */
Entry(hndl_unix_scall)

        TIME_TRAP_UENTRY

    movq    %gs:CPU_ACTIVE_THREAD,%rcx    /* get current thread     */
    movq    TH_TASK(%rcx),%rbx        /* point to current task  */
    incl    TH_SYSCALLS_UNIX(%rcx)        /* increment call count   */

    /* Check for active vtimers in the current task */
    TASK_VTIMER_CHECK(%rbx,%rcx)

    /* enable interrupts before entering the C handler */
    sti

    /* %r15 is passed as the single argument (unix_syscall takes an x86_saved_state_t *) */
    CCALL1(unix_syscall, %r15)
    /*
     * always returns through thread_exception_return
     */

　　主要有一行：unix_syscall，看看unix_syscall函数的definition：

/*
 * Function:    unix_syscall
 *
 * Inputs:    regs    - pointer to i386 save area
 *
 * Outputs:    none
 */
void
unix_syscall(x86_saved_state_t *state)
{
    thread_t        thread;
    void            *vt;
    unsigned int        code;
    struct sysent        *callp;

    int            error;
    vm_offset_t        params;
    struct proc        *p;
    struct uthread        *uthread;
    x86_saved_state32_t    *regs;
    boolean_t        is_vfork;

    assert(is_saved_state32(state));
    regs = saved_state32(state);
#if DEBUG
    if (regs->eax == 0x800)
        thread_exception_return();
#endif
    thread = current_thread();
    uthread = get_bsdthread_info(thread);

    /* Get the approriate proc; may be different from task‘s for vfork() */
    is_vfork = uthread->uu_flag & UT_VFORK;
    if (__improbable(is_vfork != 0))
        p = current_proc();
    else
        p = (struct proc *)get_bsdtask_info(current_task());

    /* Verify that we are not being called from a task without a proc */
    if (__improbable(p == NULL)) {
        regs->eax = EPERM;
        regs->efl |= EFL_CF;
        task_terminate_internal(current_task());
        thread_exception_return();
        /* NOTREACHED */
    }

    code = regs->eax & I386_SYSCALL_NUMBER_MASK;
    DEBUG_KPRINT_SYSCALL_UNIX("unix_syscall: code=%d(%s) eip=%u\n",
                              code, syscallnames[code >= NUM_SYSENT ? 63 : code], (uint32_t)regs->eip);
    params = (vm_offset_t) (regs->uesp + sizeof (int));

    regs->efl &= ~(EFL_CF);

    callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];

    if (__improbable(callp == sysent)) {
        code = fuword(params);
        params += sizeof(int);
        callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
    }

.........

　　通过寄存器EAX中的数据得到code，再用code从数组sysent中取得对应的系统调用表项（struct sysent），交给callp；后面的代码冗长，这里就不全部贴出来咯。

　　（关于sysent数组，改天详述）

　　（三）再说系统调用专用指令方式（以Intel架构为例）

　　SYSENTER用于32位，SYSCALL用于64位，只说SYSCALL吧，先看汇编：

/*
 * hi64_syscall / idt64_syscall: two entry labels sharing one body.
 *
 * The code switches to the kernel gs base, parks the user %rsp in the
 * per-cpu CPU_UBER_TMP slot, loads %rsp from the per-cpu CPU_UBER_ISF slot,
 * and then fills in the ISF64_* fields of the frame at the new %rsp:
 *   SS     <- USER_DS
 *   CS     <- SYSCALL_CS
 *   RFLAGS <- %r11  (the SYSCALL instruction saves RFLAGS in %r11)
 *   RIP    <- %rcx  (the SYSCALL instruction saves the return RIP in %rcx)
 *   RSP    <- user %rsp saved in CPU_UBER_TMP
 *   ERR    <- %rax  (system call code)
 *   TRAPNO <- T_SYSCALL
 *   TRAPFN <- address of HNDL_SYSCALL
 * Finally it jumps to L_dispatch_U64.
 */
Entry(hi64_syscall)
Entry(idt64_syscall)
L_syscall_continue:
    swapgs                /* Kapow! get per-cpu data area */
    mov    %rsp, %gs:CPU_UBER_TMP    /* save user stack */
    mov    %gs:CPU_UBER_ISF, %rsp    /* switch stack to pcb */

    /*
     * Save values in the ISF frame in the PCB
     * to cons up the saved machine state.
     */
    movl    $(USER_DS), ISF64_SS(%rsp)
    movl    $(SYSCALL_CS), ISF64_CS(%rsp)    /* cs - a pseudo-segment */
    mov    %r11, ISF64_RFLAGS(%rsp)    /* rflags */
    mov    %rcx, ISF64_RIP(%rsp)        /* rip */
    /* %rcx is free once RIP is stored; reuse it to move the saved user %rsp */
    mov    %gs:CPU_UBER_TMP, %rcx
    mov    %rcx, ISF64_RSP(%rsp)        /* user stack */
    mov    %rax, ISF64_ERR(%rsp)        /* err/rax - syscall code */
    movq    $(T_SYSCALL), ISF64_TRAPNO(%rsp)    /* trapno */
    /* %r11 is used as scratch for the handler address, then reloaded below */
    leaq    HNDL_SYSCALL(%rip), %r11;
    movq    %r11, ISF64_TRAPFN(%rsp)
    mov    ISF64_RFLAGS(%rsp), %r11    /* Avoid leak, restore R11 */
    jmp    L_dispatch_U64            /* this can only be 64-bit */

　　主要看看HNDL_SYSCALL：

/*
 * 64bit Tasks
 * System call entries via syscall only:
 *
 *    r15     x86_saved_state64_t
 *    rsp     kernel stack
 *
 *    both rsp and r15 are 16-byte aligned
 *    interrupts disabled
 *    direction flag cleared
 */

/*
 * hndl_syscall: dispatcher for system calls entered via the syscall
 * instruction. It loads the current thread (%rcx) and task (%rbx), checks
 * the task's vtimers, then reads the low 32 bits of the saved RAX from the
 * state pointed to by %r15, masks out the class bits and jumps to the
 * handler for that class (mach, unix, mdep or diag). If no class matches it
 * enables interrupts and calls i386_exception(EXC_SYSCALL, %rax, 1).
 */
Entry(hndl_syscall)
    TIME_TRAP_UENTRY

    movq    %gs:CPU_ACTIVE_THREAD,%rcx    /* get current thread     */
    movq    TH_TASK(%rcx),%rbx        /* point to current task  */

    /* Check for active vtimers in the current task */
    TASK_VTIMER_CHECK(%rbx,%rcx)

    /*
     * We can be here either for a mach, unix machdep or diag syscall,
     * as indicated by the syscall class:
     */
    movl    R64_RAX(%r15), %eax        /* syscall number/class */
    /* keep the full value in %eax; isolate the class bits in %edx */
    movl    %eax, %edx
    andl    $(SYSCALL_CLASS_MASK), %edx    /* syscall class */
    cmpl    $(SYSCALL_CLASS_MACH<<SYSCALL_CLASS_SHIFT), %edx
    je    EXT(hndl_mach_scall64)
    cmpl    $(SYSCALL_CLASS_UNIX<<SYSCALL_CLASS_SHIFT), %edx
    je    EXT(hndl_unix_scall64)
    cmpl    $(SYSCALL_CLASS_MDEP<<SYSCALL_CLASS_SHIFT), %edx
    je    EXT(hndl_mdep_scall64)
    cmpl    $(SYSCALL_CLASS_DIAG<<SYSCALL_CLASS_SHIFT), %edx
    je    EXT(hndl_diag_scall64)

    /* Syscall class unknown */
    sti
    /* %rax still holds the value loaded above (32-bit load, upper half zeroed) */
    CCALL3(i386_exception, $(EXC_SYSCALL), %rax, $1)
    /* no return */

　　可以看到，这里根据保存的RAX中的系统调用类别位（用SYSCALL_CLASS_MASK取出）区分4种系统调用，BSD风格的系统调用只是其中1种，另外3种是：mach syscall、machdep syscall、diag syscall；

　　如果是BSD风格系统调用，那么就继续执行hndl_unix_scall64：

/*
 * hndl_unix_scall64: reached from hndl_syscall when the syscall class is
 * SYSCALL_CLASS_UNIX. It bumps the thread's unix syscall counter, enables
 * interrupts and calls the C function unix_syscall64 with %r15 as its only
 * argument.
 * NOTE(review): assumes %rcx still holds the current thread loaded in
 * hndl_syscall -- confirm TASK_VTIMER_CHECK preserves it.
 */
Entry(hndl_unix_scall64)
    incl    TH_SYSCALLS_UNIX(%rcx)        /* increment call count   */
    sti

    CCALL1(unix_syscall64, %r15)
    /*
     * always returns through thread_exception_return
     */

　　只有一个函数调用，unix_syscall64，接下来看看这个函数的definition：

void
unix_syscall64(x86_saved_state_t *state)
{
    thread_t    thread;
    unsigned int    code;
    struct sysent    *callp;
    void        *uargp;
    int        args_in_regs;
    int        error;
    struct proc    *p;
    struct uthread    *uthread;
    x86_saved_state64_t *regs;

    assert(is_saved_state64(state));
    regs = saved_state64(state);
#if    DEBUG
    if (regs->rax == 0x2000800)
        thread_exception_return();
#endif
    thread = current_thread();
    uthread = get_bsdthread_info(thread);

    /* Get the approriate proc; may be different from task‘s for vfork() */
    if (__probable(!(uthread->uu_flag & UT_VFORK)))
        p = (struct proc *)get_bsdtask_info(current_task());
    else
        p = current_proc();

    /* Verify that we are not being called from a task without a proc */
    if (__improbable(p == NULL)) {
        regs->rax = EPERM;
        regs->isf.rflags |= EFL_CF;
        task_terminate_internal(current_task());
        thread_exception_return();
        /* NOTREACHED */
    }
    args_in_regs = 6;

    code = regs->rax & SYSCALL_NUMBER_MASK;
    DEBUG_KPRINT_SYSCALL_UNIX(
        "unix_syscall64: code=%d(%s) rip=%llx\n",
        code, syscallnames[code >= NUM_SYSENT ? 63 : code], regs->isf.rip);
    callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
    uargp = (void *)(&regs->rdi);

    if (__improbable(callp == sysent)) {
            /*
         * indirect system call... system call number
         * passed as ‘arg0‘
         */
            code = regs->rdi;
        callp = (code >= NUM_SYSENT) ? &sysent[63] : &sysent[code];
        uargp = (void *)(&regs->rsi);
        args_in_regs = 5;
    }

..........

　　可以看到这里首先从x86_saved_state_t中取得系统调用号code，然后从数组sysent中得到系统调用函数，给callp；再后面是一些参数处理，和callp的执行。

　　接下去就到了具体的系统调用函数。

　　（大概介绍如上，有人拍砖吗？一起了解啊～）

时间： 2024-10-13 18:48:43

XNU内核（八）BSD系统调用过程代码简单分析的相关文章

Netty源码分析（八）----- write过程源码分析

上一篇文章主要讲了netty的read过程,本文主要分析一下write和writeAndFlush. 主要内容本文分以下几个部分阐述一个java对象最后是如何转变成字节流,写到socket缓冲区中去的 pipeline中的标准链表结构 java对象编码过程 write:写队列 flush:刷新写队列 writeAndFlush: 写队列并刷新 pipeline中的标准链表结构一个标准的pipeline链式结构如下数据从head节点流入,先拆包,然后解码成业务对象,最后经过业务Handler

对于HTML页面中CSS, JS, HTML的加载与执行过程的简单分析

最近在研究HTML页面中JavaScript的执行顺序问题.在JavaScript中,定义一个方法或者函数有很多方式,最常见的有2种,function语句式与函数直接量方式. 对于function语句式,解释器会优先解释.即加载了这个js文件后,会扫描一下所有的js代码,然后把该优先执行的东西先执行了,然后再从上到下按顺序执行.所以,定义的代码可以在执行的代码后边.就跟C#中的方法定义一样.解释器已经记住了这个方法,知道在内存中的哪里,用的时候直接去取就行了. C#语言是,对象中的属性与方法具有

kernel-2.6.32-431.el6.src.rpm内核源码安装过程和问题分析

一:安装内核源码包:.src.rpm和.rpm包的安装方式完全不同,可以通过rpm -qpl 查询rpm内容 #rpm -ivh kernel-2.6.32-431.el6.src.rpm warning: user mockbuild does not exist - using root warning: group mockbuild does not exist - using root 直接忽略. 二:没有在/usr/src生成内核源码,原因如下: .src.rpm安装包将会在/roo

2019-举例跟踪分析Linux内核5.0系统调用处理过程

简介学号520 实验环境基于ubuntu18.04 选择系统调用号20 getpid()分析实验目的学会使用gdb工具跟踪linux内核函数调用学会使用C代码和嵌入式汇编使用系统中断分析system_call中断处理过程实验步骤 1.下载linux5.0.1内核并编译 wget https://mirrors.aliyun.com/linux-kernel/v5.x/linux-5.0.1.tar.xz xz -d linux-5.0.1.tar.xz tar -xvf linux-

[文件系统]文件系统学习笔记（八）---mount系统调用（代码相关）

一,mount系统调用--相关代码源码位置:kernel/fs/Namespace.c文件的do_mount()函数, [cpp] view plaincopy long do_mount(char *dev_name,char*dir_name,char*type_page,unsigned long flags,void *data_page) dev_name指的是要挂载文件系统的名字,如tmpfs,dir_name指的是文件系统要被挂载的目标目录type_page指的是要挂载的文件系统的

分析Linux内核5.0系统调用处理过程

学号: 363 本实验来源 https://github.com/mengning/linuxkernel/ 一.实验要求 1.编译内核5.02.qemu -kernel linux-5.0.1/arch/x86/boot/bzImage -initrd rootfs.img3.选择系统调用号后两位与您的学号后两位相同的系统调用进行跟踪分析https://github.com/mengning/menu4.给出相关关键源代码及实验截图,撰写一篇博客(署真实姓名或学号最后3位编号),并在博客文章中

Linux0.11内核系列—2.系统调用机制分析

[版权所有,转载请注明出处.出处:http://www.cnblogs.com/joey-hua/p/5570691.html ] Linux内核从启动到初始化也看了好些个源码文件了,这次看到kernel文件夹下的system_call.s,这个文件主要就是系统调用的过程.但说到系统调用,不只是这一个文件这么简单,里面牵扯到的内容太多,这里就做个笔记记录一下从建立中断到最终调用系统调用的完整机制. 假设就从write这个函数作为系统调用来解释. 系统调用的本质就是用户进程需要访问内核级别的代码,

动静结合学内核之 linux 系统调用浅析

刘柳 + 原创作品转载请注明出处 + <Linux内核分析>MOOC课程http://mooc.study.163.com/course/USTC-1000029000 直入主题实验过程1-增加新的菜单 update the menu //git cone Uhange main.c //注册菜单函数 Add the chmodC ,chmodASM //菜单函数的实现 Make rootfs //激动人心的时刻,进入menuos调用我们新函数 chmodASM代码讲述调用chmod系统调

Cocos2d-x 3.x 开发（十八）10行代码看自动Batch，10行代码看自动剔除

1.概述在游戏的运行过程中,图形的绘制是非常大的开销.对于良莠不齐的Android手机市场,绘制优化较好的游戏,可以在更多的手机上运行,因此也是优化的重中之重.图形方面的优化主要体现在减少GPU的绘制次数上.这里我们分别从自动优化渲染批次和绘制剔除两个方面来看新版本在绘制上的优化. 2.自动batch 在Cocos2d-x 3.x中,抛弃了先前手动编写BatchNode,采用自动管理的方式.说起BatchNode,就难免涉及到显卡底层的绘制原理.简单的说,每提交一条绘制指令到显卡都会产生消耗,