一、系统调用
系统调用是内核给用户程序提供的编程接口。用户程序可以使用glibc库为单个系统调用提供的封装函数,也可以使用通用的syscall()函数直接发起系统调用。以系统调用fork()为例:
/*
 * fork() system call entry point.
 * SYSCALL_DEFINE0(fork) expands to: asmlinkage long sys_fork(void)
 *
 * Returns the result of _do_fork() when the kernel is built with
 * CONFIG_MMU, and -EINVAL otherwise.
 */
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
#else
	/* Without CONFIG_MMU this system call is rejected. */
	return -EINVAL;
#endif
}
需要在系统调用表中保存系统调用号和处理函数的映射关系。以x86_64为例,sys_call_table的定义如下:
/source/arch/x86/kernel/syscall_64.c
/*
 * System call table: an array of __NR_syscall_max + 1 handler pointers
 * (sys_call_ptr_t).
 */
asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] =
{
/*
 * Every slot in the range 0 ... __NR_syscall_max is first pointed at
 * sys_ni_syscall. The contents of <asm/syscalls_64.h> are then pulled
 * into the same initializer.
 * NOTE(review): presumably that header expands to per-number entries that
 * override the sys_ni_syscall default -- confirm against the generated header.
 *
 * Smells like a compiler bug -- it doesn't work
 * when the & below is removed.
 */
[0 ... __NR_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_64.h>
};
二、执行系统调用
在ARM64上,系统调用属于同步异常。在异常级别1(EL1)的异常向量表中,64位用户程序发起系统调用时的入口为el0_sync函数。
// Synchronous exception entry. The part shown here reads the exception
// class and dispatches 64-bit system calls to el0_svc.
el0_sync:
kernel_entry 0
mrs x25, esr_el1 // read the exception syndrome register
lsr x24, x25, #ESR_ELx_EC_SHIFT // shift down to obtain the exception class
cmp x24, #ESR_ELx_EC_SVC64 // is it a 64-bit system call?
b.eq el0_svc // yes: branch to el0_svc
...
el0_svc负责执行系统调用。例如,上层通过open系统调用打开文件时,就会根据系统调用号从sys_call_table中找到对应的表项,也即sys_open,并执行它。代码如下:
// Register aliases used by the system call entry code (el0_svc and the
// return paths). All four are defined with the .req alias directive, and
// the names are spelled exactly as the code references them.
sc_nr .req x25 // number of system calls
scno .req x26 // system call number
stbl .req x27 // system call table address
tsk .req x28 // address of the current thread_info structure
// System call handler.
// Uses the aliases stbl (table address), scno (syscall number, taken from
// w8), sc_nr (number of system calls) and tsk (thread_info).
// A syscall number below sc_nr is dispatched through the table; anything
// else goes to ni_sys.
el0_svc:
adrp stbl, sys_call_table // load syscall table pointer
uxtw scno, w8 // syscall number in w8
mov sc_nr, #__NR_syscalls // sc_nr (x25) = number of system calls
el0_svc_naked: // compat entry point
stp x0, scno, [sp, #S_ORIG_X0] // save the original x0 and syscall number
enable_dbg // enable debug
enable_irq // enable interrupts
get_thread_info tsk
ldr x16, [tsk, #TI_FLAGS] // check for syscall tracing
tbnz x16, #TIF_SYSCALL_TRACE, __sys_trace // are we tracing syscalls?
adr lr, ret_fast_syscall // return address: the handler returns to ret_fast_syscall, the path back to user space
cmp scno, sc_nr // check upper syscall limit
b.hs ni_sys // scno >= sc_nr (unsigned): out of range, go to ni_sys
ldr x16, [stbl, scno, lsl #3] // address in the syscall table (entry at stbl + scno * 8)
br x16 // call sys_* routine (plain branch; lr was set above)
ni_sys: // handling for syscall numbers beyond the limit
mov x0, sp // pass the stack pointer as the first argument
b do_ni_syscall
ENDPROC(el0_svc)
ret_fast_syscall负责从系统调用返回用户空间,代码如下:
/*
 * This is the fast syscall return path. We do as little as possible here,
 * and this includes saving x0 back into the kernel stack.
 *
 * After disabling interrupts the thread flags are re-read into x1:
 *   - any bit of _TIF_SYSCALL_WORK set -> ret_fast_syscall_trace
 *   - any bit of _TIF_WORK_MASK set    -> work_pending
 *   - otherwise                        -> enable_step_tsk, then kernel_exit 0
 */
ret_fast_syscall:
disable_irq // disable interrupts
str x0, [sp, #S_X0] // returned x0
ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing
and x2, x1, #_TIF_SYSCALL_WORK // x2 = flags & _TIF_SYSCALL_WORK
cbnz x2, ret_fast_syscall_trace // non-zero: take the trace path
and x2, x1, #_TIF_WORK_MASK // x2 = flags & _TIF_WORK_MASK
cbnz x2, work_pending // non-zero: extra work needed, x1 still holds the flags
enable_step_tsk x1, x2
kernel_exit 0
ret_fast_syscall_trace:
enable_irq // enable interrupts
b __sys_trace_return_skipped // we already saved x0
/*
 * Ok, we need to do extra processing, enter the slow path.
 *
 * Reached from ret_fast_syscall and ret_to_user when a bit of
 * _TIF_WORK_MASK is set; both load the thread flags into x1 before
 * branching here. Calls do_notify_resume with x0 = sp, then reloads the
 * flags and continues at finish_ret_to_user.
 */
work_pending:
mov x0, sp // 'regs'
bl do_notify_resume
#ifdef CONFIG_TRACE_IRQFLAGS
bl trace_hardirqs_on // enabled while in userspace
#endif
ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step
b finish_ret_to_user
/*
 * "slow" syscall return path.
 *
 * Disables interrupts and loads the thread flags into x1. If any bit of
 * _TIF_WORK_MASK is set it branches to work_pending, which comes back in
 * at finish_ret_to_user with x1 reloaded. Otherwise it falls through to
 * finish_ret_to_user and leaves via kernel_exit 0.
 */
ret_to_user:
disable_irq // disable interrupts
ldr x1, [tsk, #TSK_TI_FLAGS] // x1 = thread flags
and x2, x1, #_TIF_WORK_MASK // x2 = flags & _TIF_WORK_MASK
cbnz x2, work_pending // non-zero: extra work needed first
finish_ret_to_user:
enable_step_tsk x1, x2
kernel_exit 0
ENDPROC(ret_to_user)
work_pending调用do_notify_resume函数,代码如下:
/*
 * do_notify_resume - process the pending work indicated by thread_flags.
 * @regs:         register frame handed in by the assembly caller
 * @thread_flags: thread flag bits as sampled by the caller
 *
 * Each pass either reschedules (when _TIF_NEED_RESCHED is set) or, with
 * interrupts enabled, services the uprobe, signal, notify-resume and
 * foreign-FP-state flags in that order. Interrupts are then disabled again
 * and the flags are re-read; the loop ends once no bit of _TIF_WORK_MASK
 * is set.
 */
asmlinkage void do_notify_resume(struct pt_regs *regs,
				 unsigned int thread_flags)
{
	/*
	 * We are entered from assembly with IRQs already off, but the
	 * tracing code was not told about it (for efficiency), so bring
	 * the trace state up to date first.
	 */
	trace_hardirqs_off();

	for (;;) {
		if (!(thread_flags & _TIF_NEED_RESCHED)) {
			local_irq_enable();

			if (thread_flags & _TIF_UPROBE)
				uprobe_notify_resume(regs);

			if (thread_flags & _TIF_SIGPENDING)
				do_signal(regs);

			if (thread_flags & _TIF_NOTIFY_RESUME) {
				clear_thread_flag(TIF_NOTIFY_RESUME);
				tracehook_notify_resume(regs);
			}

			if (thread_flags & _TIF_FOREIGN_FPSTATE)
				fpsimd_restore_current_state();
		} else {
			schedule();
		}

		/* Re-sample the flags with interrupts off before deciding to stop. */
		local_irq_disable();
		thread_flags = READ_ONCE(current_thread_info()->flags);

		if (!(thread_flags & _TIF_WORK_MASK))
			break;
	}
}