#ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, (unsignedlong)entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. * This does not cause SYSENTER to jump to the wrong location, because * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, (unsignedlong)ignore_sysret); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); #endif
/* Flags to clear on syscall */ wrmsrl(MSR_SYSCALL_MASK, X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); }
然后将当前栈顶(用户空间栈顶)记录在 CPU 独占变量区域里,将 CPU 独占区域里记录的内核栈顶放入 rsp/esp。
最后通过 push 保存各寄存器值……
syscall里面的细节我们不探究,直接看从syscall返回那部分
1 2 3 4 5 6 7 8 9 10 11 12 13
LOCKDEP_SYS_EXIT // 宏的实现与 CONFIG_DEBUG_LOCK_ALLOC 内核配置选项相关,该配置允许在退出系统调用时调试锁。 TRACE_IRQS_ON /* user mode is traced as IRQs on */ movqRIP(%rsp), %rcx movq EFLAGS(%rsp), %r11
ENTRY(entry_SYSCALL_64) /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, * it is too small to ever cause noticeable irq latency. */ SWAPGS_UNSAFE_STACK // KPTI 进内核态需要切到内核页表 SWITCH_KERNEL_CR3_NO_STACK /* * A hypervisor implementation might want to use a label * after the swapgs, so that it can do the swapgs * for the guest and jump here on syscall. */ GLOBAL(entry_SYSCALL_64_after_swapgs) // 将用户栈偏移保存到 per-cpu 变量 rsp_scratch 中 movq %rsp, PER_CPU_VAR(rsp_scratch) // 加载内核栈偏移 movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* * If we need to do entry work or if we guess we'll need to do * exit work, go straight to the slow path. */ movq PER_CPU_VAR(current_task), %r11 testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) jnz entry_SYSCALL64_slow_path
entry_SYSCALL_64_fastpath: /* * Easy case: enable interrupts and issue the syscall. If the syscall * needs pt_regs, we'll call a stub that disables interrupts again * and jumps to the slow path. */ TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) #if __SYSCALL_MASK == ~0 // 确保系统调用号没超过最大值,超过了则跳转到后面的符号 1 处进行返回 cmpq $__NR_syscall_max, %rax #else andl $__SYSCALL_MASK, %eax cmpl $__NR_syscall_max, %eax #endif ja 1f /* return -ENOSYS (already in pt_regs->ax) */ // 除系统调用外的其他调用都通过 rcx 来传第四个参数,因此将 r10 的内容设置到 rcx movq %r10, %rcx
/* * This call instruction is handled specially in stub_ptregs_64. * It might end up jumping to the slow path. If it jumps, RAX * and all argument registers are clobbered. */ // 调用系统调用表中对应的函数 call *sys_call_table(, %rax, 8) .Lentry_SYSCALL_64_after_fastpath_call: // 将函数返回值压到栈中,返回时弹出 movq %rax, RAX(%rsp) 1:
/* * If we get here, then we know that pt_regs is clean for SYSRET64. * If we see that no exit work is required (which we are required * to check with IRQs off), then we can go straight to SYSRET64. */ DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF movq PER_CPU_VAR(current_task), %r11 testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) jnz 1f
LOCKDEP_SYS_EXIT // 宏的实现与 CONFIG_DEBUG_LOCK_ALLOC 内核配置选项相关,该配置允许在退出系统调用时调试锁。 TRACE_IRQS_ON /* user mode is traced as IRQs on */ movq RIP(%rsp), %rcx movq EFLAGS(%rsp), %r11 RESTORE_C_REGS_EXCEPT_RCX_R11 // 恢复除 rxc 和 r11 外所有通用寄存器, 因为 rcx 寄存器为调用系统调用的应用程序的返回地址, r11 寄存器为老的 flags register /* * This opens a window where we have a user CR3, but are * running in the kernel. This makes using the CS * register useless for telling whether or not we need to * switch CR3 in NMIs. Normal interrupts are OK because * they are off here. */ SWITCH_USER_CR3 // KPTI 返回用户态需要切回用户页表 /* 根据压栈的内容,恢复 rsp 为用户态的栈顶 */ movq RSP(%rsp), %rsp USERGS_SYSRET64 /* 调用宏 USERGS_SYSRET64 ,其扩展调用 swapgs 指令交换用户 GS 和内核GS, sysret 指令执行从系统调用处理退出 */
/* * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS. * Save old stack pointer and switch to trampoline stack. */ movq %rsp, %rdi movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
/* Copy the IRET frame to the trampoline stack. */ pushq 6*8(%rdi)/* SS */ pushq 5*8(%rdi)/* RSP */ pushq 4*8(%rdi)/* EFLAGS */ pushq 3*8(%rdi)/* CS */ pushq 2*8(%rdi)/* RIP */
/* Push user RDI on the trampoline stack. */ pushq(%rdi)
/* * We are on the trampoline stack. All regs except RDI are live. * We can do future final exit work right here. */ STACKLEAK_ERASE_NOCLOBBER