From: Laurent Vivier, Jeff Dike, Paolo 'Blaisorblade' Giarrusso Adds a new ptrace(2) mode, called PTRACE_SYSEMU (or PTRACE_SCEMU; the name is yet to be decided properly), resembling PTRACE_SYSCALL except that the kernel does not execute the requested syscall; this is useful for virtual environments, like UML, which want to run the syscall on their own. This patch includes some suggestions of Jeff Dike to avoid adding any instructions to the syscall fast path, plus some other little changes, by myself, to make it work even when the syscall is executed with SYSENTER (but I'm unsure about them). DESC Rename PTRACE_SCEMU to PTRACE_SYSEMU, to match the guest patch. EDESC Rename PTRACE_SCEMU to PTRACE_SYSEMU, to match the guest SYSEMU patch. DESC SYSEMU: fix behaviour when changing state EDESC In do_syscall_trace, we check the status of the TIF_SYSCALL_EMU flag only after doing the debugger notification; but the debugger might have changed the status of this flag because it continued execution with PTRACE_SYSCALL, so this is buggy. This patch fixes it by saving the flag status before calling ptrace_notify. DESC SYSEMU: avoid intercepting syscall on return when using SYSCALL again. EDESC From: Bodo Stroesser A guest process switching from using PTRACE_SYSEMU to PTRACE_SYSCALL crashes. The problem is in arch/i386/kernel/entry.S. The current SYSEMU patch prevents the syscall handler from being called, but does not prevent do_syscall_trace from being called afterwards for syscall completion interception. The appended patch fixes this. It reuses the flag TIF_SYSCALL_EMU to remember "we come from PTRACE_SYSEMU and now are in PTRACE_SYSCALL", since the flag is unused in the depicted situation. The patch is tested; AFAICS, it works fine, i.e. sysemu can be switched on and off dynamically without crashing.
Bodo DESC Fix PTRACE_SINGLESTEP after PTRACE_SYSEMU EDESC From: Bodo Stroesser When testing 2.6.9 with the skas3.v6 patch and my latest patch, I had problems with singlestepping on UML in SKAS with SYSEMU. It looped receiving SIGTRAPs without moving forward. EIP of the traced process was the same for all SIGTRAPs. What's missing is to handle switching from PTRACE_SYSEMU to PTRACE_SINGLESTEP in a way very similar to what is done for the change from PTRACE_SYSEMU to PTRACE_SYSCALL. I.e., after calling ptrace(PTRACE_SYSEMU), on the return path, the debugger is notified and then wakes up the process; the syscall is executed (or skipped, when do_syscall_trace returns non-zero, i.e. when using PTRACE_SYSEMU), and do_syscall_trace is called again. Since we are on the return path of a SYSEMU'd syscall, if the wake-up is performed through ptrace(PTRACE_SYSCALL), we must still avoid notifying the parent of the syscall exit. Now, this behaviour is extended even to resuming with PTRACE_SINGLESTEP.
Signed-off-by: Paolo 'Blaisorblade' Giarrusso --- clean-linux-2.6.11-paolo/arch/i386/kernel/entry.S | 9 ++- clean-linux-2.6.11-paolo/arch/i386/kernel/ptrace.c | 44 +++++++++++++--- clean-linux-2.6.11-paolo/include/asm-i386/thread_info.h | 5 + clean-linux-2.6.11-paolo/include/linux/ptrace.h | 1 clean-linux-2.6.11-paolo/kernel/fork.c | 3 + 5 files changed, 52 insertions(+), 10 deletions(-) diff -puN arch/i386/kernel/entry.S~host-sysemu-2.6.7-4 arch/i386/kernel/entry.S --- clean-linux-2.6.11/arch/i386/kernel/entry.S~host-sysemu-2.6.7-4 2005-07-10 16:55:19.000000000 +0200 +++ clean-linux-2.6.11-paolo/arch/i386/kernel/entry.S 2005-07-10 16:57:38.000000000 +0200 @@ -219,7 +219,7 @@ sysenter_past_esp: SAVE_ALL GET_THREAD_INFO(%ebp) - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + testb $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -242,8 +242,8 @@ ENTRY(system_call) pushl %eax # save orig_eax SAVE_ALL GET_THREAD_INFO(%ebp) - # system call tracing in operation - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + # system call tracing in operation / emulation + testb $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -304,6 +304,9 @@ syscall_trace_entry: movl %esp, %eax xorl %edx,%edx call do_syscall_trace + cmpl $0, %eax + jne syscall_exit # ret != 0 -> running under PTRACE_SYSEMU, + # so must skip actual syscall movl ORIG_EAX(%esp), %eax cmpl $(nr_syscalls), %eax jnae syscall_call diff -puN arch/i386/kernel/ptrace.c~host-sysemu-2.6.7-4 arch/i386/kernel/ptrace.c --- clean-linux-2.6.11/arch/i386/kernel/ptrace.c~host-sysemu-2.6.7-4 2005-07-10 16:55:19.000000000 +0200 +++ clean-linux-2.6.11-paolo/arch/i386/kernel/ptrace.c 2005-07-10 16:57:41.000000000 +0200 @@ -507,15 +507,27 @@ asmlinkage int sys_ptrace(long request, } break; + case PTRACE_SYSEMU: /* 
continue and stop at next syscall, which will not be executed */ case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */ case PTRACE_CONT: /* restart after signal. */ ret = -EIO; if ((unsigned long) data > _NSIG) break; + /* If we came here with PTRACE_SYSEMU and now continue with + * PTRACE_SYSCALL, entry.S used to intercept the syscall return. + * But it shouldn't! + * So we don't clear TIF_SYSCALL_EMU, which is always unused in + * this special case, to remember, we came from SYSEMU. That + * flag will be cleared by do_syscall_trace(). + */ + if (request == PTRACE_SYSEMU) { + set_tsk_thread_flag(child, TIF_SYSCALL_EMU); + } else if (request == PTRACE_CONT) { + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); + } if (request == PTRACE_SYSCALL) { set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); - } - else { + } else { clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); } child->exit_code = data; @@ -544,6 +556,8 @@ asmlinkage int sys_ptrace(long request, ret = -EIO; if ((unsigned long) data > _NSIG) break; + /*See do_syscall_trace to know why we don't clear + * TIF_SYSCALL_EMU.*/ clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); set_singlestep(child); child->exit_code = data; @@ -676,8 +690,9 @@ void send_sigtrap(struct task_struct *ts * - triggered by current->work.syscall_trace */ __attribute__((regparm(3))) -void do_syscall_trace(struct pt_regs *regs, int entryexit) +int do_syscall_trace(struct pt_regs *regs, int entryexit) { + int is_sysemu, is_systrace, is_singlestep; if (unlikely(current->audit_context)) { if (!entryexit) audit_syscall_entry(current, regs->orig_eax, @@ -686,19 +701,34 @@ void do_syscall_trace(struct pt_regs *re else audit_syscall_exit(current, regs->eax); } - if (!(current->ptrace & PT_PTRACED)) - return; + return 0; + + is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); + is_systrace = test_thread_flag(TIF_SYSCALL_TRACE); + is_singlestep = test_thread_flag(TIF_SINGLESTEP); + + /* We can detect the case of coming from PTRACE_SYSEMU 
and now running + * with PTRACE_SYSCALL or PTRACE_SINGLESTEP, by TIF_SYSCALL_EMU being + * set additionally. + * If so let's reset the flag and return without action (no singlestep + * nor syscall tracing, since no actual step has been executed). + */ + if (is_sysemu && (is_systrace || is_singlestep)) { + clear_thread_flag(TIF_SYSCALL_EMU); + return 0; + } /* Fake a debug trap */ if (test_thread_flag(TIF_SINGLESTEP)) send_sigtrap(current, regs, 0); if (!test_thread_flag(TIF_SYSCALL_TRACE)) - return; + return 0; /* the 0x80 provides a way for the tracing parent to distinguish between a syscall stop and SIGTRAP delivery */ + /* Note that the debugger could change the result of test_thread_flag!*/ ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80 : 0)); /* @@ -710,4 +740,6 @@ void do_syscall_trace(struct pt_regs *re send_sig(current->exit_code, current, 1); current->exit_code = 0; } + /* != 0 if nullifying the syscall, 0 if running it normally */ + return is_sysemu; } diff -puN include/asm-i386/thread_info.h~host-sysemu-2.6.7-4 include/asm-i386/thread_info.h --- clean-linux-2.6.11/include/asm-i386/thread_info.h~host-sysemu-2.6.7-4 2005-07-10 16:55:19.000000000 +0200 +++ clean-linux-2.6.11-paolo/include/asm-i386/thread_info.h 2005-07-10 16:55:19.000000000 +0200 @@ -139,6 +139,7 @@ register unsigned long current_stack_poi #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* restore singlestep on return to user mode */ #define TIF_IRET 5 /* return with iret */ +#define TIF_SYSCALL_EMU 6 /* syscall emulation active */ #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_POLLING_NRFLAG 16 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_MEMDIE 17 @@ -149,12 +150,14 @@ register unsigned long current_stack_poi #define _TIF_NEED_RESCHED (1<