Linux kernel ASLR Implementation


Since June 2005 (specifically 2.6.12), the Linux kernel has built-in ASLR (Address Space Layout Randomization) support. In this post I’m trying to give a brief description of how this is implemented. However, I will be focusing mostly on the x86 architecture, since parts of this protection mechanism are architecture dependent.

Random Number Generation
The PRNG used for the Linux kernel’s ASLR is the get_random_int() routine, as we’ll see later in this post. This function is located in the drivers/char/random.c file and is shown below.

static struct keydata {
        __u32 count; /* already shifted to the final position */
        __u32 secret[12];
} ____cacheline_aligned ip_keydata[2];
  ...
/*
 * Get a random word for internal kernel use only. Similar to urandom but
 * with the goal of minimal entropy pool depletion. As a result, the random
 * value is not cryptographically secure but for several uses the cost of
 * depleting entropy is too high
 */
DEFINE_PER_CPU(__u32 [4], get_random_int_hash);
unsigned int get_random_int(void)
{
        struct keydata *keyptr;
        __u32 *hash = get_cpu_var(get_random_int_hash);
        int ret;

        keyptr = get_keyptr();
        hash[0] += current->pid + jiffies + get_cycles();

        ret = half_md4_transform(hash, keyptr->secret);
        put_cpu_var(get_random_int_hash);

        return ret;
}

After defining the per-CPU hash values, it uses the get_cpu_var()/put_cpu_var() C macros to access and then release the processor-specific hash array. This leaves us with get_keyptr(), which returns a pointer to the currently active ‘keydata’ structure, and the actual random number generation.
The ‘keydata’ pointer is obtained using this C function:

static struct keydata {
        __u32 count; /* already shifted to the final position */
        __u32 secret[12];
} ____cacheline_aligned ip_keydata[2];

static unsigned int ip_cnt;
  ...
static inline struct keydata *get_keyptr(void)
{
        struct keydata *keyptr = &ip_keydata[ip_cnt & 1];

        smp_rmb();

        return keyptr;
}
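
Before looking at smp_rmb(), note the two-slot ‘ip_keydata’ array and the ‘ip_cnt’ counter: a rekeying routine periodically refreshes the slot readers are not using and only then bumps ‘ip_cnt’. Below is a hedged user-space sketch of this double-buffer scheme; the writer side only approximates rekey_seq_generator() from the same file, and __sync_synchronize() stands in for the kernel’s memory barriers:

#include <string.h>

struct keydata {
        unsigned int count;
        unsigned int secret[12];
};

static struct keydata ip_keydata[2];
static unsigned int ip_cnt;

/* writer: fill the slot readers are NOT using, publish it with a
 * barrier, then flip ip_cnt so readers start picking the new slot */
static void rekey(const unsigned int fresh[12])
{
        struct keydata *next = &ip_keydata[1 ^ (ip_cnt & 1)];

        memcpy(next->secret, fresh, sizeof(next->secret));
        __sync_synchronize();   /* kernel: smp_wmb() */
        ip_cnt++;
}

/* reader, mirroring get_keyptr() */
static struct keydata *get_keyptr(void)
{
        struct keydata *keyptr = &ip_keydata[ip_cnt & 1];

        __sync_synchronize();   /* kernel: smp_rmb() */
        return keyptr;
}

int main(void)
{
        unsigned int fresh[12] = { 0xdeadbeef };  /* sample key material */

        rekey(fresh);
        return get_keyptr()->secret[0] == 0xdeadbeef ? 0 : 1;
}

A reader that observes the incremented ‘ip_cnt’ is thus guaranteed to also observe the fully written secret, which is exactly what the read barrier below enforces.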

The smp_rmb() macro is defined for the x86 architecture in the arch/x86/include/asm/system.h header file; rmb stands for Read Memory Barrier.

/*
 * Force strict CPU ordering.
 * And yes, this is required on UP too when we're talking
 * to devices.
 */
#ifdef CONFIG_X86_32
/*
 * Some non-Intel clones support out of order store. wmb() ceases to be a
 * nop for these.
 */
#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
#else
#define mb()    asm volatile("mfence":::"memory")
#define rmb()   asm volatile("lfence":::"memory")
#define wmb()   asm volatile("sfence" ::: "memory")
#endif
  ...
#ifdef CONFIG_SMP
#define smp_mb()        mb()
#ifdef CONFIG_X86_PPRO_FENCE
# define smp_rmb()      rmb()
#else
# define smp_rmb()      barrier()
#endif

This barrier ensures that reads which subsequent data-dependent reads rely on are not reordered past it. As we can read in the same header file:

 * No data-dependent reads from memory-like regions are ever reordered
 * over this barrier.  All reads preceding this primitive are guaranteed
 * to access memory (but not necessarily other CPUs' caches) before any
 * reads following this primitive that depend on the data returned by
 * any of the preceding reads.  This primitive is much lighter weight than
 * rmb() on most CPUs, and is never heavier weight than is
 * rmb().
 *
 * These ordering constraints are respected by both the local CPU
 * and the compiler.
 *
 * Ordering is not guaranteed by anything other than these primitives,
 * not even by data dependencies.  See the documentation for
 * memory_barrier() for examples and URLs to more information.

This ensures that a reader picking up ‘keyptr’ also sees the key data that was written before it was published. Back in get_random_int(), we can now have a look at the exact random number generation code. According to:

hash[0] += current->pid + jiffies + get_cycles()

We have four different values involved. Those are:
– The first element of the per-CPU ‘hash’ array, which accumulates across calls
– The PID of the currently executing process
– The system’s jiffies value (timer ticks since boot)
– The CPU’s cycle counter
The last value is obtained from the get_cycles() inline function, which is defined in arch/x86/include/asm/tsc.h for the x86 architecture.

static inline cycles_t get_cycles(void)
{
        unsigned long long ret = 0;

#ifndef CONFIG_X86_TSC
        if (!cpu_has_tsc)
                return 0;
#endif
        rdtscll(ret);

        return ret;
}

This means that if the processor supports the rdtsc instruction, execution moves to the arch/x86/include/asm/msr.h header file and the following code:

static __always_inline unsigned long long __native_read_tsc(void)
{
        DECLARE_ARGS(val, low, high);

        asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));

        return EAX_EDX_VAL(val, low, high);
}
  ...
#define rdtscll(val)                                            \
        ((val) = __native_read_tsc())

which simply executes the rdtsc instruction to read the CPU’s 64-bit time-stamp counter.
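
Incidentally, the time-stamp counter is normally readable from unprivileged user space too. Here is a minimal, hedged user-space equivalent of __native_read_tsc() using GCC inline assembly (x86 only):

#include <stdio.h>
#include <stdint.h>

/* rdtsc returns the low 32 bits of the counter in EAX and the high
 * 32 bits in EDX, exactly as __native_read_tsc() collects them */
static inline uint64_t read_tsc(void)
{
        uint32_t low, high;

        __asm__ volatile("rdtsc" : "=a" (low), "=d" (high));
        return ((uint64_t)high << 32) | low;
}

int main(void)
{
        uint64_t a = read_tsc();
        uint64_t b = read_tsc();

        printf("tsc=%llu delta=%llu\n", (unsigned long long)a,
               (unsigned long long)(b - a));
        return 0;
}
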
Back in get_random_int(), we can see that in addition to the hard-to-guess values mixed into the hash, it calls half_md4_transform(), which is defined in lib/halfmd4.c and, as the name suggests, implements a step-reduced MD4 transform: 8 steps per round instead of MD4’s 16, for performance reasons.

/* F, G and H are basic MD4 functions: selection, majority, parity */
#define F(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
#define G(x, y, z) (((x) & (y)) + (((x) ^ (y)) & (z)))
#define H(x, y, z) ((x) ^ (y) ^ (z))
  ...
#define ROUND(f, a, b, c, d, x, s)      \
        (a += f(b, c, d) + x, a = (a << s) | (a >> (32 - s)))
#define K1 0
#define K2 013240474631UL
#define K3 015666365641UL
  ...
__u32 half_md4_transform(__u32 buf[4], __u32 const in[8])
{
        __u32 a = buf[0], b = buf[1], c = buf[2], d = buf[3];

        /* Round 1 */
        ROUND(F, a, b, c, d, in[0] + K1,  3);
        ROUND(F, d, a, b, c, in[1] + K1,  7);
        ROUND(F, c, d, a, b, in[2] + K1, 11);
        ROUND(F, b, c, d, a, in[3] + K1, 19);
        ROUND(F, a, b, c, d, in[4] + K1,  3);
        ROUND(F, d, a, b, c, in[5] + K1,  7);
        ROUND(F, c, d, a, b, in[6] + K1, 11);
        ROUND(F, b, c, d, a, in[7] + K1, 19);

        /* Round 2 */
        ROUND(G, a, b, c, d, in[1] + K2,  3);
        ROUND(G, d, a, b, c, in[3] + K2,  5);
        ROUND(G, c, d, a, b, in[5] + K2,  9);
        ROUND(G, b, c, d, a, in[7] + K2, 13);
        ROUND(G, a, b, c, d, in[0] + K2,  3);
        ROUND(G, d, a, b, c, in[2] + K2,  5);
        ROUND(G, c, d, a, b, in[4] + K2,  9);
        ROUND(G, b, c, d, a, in[6] + K2, 13);

        /* Round 3 */
        ROUND(H, a, b, c, d, in[3] + K3,  3);
        ROUND(H, d, a, b, c, in[7] + K3,  9);
        ROUND(H, c, d, a, b, in[2] + K3, 11);
        ROUND(H, b, c, d, a, in[6] + K3, 15);
        ROUND(H, a, b, c, d, in[1] + K3,  3);
        ROUND(H, d, a, b, c, in[5] + K3,  9);
        ROUND(H, c, d, a, b, in[0] + K3, 11);
        ROUND(H, b, c, d, a, in[4] + K3, 15);

        buf[0] += a;
        buf[1] += b;
        buf[2] += c;
        buf[3] += d;

        return buf[1]; /* "most hashed" word */
}
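
To get a feel for the complete pipeline, here is a hedged user-space harness that emulates a single get_random_int() step. It assumes it is compiled together with the half_md4_transform() routine shown above (with __u32 typedef’ed accordingly) and substitutes getpid(), time() and rdtsc for the kernel’s pid, jiffies and get_cycles() inputs:

#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <unistd.h>

typedef uint32_t __u32;

/* the step-reduced MD4 routine shown above, linked into this program */
__u32 half_md4_transform(__u32 buf[4], __u32 const in[8]);

static inline uint32_t tsc_low(void)
{
        uint32_t low, high;

        __asm__ volatile("rdtsc" : "=a" (low), "=d" (high));
        (void)high;
        return low;
}

int main(void)
{
        __u32 hash[4] = { 0 };   /* models the per-CPU get_random_int_hash */
        __u32 secret[8] = { 0 }; /* models keyptr->secret, normally rekeyed */

        /* hash[0] += current->pid + jiffies + get_cycles(); */
        hash[0] += (__u32)getpid() + (__u32)time(NULL) + tsc_low();

        printf("pseudo-random int: 0x%08x\n",
               (unsigned int)half_md4_transform(hash, secret));
        return 0;
}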

This makes things even more complex for anyone attempting to guess the resulting integer. Now that we have a basic understanding of the pseudo-random number generation routine used by the Linux ASLR implementation, we can move on to the actual code that uses it.

brk(2) Randomization
The fs/binfmt_elf.c file is where the Linux kernel’s ELF loader is located. The routine that loads the actual executable binary file is load_elf_binary(), which among other things includes the following code.

static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
{
        struct file *interpreter = NULL; /* to shut gcc up */
        unsigned long load_addr = 0, load_bias = 0;
  ...
#ifdef arch_randomize_brk
        if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1))
                current->mm->brk = current->mm->start_brk =
                        arch_randomize_brk(current->mm);
#endif
  ...
out_free_ph:
        kfree(elf_phdata);
        goto out;
}

This means that if arch_randomize_brk is defined, the loader checks whether the current process should have a randomized virtual address space (the ‘PF_RANDOMIZE’ flag) and whether ‘randomize_va_space’ is greater than 1. If both hold, it updates the process’s heap start and break addresses with the return value of arch_randomize_brk().
The latter routine can be found in arch/x86/kernel/process.c for the x86 family.

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
        unsigned long range_end = mm->brk + 0x02000000;
        return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
}

It computes the end of the randomization range by adding 0x02000000 (32MB) to the current break address and then calls randomize_range() to pick a randomized address within that window. This randomization routine is also located in drivers/char/random.c and you can see it here:

/*
 * randomize_range() returns a start address such that
 *
 *    [...... <range> .....]
 *  start                  end
 *
 * a <range> with size "len" starting at the return value is inside in the
 * area defined by [start, end], but is otherwise randomized.
 */
unsigned long
randomize_range(unsigned long start, unsigned long end, unsigned long len)
{
        unsigned long range = end - len - start;

        if (end <= start + len)
                return 0;
        return PAGE_ALIGN(get_random_int() % range + start);
}

If the range is valid, it adds get_random_int() modulo the range size to the start address and, of course, the resulting value is aligned to the next page boundary (as defined by the ‘PAGE_SIZE’ constant) via PAGE_ALIGN().
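
As a rough illustration, here is a hedged user-space model of the whole brk randomization path; PAGE_SIZE is assumed to be 4096 and rand() stands in for the kernel’s get_random_int():

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE       4096UL
#define PAGE_ALIGN(x)   (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* mirrors randomize_range() from drivers/char/random.c */
static unsigned long randomize_range(unsigned long start,
                                     unsigned long end, unsigned long len)
{
        unsigned long range = end - len - start;

        if (end <= start + len)
                return 0;
        return PAGE_ALIGN((unsigned long)rand() % range + start);
}

int main(void)
{
        unsigned long brk = 0x08050000UL;              /* sample break address */
        unsigned long range_end = brk + 0x02000000;    /* as in arch_randomize_brk() */
        unsigned long r = randomize_range(brk, range_end, 0);

        printf("randomized brk: 0x%08lx\n", r ? r : brk);
        return 0;
}

You can observe the real behavior by printing sbrk(0) from a small program across several runs: with ‘randomize_va_space’ set to 2, the initial break moves around between executions.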

SYSCTL Interface
In the previous section we encountered a variable named ‘randomize_va_space’. As almost any Linux administrator knows, Linux ASLR can be tuned using the ‘/proc/sys/kernel/randomize_va_space’ file or the ‘kernel.randomize_va_space’ SYSCTL variable. In both cases an integer value is passed to the kernel, as we can read in kernel/sysctl.c where the entry is defined.

static struct ctl_table kern_table[] = {
  ...
#if defined(CONFIG_MMU)
        {
                .procname       = "randomize_va_space",
                .data           = &randomize_va_space,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
#endif
  ...
/*
 * NOTE: do not add new entries to this table unless you have read
 * Documentation/sysctl/ctl_unnumbered.txt
 */
        { }
};

The actual variable ‘randomize_va_space’ is placed in mm/memory.c as shown below.

/*
 * Randomize the address space (stacks, mmaps, brk, etc.).
 *
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )
 */
int randomize_va_space __read_mostly =
#ifdef CONFIG_COMPAT_BRK
                                        1;
#else
                                        2;
#endif

Here, the ‘__read_mostly’ modifier is an architecture specific attribute which in case of x86 processors is defined in arch/x86/include/asm/cache.h header file.

#define __read_mostly __attribute__((__section__(".data..read_mostly")))

This places the variable in a section called .data..read_mostly, which is designed for variables that are initialized once and very rarely changed afterwards.
From the kernel developers’ comment we can also see that if the compatibility support option for the brk(2) system call is enabled, brk(2) is excluded from randomization, since randomizing it could break ancient (libc5-based) binaries. Additionally, this variable appears in the binary SYSCTL table, as we can find in the kernel/sysctl_binary.c file.

static const struct bin_table bin_kern_table[] = {
  ...
        { CTL_INT,      KERN_RANDOMIZE,                 "randomize_va_space" },
  ...
        {}
};

This entry uses the ‘KERN_RANDOMIZE’ value, which is defined in the include/linux/sysctl.h header file.

/* CTL_KERN names: */
enum
{
  ...
        KERN_RANDOMIZE=68, /* int: randomize virtual address space */
  ...
};
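
For completeness, here is a hedged user-space snippet that queries the current ASLR mode through the procfs side of this interface (0 disables randomization, 1 randomizes stack/mmap/etc., 2 additionally randomizes brk):

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/randomize_va_space", "r");
        int mode;

        /* the file contains a single integer */
        if (!f || fscanf(f, "%d", &mode) != 1) {
                perror("randomize_va_space");
                return 1;
        }
        fclose(f);

        printf("ASLR mode: %d\n", mode);
        return 0;
}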

Now that we have a basic understanding of what goes on in the kernel when manipulating that variable through the SYSCTL interface, we can move on to the more interesting parts…

Stack Randomization
The actual stack randomization takes place in fs/exec.c, and more specifically in the setup_arg_pages() routine, which is responsible for the final stage of stack initialization before executing a binary. Here is a code snippet that demonstrates how the stack randomization is implemented…

/*
 * Finalizes the stack vm_area_struct. The flags and permissions are updated,
 * the stack is optionally relocated, and some extra space is added.
 */
int setup_arg_pages(struct linux_binprm *bprm,
                    unsigned long stack_top,
                    int executable_stack)
{
  ...
#ifdef CONFIG_STACK_GROWSUP
  ...
#else
        stack_top = arch_align_stack(stack_top);
        stack_top = PAGE_ALIGN(stack_top);
  ...
out_unlock:
        up_write(&mm->mmap_sem);
        return ret;
}

If the stack segment does not grow upwards, it passes the stack top address (an argument of the setup_arg_pages() routine) to arch_align_stack(), aligns the returned value to a page boundary, and continues with the stack segment setup. Assuming that we’re dealing with the x86 architecture, the initial function call leads to the arch/x86/kernel/process.c file where we can find the following code.

unsigned long arch_align_stack(unsigned long sp)
{
        if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
                sp -= get_random_int() % 8192;
        return sp & ~0xf;
}

The check is fairly simple: if the currently executing task doesn’t have the ‘ADDR_NO_RANDOMIZE’ personality set (which disables randomization) and ‘randomize_va_space’ has a non-zero value, it invokes get_random_int() to perform the stack randomization. Before moving on, for completeness, here is the definition of the above personality constant from the include/linux/personality.h header file.

/*
 * Flags for bug emulation.
 *
 * These occupy the top three bytes.
 */
enum {
        ADDR_NO_RANDOMIZE =     0x0040000,      /* disable randomization of VA space */
  ...
};
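
As a side note, a process can set this personality flag for itself (and the images it subsequently executes) from user space; this is essentially what setarch(8) with the -R switch does. A minimal, hedged example:

#include <stdio.h>
#include <sys/personality.h>

int main(void)
{
        /* read the current persona and OR in ADDR_NO_RANDOMIZE */
        if (personality(personality(0xffffffff) | ADDR_NO_RANDOMIZE) == -1) {
                perror("personality");
                return 1;
        }

        printf("VA randomization disabled for this process\n");
        /* an execve() from here would run the new image without ASLR */
        return 0;
}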

Back to arch_align_stack(): after subtracting a random offset of up to 8KB from the stack pointer of an ASLR-enabled task, it aligns the result to a 16-byte boundary by masking it with ~0xf (0xfffffff0 on 32-bit processors). However, a quick look in fs/binfmt_elf.c shows that this is not the whole story, since the ELF loader adds its own layer of randomization…

static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
{
        struct file *interpreter = NULL; /* to shut gcc up */
  ...
        /* Do this so that we can load the interpreter, if need be.  We will
           change some of these later */
        current->mm->free_area_cache = current->mm->mmap_base;
        current->mm->cached_hole_size = 0;
        retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
                                 executable_stack);
  ...
        goto out;
}

We can see here that it passes a randomized stack top pointer, obtained via the randomize_stack_top() routine from the same source file.

#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12))     /* 8MB of VA */
#endif

static unsigned long randomize_stack_top(unsigned long stack_top)
{
        unsigned int random_variable = 0;

        if ((current->flags & PF_RANDOMIZE) &&
                !(current->personality & ADDR_NO_RANDOMIZE)) {
                random_variable = get_random_int() & STACK_RND_MASK;
                random_variable <<= PAGE_SHIFT;
        }
#ifdef CONFIG_STACK_GROWSUP
        return PAGE_ALIGN(stack_top) + random_variable;
#else
        return PAGE_ALIGN(stack_top) - random_variable;
#endif
}

Once again, the stack top is randomized only if the process has the ‘PF_RANDOMIZE’ flag set and does not have the ‘ADDR_NO_RANDOMIZE’ personality. In that case, it masks the integer returned by get_random_int() with ‘STACK_RND_MASK’. Although a fallback definition of the latter constant appears in the code snippet above, on x86 it comes from the architecture-specific arch/x86/include/asm/elf.h header file.

#ifdef CONFIG_X86_32

#define STACK_RND_MASK (0x7ff)
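
Putting the two layers together, here is a hedged user-space model of how far the stack can move on 32-bit x86 (PAGE_SHIFT is 12, so the coarse layer spans up to 0x7ff pages, roughly 8MB, and the fine layer up to 8KB); rand() again stands in for get_random_int() and STACK_TOP is an assumed x86-32 value:

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x)   (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define STACK_RND_MASK  0x7ff
#define STACK_TOP       0xc0000000UL    /* assumed x86-32 value */

int main(void)
{
        /* coarse, page-granular shift: randomize_stack_top() */
        unsigned long coarse = ((unsigned long)rand() & STACK_RND_MASK) << PAGE_SHIFT;
        unsigned long top = PAGE_ALIGN(STACK_TOP) - coarse;

        /* fine, 16-byte aligned shift: arch_align_stack() */
        unsigned long sp = (top - (unsigned long)rand() % 8192) & ~0xfUL;

        printf("stack top: 0x%08lx, sp: 0x%08lx\n", top, sp);
        return 0;
}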

This is pretty much the stack ASLR implementation of Linux.

mmap(2) Randomization
Before we dive into the mmap(2) randomization itself: what happens when mmap(2) allocations collide with the randomized stack space?
To avoid such collisions with the stack’s randomized virtual address space, the Linux kernel developers implemented the following routine in the arch/x86/mm/mmap.c file.

static unsigned int stack_maxrandom_size(void)
{
        unsigned int max = 0;
        if ((current->flags & PF_RANDOMIZE) &&
                !(current->personality & ADDR_NO_RANDOMIZE)) {
                max = ((-1U) & STACK_RND_MASK) << PAGE_SHIFT;
        }

        return max;
}


/*
 * Top of mmap area (just below the process stack).
 *
 * Leave an at least ~128 MB hole with possible stack randomization.
 */
#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size())
#define MAX_GAP (TASK_SIZE/6*5)

After performing the usual checks on the currently executing task, it computes the maximum possible stack randomization size based on the ‘STACK_RND_MASK’ value. Later on, inside mmap_base(), we can see how the above C macros are used to ensure the mmap area doesn’t collide with the randomized stack space.

static unsigned long mmap_base(void)
{
        unsigned long gap = rlimit(RLIMIT_STACK);

        if (gap < MIN_GAP)
                gap = MIN_GAP;
        else if (gap > MAX_GAP)
                gap = MAX_GAP;

        return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
}

Here is also our first contact with the mmap(2) randomization routine itself, mmap_rnd(). This one is also placed in arch/x86/mm/mmap.c and its code is this:

static unsigned long mmap_rnd(void)
{
        unsigned long rnd = 0;

       /*
        *  8 bits of randomness in 32bit mmaps, 20 address space bits
        * 28 bits of randomness in 64bit mmaps, 40 address space bits
        */
        if (current->flags & PF_RANDOMIZE) {
                if (mmap_is_ia32())
                        rnd = (long)get_random_int() % (1<<8);
                else
                        rnd = (long)(get_random_int() % (1<<28));
        }
        return rnd << PAGE_SHIFT;
}

This code is pretty self-explanatory: 8 bits of page-granular randomness for 32-bit mappings and 28 bits for 64-bit ones.
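
To close the loop, here is a hedged user-space model of the 32-bit mmap_base() calculation; TASK_SIZE is assumed to be the usual 3GB x86-32 user space and rand() stands in for get_random_int():

#include <stdio.h>
#include <stdlib.h>
#include <sys/resource.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x)   (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define TASK_SIZE       0xc0000000UL    /* assumed x86-32 user space */
#define STACK_MAX_RND   (0x7ffUL << PAGE_SHIFT)
#define MIN_GAP         (128 * 1024 * 1024UL + STACK_MAX_RND)
#define MAX_GAP         (TASK_SIZE / 6 * 5)

int main(void)
{
        struct rlimit rl;
        unsigned long gap, rnd, base;

        /* the kernel uses rlimit(RLIMIT_STACK) for the stack gap */
        getrlimit(RLIMIT_STACK, &rl);
        gap = rl.rlim_cur;
        if (gap < MIN_GAP)
                gap = MIN_GAP;
        else if (gap > MAX_GAP)
                gap = MAX_GAP;

        /* mmap_rnd(), 32-bit case: 8 bits of page-granular randomness */
        rnd = ((unsigned long)rand() % (1 << 8)) << PAGE_SHIFT;
        base = PAGE_ALIGN(TASK_SIZE - gap - rnd);

        printf("mmap base: 0x%08lx\n", base);
        return 0;
}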

So, I believe this post should give readers a grasp of how Linux ASLR is implemented. I used the 2.6.36 version of the Linux kernel, so the details may change in future releases, but for now it is up to date. Any comments, corrections or suggestions are always welcome.

Written by xorl

January 16, 2011 at 21:09

Posted in linux, security

4 Responses


  1. thank you : )

    clnoe

    January 18, 2011 at 07:25

  2. This is not a basic MD4 algorithm. It’s a step-reduced version of it. They do only 8 steps per round instead of 16 for performance reasons.

    gat3way

    January 27, 2011 at 22:34

  3. Cool

    badc0re

    February 6, 2011 at 16:08

  4. Thank you. Helped me a lot to understand the implementation.

    rizwan

    April 12, 2017 at 05:00

