--- linux/drivers/block/scsi_ioctl.c.orig +++ linux/drivers/block/scsi_ioctl.c @@ -319,7 +319,7 @@ static int sg_scsi_ioctl(request_queue_t return -EFAULT; if (in_len > PAGE_SIZE || out_len > PAGE_SIZE) return -EINVAL; - if (get_user(opcode, sic->data)) + if (get_user(opcode, (int *)sic->data)) return -EFAULT; bytes = max(in_len, out_len); --- linux/include/asm-i386/atomic_kmap.h.orig +++ linux/include/asm-i386/atomic_kmap.h @@ -0,0 +1,92 @@ +/* + * atomic_kmap.h: temporary virtual kernel memory mappings + * + * Copyright (C) 2003 Ingo Molnar + */ + +#ifndef _ASM_ATOMIC_KMAP_H +#define _ASM_ATOMIC_KMAP_H + +#ifdef __KERNEL__ + +#include +#include + +#ifdef CONFIG_DEBUG_HIGHMEM +#define HIGHMEM_DEBUG 1 +#else +#define HIGHMEM_DEBUG 0 +#endif + +extern pte_t *kmap_pte; +#define kmap_prot PAGE_KERNEL + +static inline unsigned long __kmap_atomic_vaddr(enum km_type type) +{ + enum fixed_addresses idx; + + idx = type + KM_TYPE_NR*smp_processor_id(); + return __fix_to_virt(FIX_KMAP_BEGIN + idx); +} + +static inline void *__kmap_atomic_noflush(struct page *page, enum km_type type) +{ + enum fixed_addresses idx; + unsigned long vaddr; + + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + /* + * NOTE: entries that rely on some secondary TLB-flush + * effect must not be global: + */ + set_pte(kmap_pte-idx, mk_pte(page, PAGE_KERNEL)); + + return (void*) vaddr; +} + +static inline void *__kmap_atomic(struct page *page, enum km_type type) +{ + enum fixed_addresses idx; + unsigned long vaddr; + + idx = type + KM_TYPE_NR*smp_processor_id(); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); +#if HIGHMEM_DEBUG + BUG_ON(!pte_none(*(kmap_pte-idx))); +#else + /* + * Performance optimization - do not flush if the new + * pte is the same as the old one: + */ + if (pte_val(*(kmap_pte-idx)) == pte_val(mk_pte(page, kmap_prot))) + return (void *) vaddr; +#endif + set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + __flush_tlb_one(vaddr); + + return (void*) vaddr; +} + +static inline void __kunmap_atomic(void *kvaddr, enum km_type type) +{ +#if HIGHMEM_DEBUG + unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; + enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); + + BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)); + /* + * force other mappings to Oops if they'll try to access + * this pte without first remap it + */ + pte_clear(kmap_pte-idx); + __flush_tlb_one(vaddr); +#endif +} + +#define __kunmap_atomic_type(type) \ + __kunmap_atomic((void *)__kmap_atomic_vaddr(type), (type)) + +#endif /* __KERNEL__ */ + +#endif /* _ASM_ATOMIC_KMAP_H */ --- linux/include/asm-i386/checksum.h.orig +++ linux/include/asm-i386/checksum.h @@ -25,7 +25,7 @@ asmlinkage unsigned int csum_partial(con * better 64-bit) boundary */ -asmlinkage unsigned int csum_partial_copy_generic( const char *src, char *dst, int len, int sum, +asmlinkage unsigned int direct_csum_partial_copy_generic( const char *src, char *dst, int len, int sum, int *src_err_ptr, int *dst_err_ptr); /* @@ -39,14 +39,19 @@ static __inline__ unsigned int csum_partial_copy_nocheck ( const char *src, char *dst, int len, int sum) { - return csum_partial_copy_generic ( src, dst, len, sum, NULL, NULL); + /* + * The direct function is OK for kernel-space => kernel-space copies: + */ + return direct_csum_partial_copy_generic ( src, dst, len, sum, NULL, NULL); } static __inline__ unsigned int csum_partial_copy_from_user ( const char *src, char *dst, int len, int sum, int *err_ptr) { - return csum_partial_copy_generic ( src, dst, len, sum, err_ptr, NULL); + if (copy_from_user(dst, src, len)) + *err_ptr = -EFAULT; + return csum_partial(dst, len, sum); } /* @@ -171,11 +176,26 @@ static __inline__ unsigned short int csu * Copy and checksum to user */ #define HAVE_CSUM_COPY_USER -static __inline__ unsigned int csum_and_copy_to_user(const char *src, char *dst, +static __inline__ unsigned int direct_csum_and_copy_to_user(const char *src, char *dst, int len, int sum, int *err_ptr) { if (access_ok(VERIFY_WRITE, dst, len)) - return csum_partial_copy_generic(src, dst, len, sum, NULL, err_ptr); + return direct_csum_partial_copy_generic(src, dst, len, sum, NULL, err_ptr); + + if (len) + *err_ptr = -EFAULT; + + return -1; /* invalid checksum */ +} + +static __inline__ unsigned int csum_and_copy_to_user(const char *src, char *dst, + int len, int sum, int *err_ptr) +{ + if (access_ok(VERIFY_WRITE, dst, len)) { + if (copy_to_user(dst, src, len)) + *err_ptr = -EFAULT; + return csum_partial(src, len, sum); + } if (len) *err_ptr = -EFAULT; --- linux/include/asm-i386/desc.h.orig +++ linux/include/asm-i386/desc.h @@ -21,6 +21,9 @@ struct Xgt_desc_struct { extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS]; +extern void trap_init_virtual_IDT(void); +extern void trap_init_virtual_GDT(void); + #define load_TR_desc() __asm__ __volatile__("ltr %%ax"::"a" (GDT_ENTRY_TSS*8)) #define load_LDT_desc() __asm__ __volatile__("lldt %%ax"::"a" (GDT_ENTRY_LDT*8)) @@ -30,6 +33,7 @@ extern struct Xgt_desc_struct idt_descr, */ extern struct desc_struct default_ldt[]; extern void set_intr_gate(unsigned int irq, void * addr); +extern void set_trap_gate(unsigned int n, void *addr); #define _set_tssldt_desc(n,addr,limit,type) \ __asm__ __volatile__ ("movw %w3,0(%2)\n\t" \ @@ -90,31 +94,8 @@ static inline void load_TLS(struct threa #undef C } -static inline void clear_LDT(void) -{ - int cpu = get_cpu(); - - set_ldt_desc(cpu, &default_ldt[0], 5); - load_LDT_desc(); - put_cpu(); -} - -/* - * load one particular LDT into the current CPU - */ -static inline void load_LDT_nolock(mm_context_t *pc, int cpu) -{ - void *segments = pc->ldt; - int count = pc->size; - - if (likely(!count)) { - segments = &default_ldt[0]; - count = 5; - } - - set_ldt_desc(cpu, segments, count); - load_LDT_desc(); -} +extern struct page *default_ldt_page; +extern void load_LDT_nolock(mm_context_t *pc, int cpu); static inline void load_LDT(mm_context_t *pc) { @@ -123,6 +104,6 @@ static inline void load_LDT(mm_context_t put_cpu(); } -#endif /* !__ASSEMBLY__ */ +#endif /* !__ASSEMBLY__ */ #endif --- linux/include/asm-i386/fixmap.h.orig +++ linux/include/asm-i386/fixmap.h @@ -18,17 +18,15 @@ #include #include #include -#ifdef CONFIG_HIGHMEM #include #include -#endif /* * Here we define all the compile-time 'special' virtual * addresses. The point is to have a constant address at * compile time, but to set the physical address only - * in the boot process. We allocate these special addresses - * from the end of virtual memory (0xfffff000) backwards. + * in the boot process. We allocate these special addresses + * from the end of virtual memory (0xffffe000) backwards. * Also this lets us do fail-safe vmalloc(), we * can guarantee that these special addresses and * vmalloc()-ed addresses never overlap. @@ -41,11 +39,21 @@ * TLB entries of such buffers will not be flushed across * task switches. */ + +/* + * on UP currently we will have no trace of the fixmap mechanizm, + * no page table allocations, etc. This might change in the + * future, say framebuffers for the console driver(s) could be + * fix-mapped? + */ enum fixed_addresses { FIX_HOLE, FIX_VSYSCALL, + FIX_VSTACK_HOLE_1, #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ +#else + FIX_VSTACK_HOLE_2, #endif #ifdef CONFIG_X86_IO_APIC FIX_IO_APIC_BASE_0, @@ -57,16 +65,18 @@ enum fixed_addresses { FIX_LI_PCIA, /* Lithium PCI Bridge A */ FIX_LI_PCIB, /* Lithium PCI Bridge B */ #endif -#ifdef CONFIG_X86_F00F_BUG - FIX_F00F_IDT, /* Virtual mapping for IDT */ -#endif -#ifdef CONFIG_X86_CYCLONE_TIMER + FIX_IDT, + FIX_GDT_1, + FIX_GDT_0, + FIX_TSS_1, + FIX_TSS_0, + FIX_ENTRY_TRAMPOLINE, +#ifdef CONFIG_X86_SUMMIT FIX_CYCLONE_TIMER, /*cyclone timer register*/ + FIX_VSTACK_HOLE_3, #endif -#ifdef CONFIG_HIGHMEM FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, -#endif #ifdef CONFIG_ACPI_BOOT FIX_ACPI_BEGIN, FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1, @@ -95,12 +105,15 @@ extern void __set_fixmap (enum fixed_add __set_fixmap(idx, 0, __pgprot(0)) /* - * used by vmalloc.c. + * used by vmalloc.c and various other places. * * Leave one empty page between vmalloc'ed areas and * the start of the fixmap. + * + * IMPORTANT: dont change FIXADDR_TOP without adjusting KM_VSTACK0 + * and KM_VSTACK1 so that the virtual stack is 8K aligned. */ -#define FIXADDR_TOP (0xfffff000UL) +#define FIXADDR_TOP (0xffffe000UL) #define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) #define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE) --- linux/include/asm-i386/highmem.h.orig +++ linux/include/asm-i386/highmem.h @@ -25,21 +25,22 @@ #include #include +#include + /* declarations for highmem.c */ extern unsigned long highstart_pfn, highend_pfn; -extern pte_t *kmap_pte; -extern pgprot_t kmap_prot; extern pte_t *pkmap_page_table; -extern void kmap_init(void); +extern void kmap_init(void) __init; /* * Right now we initialize only a single pte table. It can be extended * easily, subsequent pte tables have to be allocated in one physical * chunk of RAM. */ -#define PKMAP_BASE (0xff800000UL) +#define PKMAP_BASE (0xff000000UL) +#define NR_SHARED_PMDS ((0xffffffff-PKMAP_BASE+1)/PMD_SIZE) #ifdef CONFIG_X86_PAE #define LAST_PKMAP 512 #else --- linux/include/asm-i386/kmap_types.h.orig +++ linux/include/asm-i386/kmap_types.h @@ -3,28 +3,33 @@ #include -#ifdef CONFIG_DEBUG_HIGHMEM -# define D(n) __KM_FENCE_##n , -#else -# define D(n) -#endif - enum km_type { -D(0) KM_BOUNCE_READ, -D(1) KM_SKB_SUNRPC_DATA, -D(2) KM_SKB_DATA_SOFTIRQ, -D(3) KM_USER0, -D(4) KM_USER1, -D(5) KM_BIO_SRC_IRQ, -D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_PTE2, -D(10) KM_IRQ0, -D(11) KM_IRQ1, -D(12) KM_SOFTIRQ0, -D(13) KM_SOFTIRQ1, -D(14) KM_TYPE_NR + /* + * IMPORTANT: dont move these 3 entries, the virtual stack + * must be 8K aligned. + */ + KM_BOUNCE_READ, + KM_VSTACK1, + KM_VSTACK0, + + KM_LDT_PAGE15, + KM_LDT_PAGE0 = KM_LDT_PAGE15 + 16-1, + KM_USER_COPY, + KM_VSTACK_HOLE, + KM_SKB_SUNRPC_DATA, + KM_SKB_DATA_SOFTIRQ, + KM_USER0, + KM_USER1, + KM_BIO_SRC_IRQ, + KM_BIO_DST_IRQ, + KM_PTE0, + KM_PTE1, + KM_PTE2, + KM_IRQ0, + KM_IRQ1, + KM_SOFTIRQ0, + KM_SOFTIRQ1, + KM_TYPE_NR }; #undef D --- linux/include/asm-i386/mmu_context.h.orig +++ linux/include/asm-i386/mmu_context.h @@ -29,6 +29,10 @@ static inline void switch_mm(struct mm_s { int cpu = smp_processor_id(); +#ifdef CONFIG_X86_SWITCH_PAGETABLES + if (tsk->mm) + tsk->thread_info->user_pgd = (void *)__pa(tsk->mm->pgd); +#endif if (likely(prev != next)) { /* stop flush ipis for the previous mm */ clear_bit(cpu, &prev->cpu_vm_mask); @@ -39,12 +43,14 @@ static inline void switch_mm(struct mm_s set_bit(cpu, &next->cpu_vm_mask); /* Re-load page tables */ +#if !defined(CONFIG_X86_SWITCH_PAGETABLES) load_cr3(next->pgd); +#endif /* * load the LDT, if the LDT is different: */ - if (unlikely(prev->context.ldt != next->context.ldt)) + if (unlikely(prev->context.size + next->context.size)) load_LDT_nolock(&next->context, cpu); } #ifdef CONFIG_SMP @@ -56,7 +62,9 @@ static inline void switch_mm(struct mm_s /* We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload %cr3. */ +#if !defined(CONFIG_X86_SWITCH_PAGETABLES) load_cr3(next->pgd); +#endif load_LDT_nolock(&next->context, cpu); } } @@ -67,6 +75,6 @@ static inline void switch_mm(struct mm_s asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0)) #define activate_mm(prev, next) \ - switch_mm((prev),(next),NULL) + switch_mm((prev),(next),current) #endif --- linux/include/asm-i386/processor.h.orig +++ linux/include/asm-i386/processor.h @@ -283,7 +283,7 @@ extern unsigned int mca_pentium_flag; /* * User space process size: 3GB (default). */ -#define TASK_SIZE (PAGE_OFFSET) +#define TASK_SIZE (PAGE_OFFSET_USER) /* This decides where the kernel will search for a free chunk of vm * space during mmap's. @@ -388,6 +388,7 @@ struct tss_struct { struct thread_struct { /* cached TLS descriptors. */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; + void *stack_page0, *stack_page1; unsigned long esp0; unsigned long eip; unsigned long esp; @@ -467,6 +468,13 @@ extern void prepare_to_copy(struct task_ */ extern int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); +#ifdef CONFIG_X86_HIGH_ENTRY +#define virtual_esp0(tsk) \ + ((unsigned long)(tsk)->thread_info->virtual_stack + ((tsk)->thread.esp0 - (unsigned long)(tsk)->thread_info->real_stack)) +#else +# define virtual_esp0(tsk) ((tsk)->thread.esp0) +#endif + extern unsigned long thread_saved_pc(struct task_struct *tsk); void show_trace(struct task_struct *task, unsigned long *stack); --- linux/include/asm-i386/mmu.h.orig +++ linux/include/asm-i386/mmu.h @@ -8,10 +8,13 @@ * * cpu_vm_mask is used to optimize ldt flushing. */ + +#define MAX_LDT_PAGES 16 + typedef struct { int size; struct semaphore sem; - void *ldt; + struct page *ldt_pages[MAX_LDT_PAGES]; } mm_context_t; #endif --- linux/include/asm-i386/page.h.orig +++ linux/include/asm-i386/page.h @@ -1,6 +1,8 @@ #ifndef _I386_PAGE_H #define _I386_PAGE_H +#include + /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 #define PAGE_SIZE (1UL << PAGE_SHIFT) @@ -9,11 +11,21 @@ #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) #define LARGE_PAGE_SIZE (1UL << PMD_SHIFT) -#ifdef __KERNEL__ -#ifndef __ASSEMBLY__ +/* + * these will go away - right now they keep all the + * 4G/4G sub-features split up, for debugging: + */ +#ifdef CONFIG_X86_4G +#define CONFIG_X86_SWITCH_PAGETABLES y +#define CONFIG_X86_4G_VM_LAYOUT y +#define CONFIG_X86_UACCESS_INDIRECT y +#define CONFIG_X86_HIGH_ENTRY y +#endif #include +#ifdef __KERNEL__ +#ifndef __ASSEMBLY__ #ifdef CONFIG_X86_USE_3DNOW #include @@ -88,8 +100,19 @@ typedef struct { unsigned long pgprot; } * * If you want more physical memory than this then see the CONFIG_HIGHMEM4G * and CONFIG_HIGHMEM64G options in the kernel configuration. + * + * Note: on PAE the kernel must never go below 32 MB, we use the + * first 8 entries of the 2-level boot pgd for PAE magic. */ +#ifdef CONFIG_X86_4G_VM_LAYOUT +# define __PAGE_OFFSET (0x02000000) +# define __PAGE_OFFSET_USER (0xff000000) +#else +# define __PAGE_OFFSET (0xc0000000) +# define __PAGE_OFFSET_USER (0xc0000000) +#endif + /* * This much address space is reserved for vmalloc() and iomap() * as well as fixmap mappings. @@ -114,16 +137,11 @@ static __inline__ int get_order(unsigned #endif /* __ASSEMBLY__ */ -#ifdef __ASSEMBLY__ -#define __PAGE_OFFSET (0xC0000000) -#else -#define __PAGE_OFFSET (0xC0000000UL) -#endif - - #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) +#define PAGE_OFFSET_USER ((unsigned long)__PAGE_OFFSET_USER) #define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) -#define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) +#define __MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) +#define MAXMEM ((unsigned long)(-PAGE_OFFSET-VMALLOC_RESERVE)) #define __pa(x) ((unsigned long)(x)-PAGE_OFFSET) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) #define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT) --- linux/include/asm-i386/pgtable.h.orig +++ linux/include/asm-i386/pgtable.h @@ -39,6 +39,30 @@ extern unsigned long empty_zero_page[102 * newer 3-level PAE-mode page tables. */ #ifndef __ASSEMBLY__ + +extern void set_system_gate(unsigned int n, void *addr); +extern void init_entry_mappings(void); +extern void entry_trampoline_setup(void); + +#ifdef CONFIG_X86_HIGH_ENTRY + +extern char entry_tramp_start, entry_tramp_end; + +/* + * Fix up symbol addresses in the trampoline fixmapped just below 4G. + * + * (TODO: do boot-time code fixups not these runtime fixups.) + */ +#define __ENTRY_TRAMP_ADDR(symb) ((void *)(symb) - (void *) &entry_tramp_start + (void *) __fix_to_virt(FIX_ENTRY_TRAMPOLINE)) + +#define ENTRY_TRAMP_ADDR(symb) \ + ({ void *__ret; if ((void *)(symb) >= (void *)&entry_tramp_start && (void *)(symb) < (void *)&entry_tramp_end) __ret = __ENTRY_TRAMP_ADDR(symb); else __ret = (symb); __ret; }) + +#else +# define ENTRY_TRAMP_ADDR(symb) (symb) +# define __ENTRY_TRAMP_ADDR(symb) (symb) +#endif + #ifdef CONFIG_X86_PAE # include @@ -63,7 +87,12 @@ extern void pgtable_cache_init(void); #define PGDIR_SIZE (1UL << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE-1)) -#define USER_PTRS_PER_PGD (TASK_SIZE/PGDIR_SIZE) +#if defined(CONFIG_X86_PAE) && defined(CONFIG_X86_4G_VM_LAYOUT) +# define USER_PTRS_PER_PGD 4 +#else +# define USER_PTRS_PER_PGD ((TASK_SIZE/PGDIR_SIZE) + ((TASK_SIZE % PGDIR_SIZE) + PGDIR_SIZE-1)/PGDIR_SIZE) +#endif + #define FIRST_USER_PGD_NR 0 #define USER_PGD_PTRS (PAGE_OFFSET >> PGDIR_SHIFT) @@ -234,6 +263,7 @@ static inline void ptep_mkdirty(pte_t *p #define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot)) #define mk_pte_huge(entry) ((entry).pte_low |= _PAGE_PRESENT | _PAGE_PSE) +#define mk_pte_phys(physpage, pgprot) pfn_pte((physpage) >> PAGE_SHIFT, pgprot) static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { --- linux/include/asm-i386/thread_info.h.orig +++ linux/include/asm-i386/thread_info.h @@ -33,22 +33,26 @@ struct thread_info { 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ - struct restart_block restart_block; + void *real_stack, *virtual_stack, *user_pgd; __u8 supervisor_stack[0]; + struct restart_block restart_block; }; #else /* !__ASSEMBLY__ */ /* offsets into the thread_info struct for assembly code access */ -#define TI_TASK 0x00000000 -#define TI_EXEC_DOMAIN 0x00000004 -#define TI_FLAGS 0x00000008 -#define TI_STATUS 0x0000000C -#define TI_CPU 0x00000010 -#define TI_PRE_COUNT 0x00000014 -#define TI_ADDR_LIMIT 0x00000018 -#define TI_RESTART_BLOCK 0x000001C +#define TI_TASK 0x00000000 +#define TI_EXEC_DOMAIN 0x00000004 +#define TI_FLAGS 0x00000008 +#define TI_STATUS 0x0000000C +#define TI_CPU 0x00000010 +#define TI_PRE_COUNT 0x00000014 +#define TI_ADDR_LIMIT 0x00000018 +#define TI_REAL_STACK 0x0000001C +#define TI_VIRT_STACK 0x00000020 +#define TI_USER_PGD 0x00000024 +#define TI_RESTART_BLOCK 0x00000028 #endif @@ -61,7 +65,7 @@ struct thread_info { */ #ifndef __ASSEMBLY__ -#define INIT_THREAD_INFO(tsk) \ +#define INIT_THREAD_INFO(tsk, thread_info) \ { \ .task = &tsk, \ .exec_domain = &default_exec_domain, \ @@ -72,6 +76,7 @@ struct thread_info { .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ + .real_stack = &thread_info, \ } #define init_thread_info (init_thread_union.thread_info) --- linux/include/asm-i386/tlbflush.h.orig +++ linux/include/asm-i386/tlbflush.h @@ -85,22 +85,28 @@ extern unsigned long pgkern_mask; static inline void flush_tlb_mm(struct mm_struct *mm) { +#ifndef CONFIG_X86_SWITCH_PAGETABLES if (mm == current->active_mm) __flush_tlb(); +#endif } static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) { +#ifndef CONFIG_X86_SWITCH_PAGETABLES if (vma->vm_mm == current->active_mm) __flush_tlb_one(addr); +#endif } static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { +#ifndef CONFIG_X86_SWITCH_PAGETABLES if (vma->vm_mm == current->active_mm) __flush_tlb(); +#endif } #else @@ -111,11 +117,10 @@ static inline void flush_tlb_range(struc __flush_tlb() extern void flush_tlb_all(void); -extern void flush_tlb_current_task(void); extern void flush_tlb_mm(struct mm_struct *); extern void flush_tlb_page(struct vm_area_struct *, unsigned long); -#define flush_tlb() flush_tlb_current_task() +#define flush_tlb() flush_tlb_all() static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long start, unsigned long end) { --- linux/include/asm-i386/string.h.orig +++ linux/include/asm-i386/string.h @@ -56,6 +56,29 @@ __asm__ __volatile__( return dest; } +/* + * This is a more generic variant of strncpy_count() suitable for + * implementing string-access routines with all sorts of return + * code semantics. It's used by mm/usercopy.c. + */ +static inline size_t strncpy_count(char * dest,const char *src,size_t count) +{ + __asm__ __volatile__( + + "1:\tdecl %0\n\t" + "js 2f\n\t" + "lodsb\n\t" + "stosb\n\t" + "testb %%al,%%al\n\t" + "jne 1b\n\t" + "2:" + "incl %0" + : "=c" (count) + :"S" (src),"D" (dest),"0" (count) : "memory"); + + return count; +} + #define __HAVE_ARCH_STRCAT static inline char * strcat(char * dest,const char * src) { --- linux/include/asm-i386/uaccess.h.orig +++ linux/include/asm-i386/uaccess.h @@ -26,7 +26,7 @@ #define KERNEL_DS MAKE_MM_SEG(0xFFFFFFFFUL) -#define USER_DS MAKE_MM_SEG(PAGE_OFFSET) +#define USER_DS MAKE_MM_SEG(PAGE_OFFSET_USER) #define get_ds() (KERNEL_DS) #define get_fs() (current_thread_info()->addr_limit) @@ -149,6 +149,45 @@ extern void __get_user_4(void); :"=a" (ret),"=d" (x) \ :"0" (ptr)) +extern int get_user_size(unsigned int size, void *val, const void *ptr); +extern int put_user_size(unsigned int size, const void *val, void *ptr); +extern int zero_user_size(unsigned int size, void *ptr); +extern int copy_str_fromuser_size(unsigned int size, void *val, const void *ptr); +extern int strlen_fromuser_size(unsigned int size, const void *ptr); + + +# define indirect_get_user(x,ptr) \ +({ int __ret_gu,__val_gu; \ + __typeof__(ptr) __ptr_gu = (ptr); \ + __ret_gu = get_user_size(sizeof(*__ptr_gu), &__val_gu,__ptr_gu) ? -EFAULT : 0;\ + (x) = (__typeof__(*__ptr_gu))__val_gu; \ + __ret_gu; \ +}) +#define indirect_put_user(x,ptr) \ +({ \ + __typeof__(*(ptr)) *__ptr_pu = (ptr), __x_pu = (x); \ + put_user_size(sizeof(*__ptr_pu), &__x_pu, __ptr_pu) ? -EFAULT : 0; \ +}) +#define __indirect_put_user indirect_put_user +#define __indirect_get_user indirect_get_user + +#define indirect_copy_from_user(to,from,n) get_user_size(n,to,from) +#define indirect_copy_to_user(to,from,n) put_user_size(n,from,to) + +#define __indirect_copy_from_user indirect_copy_from_user +#define __indirect_copy_to_user indirect_copy_to_user + +#define indirect_strncpy_from_user(dst, src, count) \ + copy_str_fromuser_size(count, dst, src) + +extern int strlen_fromuser_size(unsigned int size, const void *ptr); +#define indirect_strnlen_user(str, n) strlen_fromuser_size(n, str) +#define indirect_strlen_user(str) indirect_strnlen_user(str, ~0UL >> 1) + +extern int zero_user_size(unsigned int size, void *ptr); + +#define indirect_clear_user(mem, len) zero_user_size(len, mem) +#define __indirect_clear_user clear_user /* Careful: we have to cast the result to the type of the pointer for sign reasons */ /** @@ -168,7 +207,7 @@ extern void __get_user_4(void); * Returns zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ -#define get_user(x,ptr) \ +#define direct_get_user(x,ptr) \ ({ int __ret_gu,__val_gu; \ switch(sizeof (*(ptr))) { \ case 1: __get_user_x(1,__ret_gu,__val_gu,ptr); break; \ @@ -198,7 +237,7 @@ extern void __put_user_bad(void); * * Returns zero on success, or -EFAULT on error. */ -#define put_user(x,ptr) \ +#define direct_put_user(x,ptr) \ __put_user_check((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr))) @@ -222,7 +261,7 @@ extern void __put_user_bad(void); * Returns zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ -#define __get_user(x,ptr) \ +#define __direct_get_user(x,ptr) \ __get_user_nocheck((x),(ptr),sizeof(*(ptr))) @@ -245,7 +284,7 @@ extern void __put_user_bad(void); * * Returns zero on success, or -EFAULT on error. */ -#define __put_user(x,ptr) \ +#define __direct_put_user(x,ptr) \ __put_user_nocheck((__typeof__(*(ptr)))(x),(ptr),sizeof(*(ptr))) #define __put_user_nocheck(x,ptr,size) \ @@ -395,7 +434,7 @@ unsigned long __copy_from_user_ll(void * * On success, this will be zero. */ static inline unsigned long -__copy_to_user(void __user *to, const void *from, unsigned long n) +__direct_copy_to_user(void __user *to, const void *from, unsigned long n) { if (__builtin_constant_p(n)) { unsigned long ret; @@ -433,7 +472,7 @@ __copy_to_user(void __user *to, const vo * data to the requested size using zero bytes. */ static inline unsigned long -__copy_from_user(void *to, const void __user *from, unsigned long n) +__direct_copy_from_user(void *to, const void __user *from, unsigned long n) { if (__builtin_constant_p(n)) { unsigned long ret; @@ -467,10 +506,10 @@ __copy_from_user(void *to, const void __ * On success, this will be zero. */ static inline unsigned long -copy_to_user(void __user *to, const void *from, unsigned long n) +direct_copy_to_user(void __user *to, const void *from, unsigned long n) { if (access_ok(VERIFY_WRITE, to, n)) - n = __copy_to_user(to, from, n); + n = __direct_copy_to_user(to, from, n); return n; } @@ -491,10 +530,10 @@ copy_to_user(void __user *to, const void * data to the requested size using zero bytes. */ static inline unsigned long -copy_from_user(void *to, const void __user *from, unsigned long n) +direct_copy_from_user(void *to, const void __user *from, unsigned long n) { if (access_ok(VERIFY_READ, from, n)) - n = __copy_from_user(to, from, n); + n = __direct_copy_from_user(to, from, n); else memset(to, 0, n); return n; @@ -517,10 +556,68 @@ long __strncpy_from_user(char *dst, cons * If there is a limit on the length of a valid string, you may wish to * consider using strnlen_user() instead. */ -#define strlen_user(str) strnlen_user(str, ~0UL >> 1) -long strnlen_user(const char __user *str, long n); -unsigned long clear_user(void __user *mem, unsigned long len); -unsigned long __clear_user(void __user *mem, unsigned long len); +long direct_strncpy_from_user(char *dst, const char *src, long count); +long __direct_strncpy_from_user(char *dst, const char *src, long count); +#define direct_strlen_user(str) direct_strnlen_user(str, ~0UL >> 1) +long direct_strnlen_user(const char *str, long n); +unsigned long direct_clear_user(void *mem, unsigned long len); +unsigned long __direct_clear_user(void *mem, unsigned long len); + +extern int indirect_uaccess; + +#ifdef CONFIG_X86_UACCESS_INDIRECT + +/* + * Return code and zeroing semantics: + + __clear_user 0 <-> bytes not done + clear_user 0 <-> bytes not done + __copy_to_user 0 <-> bytes not done + copy_to_user 0 <-> bytes not done + __copy_from_user 0 <-> bytes not done, zero rest + copy_from_user 0 <-> bytes not done, zero rest + __get_user 0 <-> -EFAULT + get_user 0 <-> -EFAULT + __put_user 0 <-> -EFAULT + put_user 0 <-> -EFAULT + strlen_user strlen + 1 <-> 0 + strnlen_user strlen + 1 (or n+1) <-> 0 + strncpy_from_user strlen (or n) <-> -EFAULT + + */ + +#define __clear_user(mem,len) __indirect_clear_user(mem,len) +#define clear_user(mem,len) indirect_clear_user(mem,len) +#define __copy_to_user(to,from,n) __indirect_copy_to_user(to,from,n) +#define copy_to_user(to,from,n) indirect_copy_to_user(to,from,n) +#define __copy_from_user(to,from,n) __indirect_copy_from_user(to,from,n) +#define copy_from_user(to,from,n) indirect_copy_from_user(to,from,n) +#define __get_user(val,ptr) __indirect_get_user(val,ptr) +#define get_user(val,ptr) indirect_get_user(val,ptr) +#define __put_user(val,ptr) __indirect_put_user(val,ptr) +#define put_user(val,ptr) indirect_put_user(val,ptr) +#define strlen_user(str) indirect_strlen_user(str) +#define strnlen_user(src,count) indirect_strnlen_user(src,count) +#define strncpy_from_user(dst,src,count) \ + indirect_strncpy_from_user(dst,src,count) + +#else + +#define __clear_user __direct_clear_user +#define clear_user direct_clear_user +#define __copy_to_user __direct_copy_to_user +#define copy_to_user direct_copy_to_user +#define __copy_from_user __direct_copy_from_user +#define copy_from_user direct_copy_from_user +#define __get_user __direct_get_user +#define get_user direct_get_user +#define __put_user __direct_put_user +#define put_user direct_put_user +#define strlen_user direct_strlen_user +#define strnlen_user direct_strnlen_user +#define strncpy_from_user direct_strncpy_from_user + +#endif /* CONFIG_X86_UACCESS_INDIRECT */ #endif /* __i386_UACCESS_H */ --- linux/arch/i386/kernel/cpu/intel.c.orig +++ linux/arch/i386/kernel/cpu/intel.c @@ -8,11 +8,11 @@ #include #include #include +#include #include "cpu.h" static int disable_P4_HT __initdata = 0; -extern int trap_init_f00f_bug(void); #ifdef CONFIG_X86_INTEL_USERCOPY /* @@ -165,7 +165,7 @@ static void __init init_intel(struct cpu c->f00f_bug = 1; if ( !f00f_workaround_enabled ) { - trap_init_f00f_bug(); + trap_init_virtual_IDT(); printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); f00f_workaround_enabled = 1; } --- linux/arch/i386/kernel/cpu/common.c.orig +++ linux/arch/i386/kernel/cpu/common.c @@ -497,16 +497,20 @@ void __init cpu_init (void) BUG(); enter_lazy_tlb(&init_mm, current); - load_esp0(t, thread->esp0); - set_tss_desc(cpu,t); + t->esp0 = thread->esp0; + set_tss_desc(cpu, t); cpu_gdt_table[cpu][GDT_ENTRY_TSS].b &= 0xfffffdff; load_TR_desc(); - load_LDT(&init_mm.context); + if (cpu) + load_LDT(&init_mm.context); /* Set up doublefault TSS pointer in the GDT */ __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); cpu_gdt_table[cpu][GDT_ENTRY_DOUBLEFAULT_TSS].b &= 0xfffffdff; + if (cpu) + trap_init_virtual_GDT(); + /* Clear %fs and %gs. */ asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); --- linux/arch/i386/kernel/entry.S.orig +++ linux/arch/i386/kernel/entry.S @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include "irq_vectors.h" @@ -85,6 +86,151 @@ TSS_ESP0_OFFSET = (4 - 0x200) #define resume_kernel restore_all #endif +#ifdef CONFIG_X86_HIGH_ENTRY + +#define call_SYMBOL_NAME_ABS(X) movl $X, %edi; call *%edi + +#define __SAVE_ALL \ + cld; \ + pushl %es; \ + pushl %ds; \ + pushl %eax; \ + pushl %ebp; \ + pushl %edi; \ + pushl %esi; \ + pushl %edx; \ + pushl %ecx; \ + pushl %ebx; \ + movl $(__USER_DS),%edx; \ + movl %edx,%ds; \ + movl %edx,%es; + +#define __RESTORE_ALL \ + popl %ebx; \ + popl %ecx; \ + popl %edx; \ + popl %esi; \ + popl %edi; \ + popl %ebp; \ + popl %eax; \ +111: popl %ds; \ +222: popl %es; \ + addl $4,%esp; \ +333: iret; \ + \ +444: movl $0,(%esp); \ + jmp 111b; \ +555: movl $0,(%esp); \ + jmp 222b; \ +.section .fixup,"ax"; \ +666: pushl %ss; \ + popl %ds; \ + pushl %ss; \ + popl %es; \ + pushl $11; \ + call_SYMBOL_NAME_ABS(do_exit); \ +.previous; \ +.section __ex_table,"a"; \ + .align 4; \ + .long 111b,444b; \ + .long 222b,555b; \ + .long 333b,666b; \ +.previous + +/* clobbers edx, ebx */ + +#ifdef CONFIG_X86_SWITCH_PAGETABLES + +#define __SWITCH_TO_KERNEL_PGD \ + movl %cr3, %edx; \ + cmpl $swapper_pg_dir-__PAGE_OFFSET, %edx; \ + jz 1f; \ + movl $swapper_pg_dir-__PAGE_OFFSET, %edx; \ + movl %edx, %cr3; \ +1: + +#else +#define __SWITCH_TO_KERNEL_PGD \ + movl %cr3, %edx; \ + cmpl $swapper_pg_dir-__PAGE_OFFSET, %edx; \ + jz 1f; \ + movl $swapper_pg_dir-__PAGE_OFFSET, %edx; \ + /* movl %edx, %cr3; */ \ +1: + +#endif + + +#ifdef CONFIG_X86_SWITCH_PAGETABLES + +#define __SWITCH_TO_USER_PGD \ + movl %ecx, %cr3; + +#else + +#define __SWITCH_TO_USER_PGD \ + /* movl %ecx, %cr3; */ +#endif + +#define __SWITCH \ + \ + __SWITCH_TO_KERNEL_PGD \ + \ + /* load the real stack - keep the offset */ \ + \ + GET_THREAD_INFO(%ebx); \ + movl TI_REAL_STACK(%ebx), %edx; \ + movl %esp, %ebx; \ + andl $0x1fff, %ebx; \ + orl %ebx, %edx; \ + movl %edx, %esp; + +#define SAVE_ALL \ + __SAVE_ALL; \ + __SWITCH + +#define RESTORE_ALL \ + /* interrupted the user return path? */ \ + movl return_path_start, %eax; \ + cmpl %eax, EIP(%esp); \ + jb 33f; /* nope - continue with user check */ \ + movl return_path_end, %eax; \ + cmpl %eax, EIP(%esp); \ + jb 22f; /* yes - switch to virtual stack */ \ +33: \ + /* return to userspace? */ \ + \ + movl EFLAGS(%esp),%ecx; \ + movb CS(%esp),%cl; \ + testl $(VM_MASK | 3),%ecx; \ + jz 2f; \ +22: \ + /* switch to the virtual stack, then switch */ \ + /* userspace pagetable. */ \ + \ + GET_THREAD_INFO(%esi); \ + movl TI_VIRT_STACK(%esi), %edx; \ + movl TI_USER_PGD(%esi), %ecx; \ +.globl return_path_start_marker; \ +return_path_start_marker: \ + movl %esp, %ebx; \ + andl $0x1fff, %ebx; \ + orl %ebx, %edx; \ + movl %esp, %eax; \ + movl %edx, %esp; \ + \ + __SWITCH_TO_USER_PGD \ + __RESTORE_ALL; \ +.globl return_path_end_marker; \ +return_path_end_marker: \ +2: \ + __RESTORE_ALL; + +#else /* !CONFIG_X86_HIGH_ENTRY */ + +#define call_SYMBOL_NAME_ABS(X) call X + +#define __SWITCH #define SAVE_ALL \ cld; \ pushl %es; \ @@ -143,44 +289,13 @@ TSS_ESP0_OFFSET = (4 - 0x200) .long 1b,2b; \ .previous +#endif - -ENTRY(lcall7) - pushfl # We get a different stack layout with call - # gates, which has to be cleaned up later.. - pushl %eax - SAVE_ALL - movl %esp, %ebp - pushl %ebp - pushl $0x7 -do_lcall: - movl EIP(%ebp), %eax # due to call gates, this is eflags, not eip.. - movl CS(%ebp), %edx # this is eip.. - movl EFLAGS(%ebp), %ecx # and this is cs.. - movl %eax,EFLAGS(%ebp) # - movl %edx,EIP(%ebp) # Now we move them to their "normal" places - movl %ecx,CS(%ebp) # - andl $-8192, %ebp # GET_THREAD_INFO - movl TI_EXEC_DOMAIN(%ebp), %edx # Get the execution domain - call *4(%edx) # Call the lcall7 handler for the domain - addl $4, %esp - popl %eax - jmp resume_userspace - -ENTRY(lcall27) - pushfl # We get a different stack layout with call - # gates, which has to be cleaned up later.. - pushl %eax - SAVE_ALL - movl %esp, %ebp - pushl %ebp - pushl $0x27 - jmp do_lcall - +.section .entry.text,"ax" ENTRY(ret_from_fork) pushl %eax - call schedule_tail + call_SYMBOL_NAME_ABS(schedule_tail) GET_THREAD_INFO(%ebp) popl %eax jmp syscall_exit @@ -224,7 +339,7 @@ need_resched: jz restore_all movl $PREEMPT_ACTIVE,TI_PRE_COUNT(%ebp) sti - call schedule + call_SYMBOL_NAME_ABS(schedule) movl $0,TI_PRE_COUNT(%ebp) cli jmp need_resched @@ -236,6 +351,7 @@ need_resched: # sysenter call handler stub ENTRY(sysenter_entry) movl TSS_ESP0_OFFSET(%esp),%esp + sysenter_past_esp: sti pushl $(__USER_DS) @@ -248,7 +364,7 @@ sysenter_past_esp: * Load the potential sixth argument from user stack. * Careful about security. */ - cmpl $__PAGE_OFFSET-3,%ebp + cmpl $__PAGE_OFFSET_USER-3,%ebp jae syscall_fault 1: movl (%ebp),%ebp .section __ex_table,"a" @@ -270,9 +386,27 @@ sysenter_past_esp: movl TI_FLAGS(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx jne syscall_exit_work + +#ifdef CONFIG_X86_SWITCH_PAGETABLES + +#if 0 + GET_THREAD_INFO(%esi) + movl TI_VIRT_STACK(%esi), %edx + movl TI_USER_PGD(%esi), %ecx + movl %esp, %ebx + andl $0x1fff, %ebx + orl %ebx, %edx + movl %edx, %esp + movl %ecx, %cr3 +#endif + +#endif + /* if something modifies registers it must also disable sysexit */ + movl EIP(%esp), %edx movl OLDESP(%esp), %ecx + sti sysexit @@ -306,7 +440,7 @@ work_pending: testb $_TIF_NEED_RESCHED, %cl jz work_notifysig work_resched: - call schedule + call_SYMBOL_NAME_ABS(schedule) cli # make sure we don't miss an interrupt # setting need_resched or sigpending # between sampling and the iret @@ -324,17 +458,17 @@ work_notifysig: # deal with pending s jne work_notifysig_v86 # returning to kernel-space or # vm86-space xorl %edx, %edx - call do_notify_resume + call_SYMBOL_NAME_ABS(do_notify_resume) jmp restore_all ALIGN work_notifysig_v86: pushl %ecx - call save_v86_state + call_SYMBOL_NAME_ABS(save_v86_state) popl %ecx movl %eax, %esp xorl %edx, %edx - call do_notify_resume + call_SYMBOL_NAME_ABS(do_notify_resume) jmp restore_all # perform syscall exit tracing @@ -343,7 +477,7 @@ syscall_trace_entry: movl $-ENOSYS,EAX(%esp) movl %esp, %eax xorl %edx,%edx - call do_syscall_trace + call_SYMBOL_NAME_ABS(do_syscall_trace) movl ORIG_EAX(%esp), %eax cmpl $(nr_syscalls), %eax jnae syscall_call @@ -358,7 +492,7 @@ syscall_exit_work: # schedule() instead movl %esp, %eax movl $1, %edx - call do_syscall_trace + call_SYMBOL_NAME_ABS(do_syscall_trace) jmp resume_userspace ALIGN @@ -380,7 +514,7 @@ syscall_badsys: */ .data ENTRY(interrupt) -.text +.previous vector=0 ENTRY(irq_entries_start) @@ -390,21 +524,21 @@ ENTRY(irq_entries_start) jmp common_interrupt .data .long 1b -.text +.previous vector=vector+1 .endr ALIGN common_interrupt: SAVE_ALL - call do_IRQ + call_SYMBOL_NAME_ABS(do_IRQ) jmp ret_from_intr #define BUILD_INTERRUPT(name, nr) \ ENTRY(name) \ pushl $nr-256; \ SAVE_ALL \ - call smp_/**/name; \ + call_SYMBOL_NAME_ABS(smp_/**/name); \ jmp ret_from_intr; /* The include is where all of the SMP etc. interrupts come from */ @@ -431,13 +565,19 @@ error_code: movl ES(%esp), %edi # get the function address movl %eax, ORIG_EAX(%esp) movl %ecx, ES(%esp) - movl %esp, %edx pushl %esi # push the error code - pushl %edx # push the pt_regs pointer movl $(__USER_DS), %edx movl %edx, %ds movl %edx, %es + +/* clobbers edx, ebx */ + __SWITCH + + leal 4(%esp), %edx # prepare pt_regs + pushl %edx # push pt_regs + call *%edi + addl $8, %esp jmp ret_from_exception @@ -458,11 +598,11 @@ ENTRY(device_not_available) testl $0x4, %eax # EM (math emulation bit) jne device_not_available_emulate preempt_stop - call math_state_restore + call_SYMBOL_NAME_ABS(math_state_restore) jmp ret_from_exception device_not_available_emulate: pushl $0 # temporary storage for ORIG_EIP - call math_emulate + call_SYMBOL_NAME_ABS(math_emulate) addl $4, %esp jmp ret_from_exception @@ -525,9 +665,9 @@ nmi_stack_correct: movl %esp, %edx pushl $0 pushl %edx - call do_nmi + call_SYMBOL_NAME_ABS(do_nmi) addl $8, %esp - RESTORE_ALL + jmp restore_all nmi_stack_fixup: FIX_STACK(12,nmi_stack_correct, 1) @@ -604,6 +744,8 @@ ENTRY(spurious_interrupt_bug) pushl $do_spurious_interrupt_bug jmp error_code +.previous + .data ENTRY(sys_call_table) .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ --- linux/arch/i386/kernel/entry_trampoline.c.orig +++ linux/arch/i386/kernel/entry_trampoline.c @@ -0,0 +1,75 @@ +/* + * linux/arch/i386/kernel/entry_trampoline.c + * + * (C) Copyright 2003 Ingo Molnar + * + * This file contains the needed support code for 4GB userspace + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void *return_path_start, *return_path_end; + +extern char return_path_start_marker, return_path_end_marker; + +void __init init_entry_mappings(void) +{ +#ifdef CONFIG_X86_HIGH_ENTRY + void *tramp; + + /* + * We need a high IDT and GDT for the 4G/4G split: + */ + trap_init_virtual_IDT(); + + __set_fixmap(FIX_ENTRY_TRAMPOLINE, __pa((unsigned long)&entry_tramp_start), PAGE_KERNEL); + tramp = (void *)fix_to_virt(FIX_ENTRY_TRAMPOLINE); + + printk("mapped 4G/4G trampoline to %p.\n", tramp); + /* + * Virtual kernel stack: + */ + BUG_ON(__kmap_atomic_vaddr(KM_VSTACK0) & 8191); + BUG_ON(sizeof(struct desc_struct)*NR_CPUS*GDT_ENTRIES > 2*PAGE_SIZE); + BUG_ON((unsigned int)&entry_tramp_end - (unsigned int)&entry_tramp_start > PAGE_SIZE); + + /* + * set up the initial thread's virtual stack related + * fields: + */ + current->thread.stack_page0 = virt_to_page((char *)current->thread_info); + current->thread.stack_page1 = virt_to_page((char *)current->thread_info + PAGE_SIZE); + current->thread_info->virtual_stack = (void *)__kmap_atomic_vaddr(KM_VSTACK0); + + __kunmap_atomic_type(KM_VSTACK0); + __kunmap_atomic_type(KM_VSTACK1); + __kmap_atomic(current->thread.stack_page0, KM_VSTACK0); + __kmap_atomic(current->thread.stack_page1, KM_VSTACK1); + + return_path_start = ENTRY_TRAMP_ADDR(&return_path_start_marker); + return_path_end = ENTRY_TRAMP_ADDR(&return_path_end_marker); +#endif + current->thread_info->real_stack = (void *)current->thread_info; + current->thread_info->user_pgd = NULL; + current->thread.esp0 = (unsigned long)current->thread_info->real_stack + THREAD_SIZE; +} + + + +void __init entry_trampoline_setup(void) +{ + /* + * old IRQ entries set up by the boot code will still hang + * around - they are a sign of hw trouble anyway, now they'll + * produce a double fault message. + */ + trap_init_virtual_GDT(); +} --- linux/arch/i386/kernel/reboot.c.orig +++ linux/arch/i386/kernel/reboot.c @@ -155,6 +155,12 @@ void machine_real_restart(unsigned char from the kernel segment. This assumes the kernel segment starts at virtual address PAGE_OFFSET. */ + /* + * NOTE: this is a wrong 4G/4G PAE assumption. But it will triple + * fault the CPU (ie. reboot it) in a guaranteed way so we dont + * lose anything but the ability to warm-reboot. (which doesnt + * work on those big boxes using 4G/4G PAE anyway.) + */ memcpy (swapper_pg_dir, swapper_pg_dir + USER_PGD_PTRS, sizeof (swapper_pg_dir [0]) * KERNEL_PGD_PTRS); --- linux/arch/i386/kernel/head.S.orig +++ linux/arch/i386/kernel/head.S @@ -377,23 +377,27 @@ cpu_gdt_descr: .fill NR_CPUS-1,8,0 # space for the other GDT descriptors /* - * This is initialized to create an identity-mapping at 0-8M (for bootup - * purposes) and another mapping of the 0-8M area at virtual address + * This is initialized to create an identity-mapping at 0-16M (for bootup + * purposes) and another mapping of the 0-16M area at virtual address * PAGE_OFFSET. */ .org 0x1000 ENTRY(swapper_pg_dir) .long 0x00102007 .long 0x00103007 - .fill BOOT_USER_PGD_PTRS-2,4,0 - /* default: 766 entries */ + .long 0x00104007 + .long 0x00105007 + .fill BOOT_USER_PGD_PTRS-4,4,0 + /* default: 764 entries */ .long 0x00102007 .long 0x00103007 - /* default: 254 entries */ - .fill BOOT_KERNEL_PGD_PTRS-2,4,0 + .long 0x00104007 + .long 0x00105007 + /* default: 252 entries */ + .fill BOOT_KERNEL_PGD_PTRS-4,4,0 /* - * The page tables are initialized to only 8MB here - the final page + * The page tables are initialized to only 16MB here - the final page * tables are set up later depending on memory size. */ .org 0x2000 @@ -402,15 +406,21 @@ ENTRY(pg0) .org 0x3000 ENTRY(pg1) +.org 0x4000 +ENTRY(pg2) + +.org 0x5000 +ENTRY(pg3) + /* * empty_zero_page must immediately follow the page tables ! (The * initialization loop counts until empty_zero_page) */ -.org 0x4000 +.org 0x6000 ENTRY(empty_zero_page) -.org 0x5000 +.org 0x7000 /* * Real beginning of normal "text" segment @@ -419,12 +429,12 @@ ENTRY(stext) ENTRY(_stext) /* - * This starts the data section. Note that the above is all - * in the text section because it has alignment requirements - * that we cannot fulfill any other way. + * This starts the data section. */ .data +.align 4096 + /* * The Global Descriptor Table contains 28 quadwords, per-CPU. */ @@ -439,7 +449,9 @@ ENTRY(boot_gdt_table) .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */ #endif - .align L1_CACHE_BYTES + +.align 4096 + ENTRY(cpu_gdt_table) .quad 0x0000000000000000 /* NULL descriptor */ .quad 0x0000000000000000 /* 0x0b reserved */ --- linux/arch/i386/kernel/i386_ksyms.c.orig +++ linux/arch/i386/kernel/i386_ksyms.c @@ -100,7 +100,6 @@ EXPORT_SYMBOL_NOVERS(__down_failed_inter EXPORT_SYMBOL_NOVERS(__down_failed_trylock); EXPORT_SYMBOL_NOVERS(__up_wakeup); /* Networking helper routines. */ -EXPORT_SYMBOL(csum_partial_copy_generic); /* Delay loops */ EXPORT_SYMBOL(__ndelay); EXPORT_SYMBOL(__udelay); @@ -114,13 +113,17 @@ EXPORT_SYMBOL_NOVERS(__get_user_4); EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strstr); +#if !defined(CONFIG_X86_UACCESS_INDIRECT) EXPORT_SYMBOL(strncpy_from_user); -EXPORT_SYMBOL(__strncpy_from_user); +EXPORT_SYMBOL(__direct_strncpy_from_user); EXPORT_SYMBOL(clear_user); EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(__copy_from_user_ll); EXPORT_SYMBOL(__copy_to_user_ll); EXPORT_SYMBOL(strnlen_user); +#else /* CONFIG_X86_UACCESS_INDIRECT */ +EXPORT_SYMBOL(direct_csum_partial_copy_generic); +#endif EXPORT_SYMBOL(dma_alloc_coherent); EXPORT_SYMBOL(dma_free_coherent); --- linux/arch/i386/kernel/init_task.c.orig +++ linux/arch/i386/kernel/init_task.c @@ -23,7 +23,7 @@ struct mm_struct init_mm = INIT_MM(init_ */ union thread_union init_thread_union __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; + { INIT_THREAD_INFO(init_task, init_thread_union) }; /* * Initial task structure. @@ -39,5 +39,5 @@ struct task_struct init_task = INIT_TASK * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -struct tss_struct init_tss[NR_CPUS] __cacheline_aligned = { [0 ... NR_CPUS-1] = INIT_TSS }; +struct tss_struct init_tss[NR_CPUS] __attribute__((__section__(".data.tss"))) = { [0 ... NR_CPUS-1] = INIT_TSS }; --- linux/arch/i386/kernel/ldt.c.orig +++ linux/arch/i386/kernel/ldt.c @@ -2,7 +2,7 @@ * linux/kernel/ldt.c * * Copyright (C) 1992 Krishna Balasubramanian and Linus Torvalds - * Copyright (C) 1999 Ingo Molnar + * Copyright (C) 1999, 2003 Ingo Molnar */ #include @@ -18,6 +18,8 @@ #include #include #include +#include +#include #ifdef CONFIG_SMP /* avoids "defined but not used" warnig */ static void flush_ldt(void *null) @@ -29,57 +31,48 @@ static void flush_ldt(void *null) static int alloc_ldt(mm_context_t *pc, int mincount, int reload) { - void *oldldt; - void *newldt; - int oldsize; + int oldsize, newsize, i; if (mincount <= pc->size) return 0; + /* + * LDT got larger - reallocate if necessary. + */ oldsize = pc->size; mincount = (mincount+511)&(~511); - if (mincount*LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount*LDT_ENTRY_SIZE); - else - newldt = kmalloc(mincount*LDT_ENTRY_SIZE, GFP_KERNEL); - - if (!newldt) - return -ENOMEM; - - if (oldsize) - memcpy(newldt, pc->ldt, oldsize*LDT_ENTRY_SIZE); - oldldt = pc->ldt; - memset(newldt+oldsize*LDT_ENTRY_SIZE, 0, (mincount-oldsize)*LDT_ENTRY_SIZE); - pc->ldt = newldt; - wmb(); + newsize = mincount*LDT_ENTRY_SIZE; + for (i = 0; i < newsize; i += PAGE_SIZE) { + int nr = i/PAGE_SIZE; + BUG_ON(i >= 64*1024); + if (!pc->ldt_pages[nr]) { + pc->ldt_pages[nr] = alloc_page(GFP_HIGHUSER); + if (!pc->ldt_pages[nr]) + return -ENOMEM; + clear_highpage(pc->ldt_pages[nr]); + } + } pc->size = mincount; - wmb(); - if (reload) { -#ifdef CONFIG_SMP - preempt_disable(); load_LDT(pc); - if (current->mm->cpu_vm_mask != (1 << smp_processor_id())) +#ifdef CONFIG_SMP + if (current->mm->cpu_vm_mask != (1< PAGE_SIZE) - vfree(oldldt); - else - kfree(oldldt); - } return 0; } static inline int copy_ldt(mm_context_t *new, mm_context_t *old) { - int err = alloc_ldt(new, old->size, 0); - if (err < 0) + int i, err, size = old->size, nr_pages = (size*LDT_ENTRY_SIZE + PAGE_SIZE-1)/PAGE_SIZE; + + err = alloc_ldt(new, size, 0); + if (err < 0) { + new->size = 0; return err; - memcpy(new->ldt, old->ldt, old->size*LDT_ENTRY_SIZE); + } + for (i = 0; i < nr_pages; i++) + copy_user_highpage(new->ldt_pages[i], old->ldt_pages[i], 0); return 0; } @@ -94,6 +87,7 @@ int init_new_context(struct task_struct init_MUTEX(&mm->context.sem); mm->context.size = 0; + memset(mm->context.ldt_pages, 0, sizeof(struct page *) * MAX_LDT_PAGES); old_mm = current->mm; if (old_mm && old_mm->context.size > 0) { down(&old_mm->context.sem); @@ -105,23 +99,21 @@ int init_new_context(struct task_struct /* * No need to lock the MM as we are the last user + * Do not touch the ldt register, we are already + * in the next thread. */ void destroy_context(struct mm_struct *mm) { - if (mm->context.size) { - if (mm == current->active_mm) - clear_LDT(); - if (mm->context.size*LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(mm->context.ldt); - else - kfree(mm->context.ldt); - mm->context.size = 0; - } + int i, nr_pages = (mm->context.size*LDT_ENTRY_SIZE + PAGE_SIZE-1) / PAGE_SIZE; + + for (i = 0; i < nr_pages; i++) + __free_page(mm->context.ldt_pages[i]); + mm->context.size = 0; } static int read_ldt(void __user * ptr, unsigned long bytecount) { - int err; + int err, i; unsigned long size; struct mm_struct * mm = current->mm; @@ -136,8 +128,25 @@ static int read_ldt(void __user * ptr, u size = bytecount; err = 0; - if (copy_to_user(ptr, mm->context.ldt, size)) - err = -EFAULT; + /* + * This is necessary just in case we got here straight from a + * context-switch where the ptes were set but no tlb flush + * was done yet. We rather avoid doing a TLB flush in the + * context-switch path and do it here instead. + */ + __flush_tlb_global(); + + for (i = 0; i < size; i += PAGE_SIZE) { + int nr = i / PAGE_SIZE, bytes; + char *kaddr = kmap(mm->context.ldt_pages[nr]); + + bytes = size - i; + if (bytes > PAGE_SIZE) + bytes = PAGE_SIZE; + if (copy_to_user(ptr + i, kaddr, size - i)) + err = -EFAULT; + kunmap(mm->context.ldt_pages[nr]); + } up(&mm->context.sem); if (err < 0) return err; @@ -156,7 +165,7 @@ static int read_default_ldt(void __user err = 0; address = &default_ldt[0]; - size = 5*sizeof(struct desc_struct); + size = 5*LDT_ENTRY_SIZE; if (size > bytecount) size = bytecount; @@ -198,7 +207,15 @@ static int write_ldt(void __user * ptr, goto out_unlock; } - lp = (__u32 *) ((ldt_info.entry_number << 3) + (char *) mm->context.ldt); + /* + * No rescheduling allowed from this point to the install. + * + * We do a TLB flush for the same reason as in the read_ldt() path. + */ + preempt_disable(); + __flush_tlb_global(); + lp = (__u32 *) ((ldt_info.entry_number << 3) + + (char *) __kmap_atomic_vaddr(KM_LDT_PAGE0)); /* Allow LDTs to be cleared by the user. */ if (ldt_info.base_addr == 0 && ldt_info.limit == 0) { @@ -219,6 +236,7 @@ install: *lp = entry_1; *(lp+1) = entry_2; error = 0; + preempt_enable(); out_unlock: up(&mm->context.sem); @@ -246,3 +264,26 @@ asmlinkage int sys_modify_ldt(int func, } return ret; } + +/* + * load one particular LDT into the current CPU + */ +void load_LDT_nolock(mm_context_t *pc, int cpu) +{ + struct page **pages = pc->ldt_pages; + int count = pc->size; + int nr_pages, i; + + if (likely(!count)) { + pages = &default_ldt_page; + count = 5; + } + nr_pages = (count*LDT_ENTRY_SIZE + PAGE_SIZE-1) / PAGE_SIZE; + + for (i = 0; i < nr_pages; i++) { + __kunmap_atomic_type(KM_LDT_PAGE0 - i); + __kmap_atomic(pages[i], KM_LDT_PAGE0 - i); + } + set_ldt_desc(cpu, (void *)__kmap_atomic_vaddr(KM_LDT_PAGE0), count); + load_LDT_desc(); +} --- linux/arch/i386/kernel/Makefile.orig +++ linux/arch/i386/kernel/Makefile @@ -7,7 +7,7 @@ extra-y := head.o init_task.o obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \ pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \ - doublefault.o + doublefault.o entry_trampoline.o obj-y += cpu/ obj-y += timers/ --- linux/arch/i386/kernel/mpparse.c.orig +++ linux/arch/i386/kernel/mpparse.c @@ -654,7 +654,7 @@ void __init get_smp_config (void) * Read the physical hardware table. Anything here will * override the defaults. */ - if (!smp_read_mpc((void *)mpf->mpf_physptr)) { + if (!smp_read_mpc((void *)phys_to_virt(mpf->mpf_physptr))) { smp_found_config = 0; printk(KERN_ERR "BIOS bug, MP table errors detected!...\n"); printk(KERN_ERR "... disabling SMP support. (tell your hw vendor)\n"); --- linux/arch/i386/kernel/process.c.orig +++ linux/arch/i386/kernel/process.c @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef CONFIG_MATH_EMULATION #include #endif @@ -262,9 +263,8 @@ void release_thread(struct task_struct * if (dead_task->mm) { // temporary debugging check if (dead_task->mm->context.size) { - printk("WARNING: dead process %8s still has LDT? <%p/%d>\n", + printk("WARNING: dead process %8s still has LDT? <%d>\n", dead_task->comm, - dead_task->mm->context.ldt, dead_task->mm->context.size); BUG(); } @@ -299,7 +299,17 @@ int copy_thread(int nr, unsigned long cl p->thread.esp = (unsigned long) childregs; p->thread.esp0 = (unsigned long) (childregs+1); - p->thread.eip = (unsigned long) ret_from_fork; + /* + * get the two stack pages, for the virtual stack. + * + * IMPORTANT: this code relies on the fact that the task + * structure is an 8K aligned piece of physical memory. + */ + p->thread.stack_page0 = virt_to_page((unsigned long)p->thread_info); + p->thread.stack_page1 = virt_to_page((unsigned long)p->thread_info + PAGE_SIZE); + + p->thread.eip = (unsigned long) __ENTRY_TRAMP_ADDR(ret_from_fork); + p->thread_info->real_stack = p->thread_info; savesegment(fs,p->thread.fs); savesegment(gs,p->thread.gs); @@ -451,10 +461,26 @@ struct task_struct * __switch_to(struct unlazy_fpu(prev_p); +#ifdef CONFIG_X86_HIGH_ENTRY /* - * Reload esp0, LDT and the page table pointer: + * Set the ptes of the virtual stack. (NOTE: a one-page TLB flush is + * needed because otherwise NMIs could interrupt the + * user-return code with a virtual stack and stale TLBs.) */ - load_esp0(tss, next->esp0); + __kunmap_atomic_type(KM_VSTACK0); + __kunmap_atomic_type(KM_VSTACK1); + __kmap_atomic(next->stack_page0, KM_VSTACK0); + __kmap_atomic(next->stack_page1, KM_VSTACK1); + + /* + * Reload esp0: + */ + /* + * NOTE: here we rely on the task being the stack as well + */ + next_p->thread_info->virtual_stack = (void *)__kmap_atomic_vaddr(KM_VSTACK0); +#endif + load_esp0(tss, virtual_esp0(next_p)); /* * Load the per-thread Thread-Local Storage descriptor. --- linux/arch/i386/kernel/signal.c.orig +++ linux/arch/i386/kernel/signal.c @@ -440,7 +440,7 @@ static void setup_rt_frame(int sig, stru /* Create the ucontext. */ err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); - err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + err |= __put_user(current->sas_ss_sp, (unsigned long *)&frame->uc.uc_stack.ss_sp); err |= __put_user(sas_ss_flags(regs->esp), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); --- linux/arch/i386/kernel/smp.c.orig +++ linux/arch/i386/kernel/smp.c @@ -324,10 +324,12 @@ asmlinkage void smp_invalidate_interrupt if (flush_mm == cpu_tlbstate[cpu].active_mm) { if (cpu_tlbstate[cpu].state == TLBSTATE_OK) { +#ifndef CONFIG_X86_SWITCH_PAGETABLES if (flush_va == FLUSH_ALL) local_flush_tlb(); else __flush_tlb_one(flush_va); +#endif } else leave_mm(cpu); } @@ -382,20 +384,6 @@ static void flush_tlb_others (unsigned l spin_unlock(&tlbstate_lock); } -void flush_tlb_current_task(void) -{ - struct mm_struct *mm = current->mm; - unsigned long cpu_mask; - - preempt_disable(); - cpu_mask = mm->cpu_vm_mask & ~(1UL << smp_processor_id()); - - local_flush_tlb(); - if (cpu_mask) - flush_tlb_others(cpu_mask, mm, FLUSH_ALL); - preempt_enable(); -} - void flush_tlb_mm (struct mm_struct * mm) { unsigned long cpu_mask; @@ -425,7 +413,10 @@ void flush_tlb_page(struct vm_area_struc if (current->active_mm == mm) { if(current->mm) - __flush_tlb_one(va); +#ifndef CONFIG_X86_SWITCH_PAGETABLES + __flush_tlb_one(va) +#endif + ; else leave_mm(smp_processor_id()); } --- linux/arch/i386/kernel/traps.c.orig +++ linux/arch/i386/kernel/traps.c @@ -57,8 +57,8 @@ asmlinkage int system_call(void); asmlinkage void lcall7(void); asmlinkage void lcall27(void); -struct desc_struct default_ldt[] = { { 0, 0 }, { 0, 0 }, { 0, 0 }, - { 0, 0 }, { 0, 0 } }; +struct desc_struct default_ldt[] __attribute__((__section__(".data.default_ldt"))) = { { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } }; +struct page *default_ldt_page; /* Do we ignore FPU interrupts ? */ char ignore_fpu_irq = 0; @@ -770,19 +770,51 @@ asmlinkage void math_emulate(long arg) #endif /* CONFIG_MATH_EMULATION */ -#ifdef CONFIG_X86_F00F_BUG -void __init trap_init_f00f_bug(void) +void __init trap_init_virtual_IDT(void) { - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); - /* - * Update the IDT descriptor and reload the IDT so that - * it uses the read-only mapped virtual address. + * "idt" is magic - it overlaps the idt_descr + * variable so that updating idt will automatically + * update the idt descriptor.. */ - idt_descr.address = fix_to_virt(FIX_F00F_IDT); + __set_fixmap(FIX_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + idt_descr.address = __fix_to_virt(FIX_IDT); + __asm__ __volatile__("lidt %0": "=m" (idt_descr)); } + +void __init trap_init_virtual_GDT(void) +{ + int cpu = smp_processor_id(); + struct Xgt_desc_struct *gdt_desc = cpu_gdt_descr + cpu; + struct Xgt_desc_struct tmp_desc = {0, 0}; + struct tss_struct * t; + + __asm__ __volatile__("sgdt %0": "=m" (tmp_desc): :"memory"); + +#ifdef CONFIG_X86_HIGH_ENTRY + if (!cpu) { + __set_fixmap(FIX_GDT_0, __pa(cpu_gdt_table), PAGE_KERNEL); + __set_fixmap(FIX_GDT_1, __pa(cpu_gdt_table) + PAGE_SIZE, PAGE_KERNEL); + __set_fixmap(FIX_TSS_0, __pa(init_tss), PAGE_KERNEL); + __set_fixmap(FIX_TSS_1, __pa(init_tss) + PAGE_SIZE, PAGE_KERNEL); + } + + gdt_desc->address = __fix_to_virt(FIX_GDT_0) + sizeof(cpu_gdt_table[0]) * cpu; +#else + gdt_desc->address = (unsigned long)cpu_gdt_table[cpu]; #endif + __asm__ __volatile__("lgdt %0": "=m" (*gdt_desc)); + +#ifdef CONFIG_X86_HIGH_ENTRY + t = (struct tss_struct *) __fix_to_virt(FIX_TSS_0) + cpu; +#else + t = init_tss + cpu; +#endif + set_tss_desc(cpu, t); + cpu_gdt_table[cpu][GDT_ENTRY_TSS].b &= 0xfffffdff; + load_TR_desc(); +} #define _set_gate(gate_addr,type,dpl,addr,seg) \ do { \ @@ -806,30 +838,27 @@ do { \ */ void set_intr_gate(unsigned int n, void *addr) { + addr = ENTRY_TRAMP_ADDR(addr); _set_gate(idt_table+n,14,0,addr,__KERNEL_CS); } -static void __init set_trap_gate(unsigned int n, void *addr) +void __init set_trap_gate(unsigned int n, void *addr) { + addr = ENTRY_TRAMP_ADDR(addr); _set_gate(idt_table+n,15,0,addr,__KERNEL_CS); } -static void __init set_system_gate(unsigned int n, void *addr) +void __init set_system_gate(unsigned int n, void *addr) { + addr = ENTRY_TRAMP_ADDR(addr); _set_gate(idt_table+n,15,3,addr,__KERNEL_CS); } -static void __init set_call_gate(void *a, void *addr) -{ - _set_gate(a,12,3,addr,__KERNEL_CS); -} - static void __init set_task_gate(unsigned int n, unsigned int gdt_entry) { _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); } - #ifdef CONFIG_EISA int EISA_bus; #endif @@ -845,6 +874,7 @@ void __init trap_init(void) #ifdef CONFIG_X86_LOCAL_APIC init_apic_mappings(); #endif + init_entry_mappings(); set_trap_gate(0,÷_error); set_intr_gate(1,&debug); @@ -872,13 +902,6 @@ void __init trap_init(void) set_system_gate(SYSCALL_VECTOR,&system_call); /* - * default LDT is a single-entry callgate to lcall7 for iBCS - * and a callgate to lcall27 for Solaris/x86 binaries - */ - set_call_gate(&default_ldt[0],lcall7); - set_call_gate(&default_ldt[4],lcall27); - - /* * Should be a barrier for any external CPU state. */ cpu_init(); --- linux/arch/i386/kernel/vm86.c.orig +++ linux/arch/i386/kernel/vm86.c @@ -117,7 +117,7 @@ struct pt_regs * save_v86_state(struct k tss = init_tss + get_cpu(); current->thread.esp0 = current->thread.saved_esp0; - load_esp0(tss, current->thread.esp0); + load_esp0(tss, virtual_esp0(current)); current->thread.saved_esp0 = 0; put_cpu(); @@ -167,6 +167,8 @@ out: +asmlinkage void resume_userspace(void); + static int do_vm86_irq_handling(int subfunction, int irqnumber); static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); @@ -294,7 +296,8 @@ static void do_sys_vm86(struct kernel_vm asm volatile("movl %%gs,%0":"=m" (tsk->thread.saved_gs)); tss = init_tss + get_cpu(); - tss->esp0 = tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; + tsk->thread.esp0 = (unsigned long) &info->VM86_TSS_ESP0; + tss->esp0 = virtual_esp0(tsk); disable_sysenter(tss); put_cpu(); @@ -305,9 +308,9 @@ static void do_sys_vm86(struct kernel_vm "xorl %%eax,%%eax; movl %%eax,%%fs; movl %%eax,%%gs\n\t" "movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" - "jmp resume_userspace" + "pushl %2; ret;" : /* no outputs */ - :"r" (&info->regs), "r" (tsk->thread_info) : "ax"); + :"r" (&info->regs), "r" (tsk->thread_info), "g" (ENTRY_TRAMP_ADDR(resume_userspace)) : "ax"); /* we never return here */ } @@ -319,8 +322,8 @@ static inline void return_to_32bit(struc regs32->eax = retval; __asm__ __volatile__("movl %0,%%esp\n\t" "movl %1,%%ebp\n\t" - "jmp resume_userspace" - : : "r" (regs32), "r" (current_thread_info())); + "pushl %2; ret;" + : : "r" (regs32), "r" (current_thread_info()), "g" (ENTRY_TRAMP_ADDR(resume_userspace)) ); } static inline void set_IF(struct kernel_vm86_regs * regs) --- linux/arch/i386/kernel/doublefault.c.orig +++ linux/arch/i386/kernel/doublefault.c @@ -7,12 +7,13 @@ #include #include #include +#include #define DOUBLEFAULT_STACKSIZE (1024) static unsigned long doublefault_stack[DOUBLEFAULT_STACKSIZE]; #define STACK_START (unsigned long)(doublefault_stack+DOUBLEFAULT_STACKSIZE) -#define ptr_ok(x) ((x) > 0xc0000000 && (x) < 0xc1000000) +#define ptr_ok(x) (((x) > __PAGE_OFFSET && (x) < (__PAGE_OFFSET + 0x01000000)) || ((x) >= FIXADDR_START)) static void doublefault_fn(void) { @@ -38,8 +39,8 @@ static void doublefault_fn(void) printk("eax = %08lx, ebx = %08lx, ecx = %08lx, edx = %08lx\n", t->eax, t->ebx, t->ecx, t->edx); - printk("esi = %08lx, edi = %08lx\n", - t->esi, t->edi); + printk("esi = %08lx, edi = %08lx, ebp = %08lx\n", + t->esi, t->edi, t->ebp); } } --- linux/arch/i386/kernel/vsyscall.lds.orig +++ linux/arch/i386/kernel/vsyscall.lds @@ -5,7 +5,7 @@ */ /* This must match . */ -VSYSCALL_BASE = 0xffffe000; +VSYSCALL_BASE = 0xffffd000; SECTIONS { --- linux/arch/i386/kernel/sysenter.c.orig +++ linux/arch/i386/kernel/sysenter.c @@ -24,13 +24,17 @@ extern asmlinkage void sysenter_entry(vo void enable_sep_cpu(void *info) { int cpu = get_cpu(); +#ifdef CONFIG_X86_HIGH_ENTRY + struct tss_struct *tss = (struct tss_struct *) __fix_to_virt(FIX_TSS_0) + cpu; +#else struct tss_struct *tss = init_tss + cpu; +#endif tss->ss1 = __KERNEL_CS; tss->esp1 = sizeof(struct tss_struct) + (unsigned long) tss; wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); wrmsr(MSR_IA32_SYSENTER_ESP, tss->esp1, 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) sysenter_entry, 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) __ENTRY_TRAMP_ADDR(sysenter_entry), 0); printk("Enabling SEP on CPU %d\n", cpu); put_cpu(); @@ -49,7 +53,8 @@ static int __init sysenter_setup(void) __set_fixmap(FIX_VSYSCALL, __pa(page), PAGE_READONLY); - if (!boot_cpu_has(X86_FEATURE_SEP)) { + // TODO: sysenter entries are broken at the moment. + if (1 || !boot_cpu_has(X86_FEATURE_SEP)) { memcpy((void *) page, &vsyscall_int80_start, &vsyscall_int80_end - &vsyscall_int80_start); --- linux/arch/i386/mm/extable.c.orig +++ linux/arch/i386/mm/extable.c @@ -6,6 +6,52 @@ #include #include #include +#include + +extern struct exception_table_entry __start___ex_table[]; +extern struct exception_table_entry __stop___ex_table[]; + +/* + * The exception table needs to be sorted because we use the macros + * which put things into the exception table in a variety of sections + * as well as the init section and the main kernel text section. + */ +static inline void +sort_ex_table(struct exception_table_entry *start, + struct exception_table_entry *finish) +{ + struct exception_table_entry el, *p, *q; + + /* insertion sort */ + for (p = start + 1; p < finish; ++p) { + /* start .. p-1 is sorted */ + if (p[0].insn < p[-1].insn) { + /* move element p down to its right place */ + el = *p; + q = p; + do { + /* el comes before q[-1], move q[-1] up one */ + q[0] = q[-1]; + --q; + } while (q > start && el.insn < q[-1].insn); + *q = el; + } + } +} + +void fixup_sort_exception_table(void) +{ + struct exception_table_entry *p; + + /* + * Fix up the trampoline exception addresses: + */ + for (p = __start___ex_table; p < __stop___ex_table; p++) { + p->insn = (unsigned long)ENTRY_TRAMP_ADDR((void *)p->insn); + p->fixup = (unsigned long)ENTRY_TRAMP_ADDR((void *)p->fixup); + } + sort_ex_table(__start___ex_table, __stop___ex_table); +} /* Simple binary search */ const struct exception_table_entry * @@ -15,13 +61,15 @@ search_extable(const struct exception_ta { while (first <= last) { const struct exception_table_entry *mid; - long diff; mid = (last - first) / 2 + first; - diff = mid->insn - value; - if (diff == 0) + /* + * careful, the distance between entries can be + * larger than 2GB: + */ + if (mid->insn == value) return mid; - else if (diff < 0) + else if (mid->insn < value) first = mid+1; else last = mid-1; --- linux/arch/i386/mm/fault.c.orig +++ linux/arch/i386/mm/fault.c @@ -79,6 +79,14 @@ asmlinkage void do_page_fault(struct pt_ /* get the address */ __asm__("movl %%cr2,%0":"=r" (address)); +#ifdef CONFIG_X86_SWITCH_PAGETABLES + if (!user_mode(regs)) { + console_verbose(); + printk("invalid kernel-mode pagefault %ld! [addr:%08lx, eip:%08lx]\n", error_code, address, regs->eip); + show_regs(regs); + BUG(); + } +#endif /* It's safe to allow irq's after cr2 has been saved */ if (regs->eflags & X86_EFLAGS_IF) --- linux/arch/i386/mm/init.c.orig +++ linux/arch/i386/mm/init.c @@ -41,125 +41,13 @@ #include #include #include +#include DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; static int do_test_wp_bit(void); -/* - * Creates a middle page table and puts a pointer to it in the - * given global directory entry. This only returns the gd entry - * in non-PAE compilation mode, since the middle layer is folded. - */ -static pmd_t * __init one_md_table_init(pgd_t *pgd) -{ - pmd_t *pmd_table; - -#ifdef CONFIG_X86_PAE - pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - if (pmd_table != pmd_offset(pgd, 0)) - BUG(); -#else - pmd_table = pmd_offset(pgd, 0); -#endif - - return pmd_table; -} - -/* - * Create a page table and place a pointer to it in a middle page - * directory entry. - */ -static pte_t * __init one_page_table_init(pmd_t *pmd) -{ - if (pmd_none(*pmd)) { - pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); - set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); - if (page_table != pte_offset_kernel(pmd, 0)) - BUG(); - - return page_table; - } - - return pte_offset_kernel(pmd, 0); -} - -/* - * This function initializes a certain range of kernel virtual memory - * with new bootmem page tables, everywhere page tables are missing in - * the given range. - */ - -/* - * NOTE: The pagetables are allocated contiguous on the physical space - * so we can cache the place of the first one and move around without - * checking the pgd every time. - */ -static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) -{ - pgd_t *pgd; - pmd_t *pmd; - int pgd_idx, pmd_idx; - unsigned long vaddr; - - vaddr = start; - pgd_idx = pgd_index(vaddr); - pmd_idx = pmd_index(vaddr); - pgd = pgd_base + pgd_idx; - - for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { - if (pgd_none(*pgd)) - one_md_table_init(pgd); - - pmd = pmd_offset(pgd, vaddr); - for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { - if (pmd_none(*pmd)) - one_page_table_init(pmd); - - vaddr += PMD_SIZE; - } - pmd_idx = 0; - } -} - -/* - * This maps the physical memory to kernel virtual address space, a total - * of max_low_pfn pages, by creating page tables starting from address - * PAGE_OFFSET. - */ -static void __init kernel_physical_mapping_init(pgd_t *pgd_base) -{ - unsigned long pfn; - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - int pgd_idx, pmd_idx, pte_ofs; - - pgd_idx = pgd_index(PAGE_OFFSET); - pgd = pgd_base + pgd_idx; - pfn = 0; - - for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) { - pmd = one_md_table_init(pgd); - if (pfn >= max_low_pfn) - continue; - for (pmd_idx = 0; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) { - /* Map with big pages if possible, otherwise create normal page tables. */ - if (cpu_has_pse) { - set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); - pfn += PTRS_PER_PTE; - } else { - pte = one_page_table_init(pmd); - - for (pte_ofs = 0; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) - set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); - } - } - } -} - static inline int page_kills_ppro(unsigned long pagenr) { if (pagenr >= 0x70000 && pagenr <= 0x7003F) @@ -189,38 +77,15 @@ static inline int page_is_ram(unsigned l return 0; } -#ifdef CONFIG_HIGHMEM pte_t *kmap_pte; -pgprot_t kmap_prot; #define kmap_get_fixmap_pte(vaddr) \ pte_offset_kernel(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) void __init kmap_init(void) { - unsigned long kmap_vstart; - /* cache the first kmap pte */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = kmap_get_fixmap_pte(kmap_vstart); - - kmap_prot = PAGE_KERNEL; -} - -void __init permanent_kmaps_init(pgd_t *pgd_base) -{ - pgd_t *pgd; - pmd_t *pmd; - pte_t *pte; - unsigned long vaddr; - - vaddr = PKMAP_BASE; - page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); - - pgd = swapper_pg_dir + pgd_index(vaddr); - pmd = pmd_offset(pgd, vaddr); - pte = pte_offset_kernel(pmd, vaddr); - pkmap_page_table = pte; + kmap_pte = kmap_get_fixmap_pte(__fix_to_virt(FIX_KMAP_BEGIN)); } void __init one_highpage_init(struct page *page, int pfn, int bad_ppro) @@ -247,12 +112,6 @@ void __init set_highmem_pages_init(int b extern void set_highmem_pages_init(int); #endif /* !CONFIG_DISCONTIGMEM */ -#else -#define kmap_init() do { } while (0) -#define permanent_kmaps_init(pgd_base) do { } while (0) -#define set_highmem_pages_init(bad_ppro) do { } while (0) -#endif /* CONFIG_HIGHMEM */ - unsigned long __PAGE_KERNEL = _PAGE_KERNEL; #ifndef CONFIG_DISCONTIGMEM @@ -261,69 +120,191 @@ unsigned long __PAGE_KERNEL = _PAGE_KERN extern void __init remap_numa_kva(void); #endif -static void __init pagetable_init (void) +static __init void prepare_pagetables(pgd_t *pgd_base, unsigned long address) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_base + pgd_index(address); + pmd = pmd_offset(pgd, address); + if (!pmd_present(*pmd)) { + pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); + } +} + +static void __init fixrange_init (unsigned long start, unsigned long end, pgd_t *pgd_base) { unsigned long vaddr; - pgd_t *pgd_base = swapper_pg_dir; + for (vaddr = start; vaddr != end; vaddr += PAGE_SIZE) + prepare_pagetables(pgd_base, vaddr); +} + +static void setup_identity_mappings(pgd_t *pgd_base, unsigned long start, unsigned long end) +{ + unsigned long vaddr; + pgd_t *pgd; + int i, j, k; + pmd_t *pmd; + pte_t *pte, *pte_base; + + pgd = pgd_base; + + for (i = 0; i < PTRS_PER_PGD; pgd++, i++) { + vaddr = i*PGDIR_SIZE; + if (end && (vaddr >= end)) + break; + pmd = pmd_offset(pgd, 0); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++) { + vaddr = i*PGDIR_SIZE + j*PMD_SIZE; + if (end && (vaddr >= end)) + break; + if (vaddr < start) + continue; + if (cpu_has_pse) { + unsigned long __pe; + + set_in_cr4(X86_CR4_PSE); + boot_cpu_data.wp_works_ok = 1; + __pe = _KERNPG_TABLE + _PAGE_PSE + vaddr - start; + /* Make it "global" too if supported */ + if (cpu_has_pge) { + set_in_cr4(X86_CR4_PGE); +#if !defined(CONFIG_X86_SWITCH_PAGETABLES) + __pe += _PAGE_GLOBAL; + __PAGE_KERNEL |= _PAGE_GLOBAL; +#endif + } + set_pmd(pmd, __pmd(__pe)); + continue; + } + + pte_base = pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + for (k = 0; k < PTRS_PER_PTE; pte++, k++) { + vaddr = i*PGDIR_SIZE + j*PMD_SIZE + k*PAGE_SIZE; + if (end && (vaddr >= end)) + break; + if (vaddr < start) + continue; + *pte = mk_pte_phys(vaddr-start, PAGE_KERNEL); + } + set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base))); + } + } +} + +/* + * Clear kernel pagetables in a PMD_SIZE-aligned range. + */ +static void clear_mappings(pgd_t *pgd_base, unsigned long start, unsigned long end) +{ + unsigned long vaddr; + pgd_t *pgd; + pmd_t *pmd; + int i, j; + + pgd = pgd_base; + + for (i = 0; i < PTRS_PER_PGD; pgd++, i++) { + vaddr = i*PGDIR_SIZE; + if (end && (vaddr >= end)) + break; + pmd = pmd_offset(pgd, 0); + for (j = 0; j < PTRS_PER_PMD; pmd++, j++) { + vaddr = i*PGDIR_SIZE + j*PMD_SIZE; + if (end && (vaddr >= end)) + break; + if (vaddr < start) + continue; + pmd_clear(pmd); + } + } + flush_tlb_all(); +} + +static void __init pagetable_init (void) +{ + unsigned long vaddr, end; + pgd_t *pgd_base; #ifdef CONFIG_X86_PAE int i; - /* Init entries of the first-level page table to the zero page */ - for (i = 0; i < PTRS_PER_PGD; i++) - set_pgd(pgd_base + i, __pgd(__pa(empty_zero_page) | _PAGE_PRESENT)); #endif - /* Enable PSE if available */ - if (cpu_has_pse) { - set_in_cr4(X86_CR4_PSE); - } + /* + * This can be zero as well - no problem, in that case we exit + * the loops anyway due to the PTRS_PER_* conditions. + */ + end = (unsigned long)__va(max_low_pfn*PAGE_SIZE); - /* Enable PGE if available */ - if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); - __PAGE_KERNEL |= _PAGE_GLOBAL; + pgd_base = swapper_pg_dir; +#ifdef CONFIG_X86_PAE + /* + * It causes too many problems if there's no proper pmd set up + * for all 4 entries of the PGD - so we allocate all of them. + * PAE systems will not miss this extra 4-8K anyway ... + */ + for (i = 0; i < PTRS_PER_PGD; i++) { + pmd_t *pmd = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); + set_pgd(pgd_base + i, __pgd(__pa(pmd) + 0x1)); } +#endif + /* + * Set up lowmem-sized identity mappings at PAGE_OFFSET: + */ + setup_identity_mappings(pgd_base, PAGE_OFFSET, end); - kernel_physical_mapping_init(pgd_base); - remap_numa_kva(); + /* + * Add flat-mode identity-mappings - SMP needs it when + * starting up on an AP from real-mode. (In the non-PAE + * case we already have these mappings through head.S.) + * All user-space mappings are explicitly cleared after + * SMP startup. + */ +#if CONFIG_SMP && CONFIG_X86_PAE + setup_identity_mappings(pgd_base, 0, 16*1024*1024); +#endif /* * Fixed mappings, only the page table structure has to be * created - mappings will be set by set_fixmap(): */ vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; - page_table_range_init(vaddr, 0, pgd_base); + fixrange_init(vaddr, 0, pgd_base); - permanent_kmaps_init(pgd_base); +#if CONFIG_HIGHMEM + { + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; -#ifdef CONFIG_X86_PAE - /* - * Add low memory identity-mappings - SMP needs it when - * starting up on an AP from real-mode. In the non-PAE - * case we already have these mappings through head.S. - * All user-space mappings are explicitly cleared after - * SMP startup. - */ - pgd_base[0] = pgd_base[USER_PTRS_PER_PGD]; + /* + * Permanent kmaps: + */ + vaddr = PKMAP_BASE; + fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); + + pgd = swapper_pg_dir + pgd_index(vaddr); + pmd = pmd_offset(pgd, vaddr); + pte = pte_offset_kernel(pmd, vaddr); + pkmap_page_table = pte; + } #endif } -void zap_low_mappings (void) +#undef VSYSCALL_BASE +extern char VSYSCALL_BASE; + +void __init zap_low_mappings(void) { - int i; + printk("zapping low mappings.\n"); /* * Zap initial low-memory mappings. - * - * Note that "pgd_clear()" doesn't do it for - * us, because pgd_clear() is a no-op on i386. */ - for (i = 0; i < USER_PTRS_PER_PGD; i++) -#ifdef CONFIG_X86_PAE - set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page))); -#else - set_pgd(swapper_pg_dir+i, __pgd(0)); -#endif - flush_tlb_all(); + clear_mappings(swapper_pg_dir, 0, 16*1024*1024); + + BUG_ON(&VSYSCALL_BASE != (char *)__fix_to_virt(FIX_VSYSCALL)); } #ifndef CONFIG_DISCONTIGMEM @@ -428,6 +409,8 @@ extern void set_max_mapnr_init(void); static struct kcore_list kcore_mem, kcore_vmalloc; +extern void fixup_sort_exception_table(void); + void __init mem_init(void) { extern int ppro_with_ram_bug(void); @@ -435,6 +418,8 @@ void __init mem_init(void) int tmp; int bad_ppro; + fixup_sort_exception_table(); + #ifndef CONFIG_DISCONTIGMEM if (!mem_map) BUG(); @@ -510,6 +495,9 @@ void __init mem_init(void) #ifndef CONFIG_SMP zap_low_mappings(); #endif + entry_trampoline_setup(); + default_ldt_page = virt_to_page(default_ldt); + load_LDT(&init_mm.context); } #ifdef CONFIG_X86_PAE --- linux/arch/i386/mm/pgtable.c.orig +++ linux/arch/i386/mm/pgtable.c @@ -159,16 +159,32 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL); if (pgd) { + unsigned long pmd; +#ifdef CONFIG_X86_4G_VM_LAYOUT + pmd_t *pmd0, *kernel_pmd0; +#endif + for (i = 0; i < USER_PTRS_PER_PGD; i++) { - unsigned long pmd = __get_free_page(GFP_KERNEL); + pmd = __get_free_page(GFP_KERNEL); if (!pmd) goto out_oom; clear_page(pmd); set_pgd(pgd + i, __pgd(1 + __pa(pmd))); } +#ifdef CONFIG_X86_4G_VM_LAYOUT + /* + * In the 4G userspace case alias the top 16 MB virtual + * memory range into the user mappings as well (these + * include the trampoline and CPU data structures). + */ + pmd0 = (pmd_t *)pmd; + kernel_pmd0 = (pmd_t *)pgd_page(swapper_pg_dir[PTRS_PER_PGD-1]); + memcpy(pmd0 + PTRS_PER_PMD - NR_SHARED_PMDS, kernel_pmd0 + PTRS_PER_PMD - NR_SHARED_PMDS, sizeof(pmd_t) * NR_SHARED_PMDS); +#else memcpy(pgd + USER_PTRS_PER_PGD, swapper_pg_dir + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); +#endif } return pgd; out_oom: @@ -194,10 +210,16 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL); if (pgd) { - memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); - memcpy(pgd + USER_PTRS_PER_PGD, - swapper_pg_dir + USER_PTRS_PER_PGD, - (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); +#ifdef CONFIG_X86_4G_VM_LAYOUT + memset(pgd, 0, (PTRS_PER_PGD-2) * sizeof(pgd_t)); + pgd[PTRS_PER_PGD-2] = swapper_pg_dir[PTRS_PER_PGD-2]; + pgd[PTRS_PER_PGD-1] = swapper_pg_dir[PTRS_PER_PGD-1]; +#else + memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t)); + memcpy(pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); +#endif } return pgd; } --- linux/arch/i386/lib/checksum.S.orig +++ linux/arch/i386/lib/checksum.S @@ -280,14 +280,14 @@ unsigned int csum_partial_copy_generic ( .previous .align 4 -.globl csum_partial_copy_generic +.globl direct_csum_partial_copy_generic #ifndef CONFIG_X86_USE_PPRO_CHECKSUM #define ARGBASE 16 #define FP 12 -csum_partial_copy_generic: +direct_csum_partial_copy_generic: subl $4,%esp pushl %edi pushl %esi @@ -422,7 +422,7 @@ DST( movb %cl, (%edi) ) #define ARGBASE 12 -csum_partial_copy_generic: +direct_csum_partial_copy_generic: pushl %ebx pushl %edi pushl %esi --- linux/arch/i386/lib/usercopy.c.orig +++ linux/arch/i386/lib/usercopy.c @@ -76,7 +76,7 @@ do { \ * and returns @count. */ long -__strncpy_from_user(char *dst, const char __user *src, long count) +__direct_strncpy_from_user(char *dst, const char __user *src, long count) { long res; __do_strncpy_from_user(dst, src, count, res); @@ -102,7 +102,7 @@ __strncpy_from_user(char *dst, const cha * and returns @count. */ long -strncpy_from_user(char *dst, const char __user *src, long count) +direct_strncpy_from_user(char *dst, const char __user *src, long count) { long res = -EFAULT; if (access_ok(VERIFY_READ, src, 1)) @@ -147,7 +147,7 @@ do { \ * On success, this will be zero. */ unsigned long -clear_user(void __user *to, unsigned long n) +direct_clear_user(void __user *to, unsigned long n) { if (access_ok(VERIFY_WRITE, to, n)) __do_clear_user(to, n); @@ -166,7 +166,7 @@ clear_user(void __user *to, unsigned lon * On success, this will be zero. */ unsigned long -__clear_user(void __user *to, unsigned long n) +__direct_clear_user(void __user *to, unsigned long n) { __do_clear_user(to, n); return n; @@ -183,7 +183,7 @@ __clear_user(void __user *to, unsigned l * On exception, returns 0. * If the string is too long, returns a value greater than @n. */ -long strnlen_user(const char __user *s, long n) +long direct_strnlen_user(const char __user *s, long n) { unsigned long mask = -__addr_ok(s); unsigned long res, tmp; @@ -550,3 +550,4 @@ unsigned long __copy_from_user_ll(void * n = __copy_user_zeroing_intel(to, (const void *) from, n); return n; } + --- linux/arch/i386/boot/setup.S.orig +++ linux/arch/i386/boot/setup.S @@ -162,7 +162,7 @@ cmd_line_ptr: .long 0 # (Header versio # can be located anywhere in # low memory 0x10000 or higher. -ramdisk_max: .long MAXMEM-1 # (Header version 0x0203 or later) +ramdisk_max: .long __MAXMEM-1 # (Header version 0x0203 or later) # The highest safe address for # the contents of an initrd --- linux/arch/i386/vmlinux.lds.S.orig +++ linux/arch/i386/vmlinux.lds.S @@ -3,6 +3,8 @@ */ #include +#include +#include OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") OUTPUT_ARCH(i386) @@ -10,7 +12,11 @@ ENTRY(startup_32) jiffies = jiffies_64; SECTIONS { - . = 0xC0000000 + 0x100000; +#ifdef CONFIG_X86_4G + . = 0x02000000 + 0x100000; +#else + . = 0xc0000000 + 0x100000; +#endif /* read-only */ _text = .; /* Text and read-only data */ .text : { @@ -51,6 +57,11 @@ SECTIONS . = ALIGN(8192); /* init_task */ .data.init_task : { *(.data.init_task) } + entry_tramp_start = .; + . = ALIGN(4096); /* kernel entry code */ + .entry.text : { *(.entry.text) } + entry_tramp_end = .; + /* will be freed after init */ . = ALIGN(4096); /* Init code and data */ __init_begin = .; @@ -102,7 +113,19 @@ SECTIONS . = ALIGN(4096); __init_end = .; /* freed after init ends here */ - + + . = ALIGN(4096); + .data.page_aligned_tss : { *(.data.tss) } + + . = ALIGN(4096); + .data.page_aligned_default_ldt : { *(.data.default_ldt) } + + . = ALIGN(4096); + .data.page_aligned_idt : { *(.data.idt) } + + . = ALIGN(4096); + .data.page_aligned_gdt : { *(.data.gdt) } + __bss_start = .; /* BSS */ .bss : { *(.bss) } __bss_stop = .; --- linux/arch/i386/Kconfig.orig +++ linux/arch/i386/Kconfig @@ -397,6 +397,43 @@ config X86_OOSTORE depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 default y +config X86_4G + bool "4 GB kernel-space and 4 GB user-space virtual memory support" + help + This option is only useful for systems that have more than 1 GB + of RAM. + + The default kernel VM layout leaves 1 GB of virtual memory for + kernel-space mappings, and 3 GB of VM for user-space applications. + This option ups both the kernel-space VM and the user-space VM to + 4 GB. + + The cost of this option is additional TLB flushes done at + system-entry points that transition from user-mode into kernel-mode. + I.e. system calls and page faults, and IRQs that interrupt user-mode + code. There's also additional overhead to kernel operations that copy + memory to/from user-space. The overhead from this is hard to tell and + depends on the workload - it can be anything from no visible overhead + to 20-30% overhead. A good rule of thumb is to count with a runtime + overhead of 20%. + + The upside is the much increased kernel-space VM, which more than + quadruples the maximum amount of RAM supported. Kernels compiled with + this option boot on 64GB of RAM and still have more than 3.1 GB of + 'lowmem' left. Another bonus is that highmem IO bouncing decreases, + if used with drivers that still use bounce-buffers. + + There's also a 33% increase in user-space VM size - database + applications might see a boost from this. + + But the cost of the TLB flushes and the runtime overhead has to be + weighed against the bonuses offered by the larger VM spaces. The + dividing line depends on the actual workload - there might be 4 GB + systems that benefit from this option. Systems with less than 4 GB + of RAM will rarely see a benefit from this option - but it's not + out of question, the exact circumstances have to be considered. + + config HUGETLB_PAGE bool "Huge TLB Page Support" help --- linux/kernel/ksyms.c.orig +++ linux/kernel/ksyms.c @@ -127,7 +127,6 @@ EXPORT_SYMBOL(blk_congestion_wait); EXPORT_SYMBOL(kmap_high); EXPORT_SYMBOL(kunmap_high); EXPORT_SYMBOL(highmem_start_page); -EXPORT_SYMBOL(kmap_prot); EXPORT_SYMBOL(kmap_pte); #endif #ifdef HASHED_PAGE_VIRTUAL --- linux/mm/Makefile.orig +++ linux/mm/Makefile @@ -12,3 +12,8 @@ obj-y := bootmem.o filemap.o mempool.o slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o + +# TODO: there should be a generic 'indirect uaccess' config flag: + +obj-$(CONFIG_X86_4G) += usercopy.o + --- linux/mm/memory.c.orig +++ linux/mm/memory.c @@ -100,7 +100,8 @@ static inline void free_one_pmd(struct m pte_free_tlb(tlb, page); } -static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir) +static inline void free_one_pgd(struct mmu_gather *tlb, pgd_t * dir, + int pgd_idx) { int j; pmd_t * pmd; @@ -114,8 +115,13 @@ static inline void free_one_pgd(struct m } pmd = pmd_offset(dir, 0); pgd_clear(dir); - for (j = 0; j < PTRS_PER_PMD ; j++) + for (j = 0; j < PTRS_PER_PMD ; j++) { +#ifdef PAGE_OFFSET_USER + if (pgd_idx * PGDIR_SIZE + j * PMD_SIZE >= PAGE_OFFSET_USER) + break; +#endif free_one_pmd(tlb, pmd+j); + } pmd_free_tlb(tlb, pmd); } @@ -128,11 +134,13 @@ static inline void free_one_pgd(struct m void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr) { pgd_t * page_dir = tlb->mm->pgd; + int pgd_idx = first; page_dir += first; do { - free_one_pgd(tlb, page_dir); + free_one_pgd(tlb, page_dir, pgd_idx); page_dir++; + pgd_idx++; } while (--nr); } @@ -430,7 +438,7 @@ zap_pmd_range(struct mmu_gather *tlb, pg unsigned long address, unsigned long size) { pmd_t * pmd; - unsigned long end; + unsigned long end, pgd_boundary; if (pgd_none(*dir)) return; @@ -441,8 +449,9 @@ zap_pmd_range(struct mmu_gather *tlb, pg } pmd = pmd_offset(dir, address); end = address + size; - if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) - end = ((address + PGDIR_SIZE) & PGDIR_MASK); + pgd_boundary = ((address + PGDIR_SIZE) & PGDIR_MASK); + if (pgd_boundary && (end > pgd_boundary)) + end = pgd_boundary; do { zap_pte_range(tlb, pmd, address, end - address); address = (address + PMD_SIZE) & PMD_MASK; --- linux/mm/usercopy.c.orig +++ linux/mm/usercopy.c @@ -0,0 +1,269 @@ +/* + * linux/mm/usercopy.c + * + * (C) Copyright 2003 Ingo Molnar + * + * Generic implementation of all the user-VM access functions, without + * relying on being able to access the VM directly. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Get kernel address of the user page and pin it. + */ +static inline struct page *pin_page(unsigned long addr, int write) +{ + struct mm_struct *mm = current->mm ? : &init_mm; + struct page *page = NULL; + int ret; + + spin_lock(&mm->page_table_lock); + /* + * Do a quick atomic lookup first - this is the fastpath. + */ + page = follow_page(mm, addr, write); + if (likely(page != NULL)) { + if (!PageReserved(page)) + get_page(page); + spin_unlock(&mm->page_table_lock); + return page; + } + + /* + * No luck - bad address or need to fault in the page: + */ + spin_unlock(&mm->page_table_lock); + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, addr, 1, write, 0, &page, NULL); + up_read(&mm->mmap_sem); + if (ret <= 0) + return NULL; + return page; +} + +static inline void unpin_page(struct page *page) +{ + put_page(page); +} + +/* + * Access another process' address space. + * Source/target buffer must be kernel space, + * Do not walk the page table directly, use get_user_pages + */ +static int rw_vm(unsigned long addr, void *buf, int len, int write) +{ + if (!len) + return 0; + + /* ignore errors, just check how much was sucessfully transfered */ + while (len) { + struct page *page = NULL; + int bytes, offset; + void *maddr; + + page = pin_page(addr, write); + if (!page) + break; + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap_atomic(page, KM_USER_COPY); + +#define HANDLE_TYPE(type) \ + case sizeof(type): *(type *)(maddr+offset) = *(type *)(buf); break; + + if (write) { + switch (bytes) { + HANDLE_TYPE(char); + HANDLE_TYPE(int); + HANDLE_TYPE(long long); + default: + memcpy(maddr + offset, buf, bytes); + } + } else { +#undef HANDLE_TYPE +#define HANDLE_TYPE(type) \ + case sizeof(type): *(type *)(buf) = *(type *)(maddr+offset); break; + switch (bytes) { + HANDLE_TYPE(char); + HANDLE_TYPE(int); + HANDLE_TYPE(long long); + default: + memcpy(buf, maddr + offset, bytes); + } +#undef HANDLE_TYPE + } + kunmap_atomic(maddr, KM_USER_COPY); + unpin_page(page); + len -= bytes; + buf += bytes; + addr += bytes; + } + + return len; +} + +static int str_vm(unsigned long addr, void *buf0, int len, int copy) +{ + struct mm_struct *mm = current->mm ? : &init_mm; + struct page *page; + void *buf = buf0; + + if (!len) + return len; + + down_read(&mm->mmap_sem); + /* ignore errors, just check how much was sucessfully transfered */ + while (len) { + int bytes, ret, offset, left, copied; + char *maddr; + + ret = get_user_pages(current, mm, addr, 1, copy == 2, 0, &page, NULL); + if (ret <= 0) { + up_read(&mm->mmap_sem); + return -EFAULT; + } + + bytes = len; + offset = addr & (PAGE_SIZE-1); + if (bytes > PAGE_SIZE-offset) + bytes = PAGE_SIZE-offset; + + maddr = kmap_atomic(page, KM_USER_COPY); + if (copy == 2) { + memset(maddr + offset, 0, bytes); + copied = bytes; + left = 0; + } else if (copy == 1) { + left = strncpy_count(buf, maddr + offset, bytes); + copied = bytes - left; + } else { + copied = strnlen(maddr + offset, bytes); + left = bytes - copied; + } + BUG_ON(bytes < 0 || copied < 0); + kunmap_atomic(maddr, KM_USER_COPY); + page_cache_release(page); + len -= copied; + buf += copied; + addr += copied; + if (left) + break; + } + up_read(&mm->mmap_sem); + + return len; +} + +/* + * Copies memory from userspace (ptr) into kernelspace (val). + * + * returns # of bytes not copied. + */ +int get_user_size(unsigned int size, void *val, const void *ptr) +{ + int ret; + + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + memcpy(val, ptr, size); + return 0; + } + ret = rw_vm((unsigned long)ptr, val, size, 0); + if (ret) + /* + * Zero the rest: + */ + memset(val + size - ret, 0, ret); + return ret; +} + +/* + * Copies memory from kernelspace (val) into userspace (ptr). + * + * returns # of bytes not copied. + */ +int put_user_size(unsigned int size, const void *val, void *ptr) +{ + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + memcpy(ptr, val, size); + return 0; + } + return rw_vm((unsigned long)ptr, (void *)val, size, 1); +} + +int copy_str_fromuser_size(unsigned int size, void *val, const void *ptr) +{ + int copied, left; + + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + left = strncpy_count(val, ptr, size); + copied = size - left; + BUG_ON(copied < 0); + + return copied; + } + left = str_vm((unsigned long)ptr, val, size, 1); + if (left < 0) + return left; + copied = size - left; + BUG_ON(copied < 0); + + return copied; +} + +int strlen_fromuser_size(unsigned int size, const void *ptr) +{ + int copied, left; + + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + copied = strnlen(ptr, size) + 1; + BUG_ON(copied < 0); + + return copied; + } + left = str_vm((unsigned long)ptr, NULL, size, 0); + if (left < 0) + return 0; + copied = size - left + 1; + BUG_ON(copied < 0); + + return copied; +} + +int zero_user_size(unsigned int size, void *ptr) +{ + int left; + + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) { + memset(ptr, 0, size); + return 0; + } + left = str_vm((unsigned long)ptr, NULL, size, 2); + if (left < 0) + return size; + return left; +} + +EXPORT_SYMBOL(get_user_size); +EXPORT_SYMBOL(put_user_size); +EXPORT_SYMBOL(zero_user_size); +EXPORT_SYMBOL(copy_str_fromuser_size); +EXPORT_SYMBOL(strlen_fromuser_size); +