diff -uprN linux-2.6.32/arch/alpha/include/asm/errno.h linux-2.6.32.ovz/arch/alpha/include/asm/errno.h --- linux-2.6.32/arch/alpha/include/asm/errno.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/alpha/include/asm/errno.h 2015-03-10 13:24:05.000000000 -0700 @@ -43,7 +43,7 @@ #define EUSERS 68 /* Too many users */ #define EDQUOT 69 /* Quota exceeded */ -#define ESTALE 70 /* Stale NFS file handle */ +#define ESTALE 70 /* Stale file handle */ #define EREMOTE 71 /* Object is remote */ #define ENOLCK 77 /* No record locks available */ @@ -122,4 +122,6 @@ #define ERFKILL 138 /* Operation not possible due to RF-kill */ +#define EHWPOISON 139 /* Memory page has hardware error */ + #endif diff -uprN linux-2.6.32/arch/alpha/include/asm/mman.h linux-2.6.32.ovz/arch/alpha/include/asm/mman.h --- linux-2.6.32/arch/alpha/include/asm/mman.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/alpha/include/asm/mman.h 2015-06-23 04:16:16.000000000 -0700 @@ -53,6 +53,13 @@ #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ +#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */ + +#define MADV_DONTDUMP 16 /* Explicity exclude from the core dump, + overrides the coredump filter bits */ +#define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ + /* compatibility flags */ #define MAP_FILE 0 diff -uprN linux-2.6.32/arch/alpha/include/asm/mmzone.h linux-2.6.32.ovz/arch/alpha/include/asm/mmzone.h --- linux-2.6.32/arch/alpha/include/asm/mmzone.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/alpha/include/asm/mmzone.h 2015-03-10 13:22:35.000000000 -0700 @@ -56,7 +56,6 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, * Given a kernel address, find the home node of the underlying memory. */ #define kvaddr_to_nid(kaddr) pa_to_nid(__pa(kaddr)) -#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) /* * Given a kaddr, LOCAL_BASE_ADDR finds the owning node of the memory diff -uprN linux-2.6.32/arch/alpha/include/asm/module.h linux-2.6.32.ovz/arch/alpha/include/asm/module.h --- linux-2.6.32/arch/alpha/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/alpha/include/asm/module.h 2015-03-10 13:20:58.000000000 -0700 @@ -6,6 +6,7 @@ struct mod_arch_specific unsigned int gotsecindex; }; +#define MODULES_ARE_ELF64 #define Elf_Sym Elf64_Sym #define Elf_Shdr Elf64_Shdr #define Elf_Ehdr Elf64_Ehdr @@ -13,6 +14,8 @@ struct mod_arch_specific #define Elf_Dyn Elf64_Dyn #define Elf_Rel Elf64_Rel #define Elf_Rela Elf64_Rela +#define ELF_R_TYPE(X) ELF64_R_TYPE(X) +#define ELF_R_SYM(X) ELF64_R_SYM(X) #define ARCH_SHF_SMALL SHF_ALPHA_GPREL diff -uprN linux-2.6.32/arch/alpha/include/asm/socket.h linux-2.6.32.ovz/arch/alpha/include/asm/socket.h --- linux-2.6.32/arch/alpha/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/alpha/include/asm/socket.h 2015-03-10 13:20:58.000000000 -0700 @@ -67,6 +67,8 @@ #define SO_TIMESTAMPING 37 #define SCM_TIMESTAMPING SO_TIMESTAMPING +#define SO_RXQ_OVFL 40 + /* O_NONBLOCK clashes with the bits used for socket types. Therefore we * have to define SOCK_NONBLOCK to a different value here. */ diff -uprN linux-2.6.32/arch/alpha/include/asm/unistd.h linux-2.6.32.ovz/arch/alpha/include/asm/unistd.h --- linux-2.6.32/arch/alpha/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/alpha/include/asm/unistd.h 2015-03-10 13:21:18.000000000 -0700 @@ -433,10 +433,11 @@ #define __NR_signalfd 476 #define __NR_timerfd 477 #define __NR_eventfd 478 +#define __NR_recvmmsg 479 #ifdef __KERNEL__ -#define NR_SYSCALLS 479 +#define NR_SYSCALLS 480 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff -uprN linux-2.6.32/arch/alpha/kernel/osf_sys.c linux-2.6.32.ovz/arch/alpha/kernel/osf_sys.c --- linux-2.6.32/arch/alpha/kernel/osf_sys.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/alpha/kernel/osf_sys.c 2015-03-10 13:23:54.000000000 -0700 @@ -178,25 +178,18 @@ SYSCALL_DEFINE6(osf_mmap, unsigned long, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, off) { - struct file *file = NULL; - unsigned long ret = -EBADF; + unsigned long ret = -EINVAL; #if 0 if (flags & (_MAP_HASSEMAPHORE | _MAP_INHERIT | _MAP_UNALIGNED)) printk("%s: unimplemented OSF mmap flags %04lx\n", current->comm, flags); #endif - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - down_write(¤t->mm->mmap_sem); - ret = do_mmap(file, addr, len, prot, flags, off); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); + if ((off + PAGE_ALIGN(len)) < off) + goto out; + if (off & ~PAGE_MASK) + goto out; + ret = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return ret; } @@ -241,11 +234,11 @@ linux_to_osf_statfs(struct kstatfs *linu } static int -do_osf_statfs(struct dentry * dentry, struct osf_statfs __user *buffer, +do_osf_statfs(struct path *path, struct osf_statfs __user *buffer, unsigned long bufsiz) { struct kstatfs linux_stat; - int error = vfs_statfs(dentry, &linux_stat); + int error = vfs_statfs(path, &linux_stat); if (!error) error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz); return error; @@ -259,7 +252,7 @@ SYSCALL_DEFINE3(osf_statfs, char __user retval = user_path(pathname, &path); if (!retval) { - retval = do_osf_statfs(path.dentry, buffer, bufsiz); + retval = do_osf_statfs(&path, buffer, bufsiz); path_put(&path); } return retval; @@ -274,7 +267,7 @@ SYSCALL_DEFINE3(osf_fstatfs, unsigned lo retval = -EBADF; file = fget(fd); if (file) { - retval = do_osf_statfs(file->f_path.dentry, buffer, bufsiz); + retval = do_osf_statfs(&file->f_path, buffer, bufsiz); fput(file); } return retval; diff -uprN linux-2.6.32/arch/alpha/kernel/pci-sysfs.c linux-2.6.32.ovz/arch/alpha/kernel/pci-sysfs.c --- linux-2.6.32/arch/alpha/kernel/pci-sysfs.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/alpha/kernel/pci-sysfs.c 2015-03-10 13:21:32.000000000 -0700 @@ -52,6 +52,7 @@ static int __pci_mmap_fits(struct pci_de /** * pci_mmap_resource - map a PCI resource into user memory space + * @filp: open sysfs file * @kobj: kobject for mapping * @attr: struct bin_attribute for the file being mapped * @vma: struct vm_area_struct passed into the mmap @@ -59,7 +60,8 @@ static int __pci_mmap_fits(struct pci_de * * Use the bus mapping routines to map a PCI resource into userspace. */ -static int pci_mmap_resource(struct kobject *kobj, struct bin_attribute *attr, +static int pci_mmap_resource(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, struct vm_area_struct *vma, int sparse) { struct pci_dev *pdev = to_pci_dev(container_of(kobj, @@ -88,14 +90,14 @@ static int pci_mmap_resource(struct kobj return hose_mmap_page_range(pdev->sysdata, vma, mmap_type, sparse); } -static int pci_mmap_resource_sparse(struct kobject *kobj, +static int pci_mmap_resource_sparse(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, struct vm_area_struct *vma) { return pci_mmap_resource(kobj, attr, vma, 1); } -static int pci_mmap_resource_dense(struct kobject *kobj, +static int pci_mmap_resource_dense(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, struct vm_area_struct *vma) { diff -uprN linux-2.6.32/arch/alpha/kernel/systbls.S linux-2.6.32.ovz/arch/alpha/kernel/systbls.S --- linux-2.6.32/arch/alpha/kernel/systbls.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/alpha/kernel/systbls.S 2015-03-10 13:21:18.000000000 -0700 @@ -497,6 +497,7 @@ sys_call_table: .quad sys_signalfd .quad sys_ni_syscall .quad sys_eventfd + .quad sys_recvmmsg .size sys_call_table, . - sys_call_table .type sys_call_table, @object diff -uprN linux-2.6.32/arch/arm/include/asm/mman.h linux-2.6.32.ovz/arch/arm/include/asm/mman.h --- linux-2.6.32/arch/arm/include/asm/mman.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/include/asm/mman.h 2015-03-10 13:21:11.000000000 -0700 @@ -1 +1,4 @@ #include + +#define arch_mmap_check(addr, len, flags) \ + (((flags) & MAP_FIXED && (addr) < FIRST_USER_ADDRESS) ? -EINVAL : 0) diff -uprN linux-2.6.32/arch/arm/include/asm/module.h linux-2.6.32.ovz/arch/arm/include/asm/module.h --- linux-2.6.32/arch/arm/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/include/asm/module.h 2015-03-10 13:20:58.000000000 -0700 @@ -1,9 +1,14 @@ #ifndef _ASM_ARM_MODULE_H #define _ASM_ARM_MODULE_H +#define MODULES_ARE_ELF32 #define Elf_Shdr Elf32_Shdr #define Elf_Sym Elf32_Sym #define Elf_Ehdr Elf32_Ehdr +#define Elf_Rel Elf32_Rel +#define Elf_Rela Elf32_Rela +#define ELF_R_TYPE(X) ELF32_R_TYPE(X) +#define ELF_R_SYM(X) ELF32_R_SYM(X) struct unwind_table; diff -uprN linux-2.6.32/arch/arm/include/asm/socket.h linux-2.6.32.ovz/arch/arm/include/asm/socket.h --- linux-2.6.32/arch/arm/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/include/asm/socket.h 2015-03-10 13:20:58.000000000 -0700 @@ -60,4 +60,5 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 #endif /* _ASM_SOCKET_H */ diff -uprN linux-2.6.32/arch/arm/include/asm/unistd.h linux-2.6.32.ovz/arch/arm/include/asm/unistd.h --- linux-2.6.32/arch/arm/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/include/asm/unistd.h 2015-03-10 13:21:18.000000000 -0700 @@ -391,6 +391,7 @@ #define __NR_pwritev (__NR_SYSCALL_BASE+362) #define __NR_rt_tgsigqueueinfo (__NR_SYSCALL_BASE+363) #define __NR_perf_event_open (__NR_SYSCALL_BASE+364) +#define __NR_recvmmsg (__NR_SYSCALL_BASE+365) /* * The following SWIs are ARM private. diff -uprN linux-2.6.32/arch/arm/kernel/calls.S linux-2.6.32.ovz/arch/arm/kernel/calls.S --- linux-2.6.32/arch/arm/kernel/calls.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/kernel/calls.S 2015-03-10 13:21:18.000000000 -0700 @@ -172,7 +172,7 @@ /* 160 */ CALL(sys_sched_get_priority_min) CALL(sys_sched_rr_get_interval) CALL(sys_nanosleep) - CALL(sys_arm_mremap) + CALL(sys_mremap) CALL(sys_setresuid16) /* 165 */ CALL(sys_getresuid16) CALL(sys_ni_syscall) /* vm86 */ @@ -374,6 +374,7 @@ CALL(sys_pwritev) CALL(sys_rt_tgsigqueueinfo) CALL(sys_perf_event_open) +/* 365 */ CALL(sys_recvmmsg) #ifndef syscalls_counted .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls #define syscalls_counted diff -uprN linux-2.6.32/arch/arm/kernel/entry-common.S linux-2.6.32.ovz/arch/arm/kernel/entry-common.S --- linux-2.6.32/arch/arm/kernel/entry-common.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/kernel/entry-common.S 2015-03-10 13:21:11.000000000 -0700 @@ -416,12 +416,12 @@ sys_mmap2: tst r5, #PGOFF_MASK moveq r5, r5, lsr #PAGE_SHIFT - 12 streq r5, [sp, #4] - beq do_mmap2 + beq sys_mmap_pgoff mov r0, #-EINVAL mov pc, lr #else str r5, [sp, #4] - b do_mmap2 + b sys_mmap_pgoff #endif ENDPROC(sys_mmap2) diff -uprN linux-2.6.32/arch/arm/kernel/smp.c linux-2.6.32.ovz/arch/arm/kernel/smp.c --- linux-2.6.32/arch/arm/kernel/smp.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/kernel/smp.c 2015-03-10 13:23:56.000000000 -0700 @@ -563,19 +563,6 @@ int setup_profiling_timer(unsigned int m return -EINVAL; } -static void -on_each_cpu_mask(void (*func)(void *), void *info, int wait, - const struct cpumask *mask) -{ - preempt_disable(); - - smp_call_function_many(mask, func, info, wait); - if (cpumask_test_cpu(smp_processor_id(), mask)) - func(info); - - preempt_enable(); -} - /**********************************************************************/ /* @@ -638,7 +625,7 @@ void flush_tlb_all(void) void flush_tlb_mm(struct mm_struct *mm) { if (tlb_ops_need_broadcast()) - on_each_cpu_mask(ipi_flush_tlb_mm, mm, 1, mm_cpumask(mm)); + on_each_cpu_mask(mm_cpumask(mm), ipi_flush_tlb_mm, mm, 1); else local_flush_tlb_mm(mm); } @@ -649,7 +636,8 @@ void flush_tlb_page(struct vm_area_struc struct tlb_args ta; ta.ta_vma = vma; ta.ta_start = uaddr; - on_each_cpu_mask(ipi_flush_tlb_page, &ta, 1, mm_cpumask(vma->vm_mm)); + on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_page, + &ta, 1); } else local_flush_tlb_page(vma, uaddr); } @@ -672,7 +660,8 @@ void flush_tlb_range(struct vm_area_stru ta.ta_vma = vma; ta.ta_start = start; ta.ta_end = end; - on_each_cpu_mask(ipi_flush_tlb_range, &ta, 1, mm_cpumask(vma->vm_mm)); + on_each_cpu_mask(mm_cpumask(vma->vm_mm), ipi_flush_tlb_range, + &ta, 1); } else local_flush_tlb_range(vma, start, end); } diff -uprN linux-2.6.32/arch/arm/kernel/sys_arm.c linux-2.6.32.ovz/arch/arm/kernel/sys_arm.c --- linux-2.6.32/arch/arm/kernel/sys_arm.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/kernel/sys_arm.c 2015-03-10 13:23:54.000000000 -0700 @@ -28,41 +28,6 @@ #include #include -extern unsigned long do_mremap(unsigned long addr, unsigned long old_len, - unsigned long new_len, unsigned long flags, - unsigned long new_addr); - -/* common code for old and new mmaps */ -inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EINVAL; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - if (flags & MAP_FIXED && addr < FIRST_USER_ADDRESS) - goto out; - - error = -EBADF; - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - struct mmap_arg_struct { unsigned long addr; unsigned long len; @@ -84,29 +49,11 @@ asmlinkage int old_mmap(struct mmap_arg_ if (a.offset & ~PAGE_MASK) goto out; - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); out: return error; } -asmlinkage unsigned long -sys_arm_mremap(unsigned long addr, unsigned long old_len, - unsigned long new_len, unsigned long flags, - unsigned long new_addr) -{ - unsigned long ret = -EINVAL; - - if (flags & MREMAP_FIXED && new_addr < FIRST_USER_ADDRESS) - goto out; - - down_write(¤t->mm->mmap_sem); - ret = do_mremap(addr, old_len, new_len, flags, new_addr); - up_write(¤t->mm->mmap_sem); - -out: - return ret; -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. diff -uprN linux-2.6.32/arch/arm/mach-pxa/em-x270.c linux-2.6.32.ovz/arch/arm/mach-pxa/em-x270.c --- linux-2.6.32/arch/arm/mach-pxa/em-x270.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/mach-pxa/em-x270.c 2015-03-10 13:21:04.000000000 -0700 @@ -497,16 +497,15 @@ static int em_x270_usb_hub_init(void) goto err_free_vbus_gpio; /* USB Hub power-on and reset */ - gpio_direction_output(usb_hub_reset, 0); + gpio_direction_output(usb_hub_reset, 1); + gpio_direction_output(GPIO9_USB_VBUS_EN, 0); regulator_enable(em_x270_usb_ldo); - gpio_set_value(usb_hub_reset, 1); gpio_set_value(usb_hub_reset, 0); + gpio_set_value(usb_hub_reset, 1); regulator_disable(em_x270_usb_ldo); regulator_enable(em_x270_usb_ldo); - gpio_set_value(usb_hub_reset, 1); - - /* enable VBUS */ - gpio_direction_output(GPIO9_USB_VBUS_EN, 1); + gpio_set_value(usb_hub_reset, 0); + gpio_set_value(GPIO9_USB_VBUS_EN, 1); return 0; diff -uprN linux-2.6.32/arch/arm/mm/init.c linux-2.6.32.ovz/arch/arm/mm/init.c --- linux-2.6.32/arch/arm/mm/init.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/mm/init.c 2015-03-10 13:22:29.000000000 -0700 @@ -73,7 +73,7 @@ __tagtable(ATAG_INITRD2, parse_tag_initr */ struct meminfo meminfo; -void show_mem(void) +void show_mem(unsigned int filter) { int free = 0, total = 0, reserved = 0; int shared = 0, cached = 0, slab = 0, node, i; diff -uprN linux-2.6.32/arch/arm/mm/mmap.c linux-2.6.32.ovz/arch/arm/mm/mmap.c --- linux-2.6.32/arch/arm/mm/mmap.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/arm/mm/mmap.c 2015-03-10 13:21:11.000000000 -0700 @@ -54,7 +54,8 @@ arch_get_unmapped_area(struct file *filp * We enforce the MAP_FIXED case. */ if (flags & MAP_FIXED) { - if (aliasing && flags & MAP_SHARED && addr & (SHMLBA - 1)) + if (aliasing && flags & MAP_SHARED && + (addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1)) return -EINVAL; return addr; } diff -uprN linux-2.6.32/arch/avr32/include/asm/socket.h linux-2.6.32.ovz/arch/avr32/include/asm/socket.h --- linux-2.6.32/arch/avr32/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/avr32/include/asm/socket.h 2015-03-10 13:20:58.000000000 -0700 @@ -60,4 +60,5 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 #endif /* __ASM_AVR32_SOCKET_H */ diff -uprN linux-2.6.32/arch/avr32/include/asm/syscalls.h linux-2.6.32.ovz/arch/avr32/include/asm/syscalls.h --- linux-2.6.32/arch/avr32/include/asm/syscalls.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/avr32/include/asm/syscalls.h 2015-03-10 13:21:11.000000000 -0700 @@ -29,10 +29,6 @@ asmlinkage int sys_sigaltstack(const sta struct pt_regs *); asmlinkage int sys_rt_sigreturn(struct pt_regs *); -/* kernel/sys_avr32.c */ -asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, off_t); - /* mm/cache.c */ asmlinkage int sys_cacheflush(int, void __user *, size_t); diff -uprN linux-2.6.32/arch/avr32/kernel/sys_avr32.c linux-2.6.32.ovz/arch/avr32/kernel/sys_avr32.c --- linux-2.6.32/arch/avr32/kernel/sys_avr32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/avr32/kernel/sys_avr32.c 2015-03-10 13:21:11.000000000 -0700 @@ -5,39 +5,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include -#include -#include -#include #include -#include -#include -#include - -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, off_t offset) -{ - int error = -EBADF; - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - return error; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, offset); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); - return error; -} - int kernel_execve(const char *file, char **argv, char **envp) { register long scno asm("r8") = __NR_execve; diff -uprN linux-2.6.32/arch/avr32/kernel/syscall-stubs.S linux-2.6.32.ovz/arch/avr32/kernel/syscall-stubs.S --- linux-2.6.32/arch/avr32/kernel/syscall-stubs.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/avr32/kernel/syscall-stubs.S 2015-03-10 13:21:11.000000000 -0700 @@ -61,7 +61,7 @@ __sys_execve: __sys_mmap2: pushm lr st.w --sp, ARG6 - call sys_mmap2 + call sys_mmap_pgoff sub sp, -4 popm pc diff -uprN linux-2.6.32/arch/avr32/kernel/syscall_table.S linux-2.6.32.ovz/arch/avr32/kernel/syscall_table.S --- linux-2.6.32/arch/avr32/kernel/syscall_table.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/avr32/kernel/syscall_table.S 2015-03-10 13:21:18.000000000 -0700 @@ -295,4 +295,5 @@ sys_call_table: .long sys_signalfd .long sys_ni_syscall /* 280, was sys_timerfd */ .long sys_eventfd + .long sys_recvmmsg .long sys_ni_syscall /* r8 is saturated at nr_syscalls */ diff -uprN linux-2.6.32/arch/blackfin/include/asm/unistd.h linux-2.6.32.ovz/arch/blackfin/include/asm/unistd.h --- linux-2.6.32/arch/blackfin/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/blackfin/include/asm/unistd.h 2015-03-10 13:21:18.000000000 -0700 @@ -388,8 +388,9 @@ #define __NR_pwritev 367 #define __NR_rt_tgsigqueueinfo 368 #define __NR_perf_event_open 369 +#define __NR_recvmmsg 370 -#define __NR_syscall 370 +#define __NR_syscall 371 #define NR_syscalls __NR_syscall /* Old optional stuff no one actually uses */ diff -uprN linux-2.6.32/arch/blackfin/kernel/sys_bfin.c linux-2.6.32.ovz/arch/blackfin/kernel/sys_bfin.c --- linux-2.6.32/arch/blackfin/kernel/sys_bfin.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/blackfin/kernel/sys_bfin.c 2015-03-10 13:21:11.000000000 -0700 @@ -22,39 +22,6 @@ #include #include -/* common code for old and new mmaps */ -static inline long -do_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); - out: - return error; -} - -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - return do_mmap2(addr, len, prot, flags, fd, pgoff); -} - asmlinkage void *sys_sram_alloc(size_t size, unsigned long flags) { return sram_alloc_with_lsl(size, flags); diff -uprN linux-2.6.32/arch/blackfin/mach-common/entry.S linux-2.6.32.ovz/arch/blackfin/mach-common/entry.S --- linux-2.6.32/arch/blackfin/mach-common/entry.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/blackfin/mach-common/entry.S 2015-03-10 13:21:18.000000000 -0700 @@ -1422,7 +1422,7 @@ ENTRY(_sys_call_table) .long _sys_ni_syscall /* streams2 */ .long _sys_vfork /* 190 */ .long _sys_getrlimit - .long _sys_mmap2 + .long _sys_mmap_pgoff .long _sys_truncate64 .long _sys_ftruncate64 .long _sys_stat64 /* 195 */ @@ -1600,6 +1600,7 @@ ENTRY(_sys_call_table) .long _sys_pwritev .long _sys_rt_tgsigqueueinfo .long _sys_perf_event_open + .long _sys_recvmmsg /* 370 */ .rept NR_syscalls-(.-_sys_call_table)/4 .long _sys_ni_syscall diff -uprN linux-2.6.32/arch/cris/include/asm/module.h linux-2.6.32.ovz/arch/cris/include/asm/module.h --- linux-2.6.32/arch/cris/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/cris/include/asm/module.h 2015-03-10 13:20:58.000000000 -0700 @@ -3,7 +3,12 @@ /* cris is simple */ struct mod_arch_specific { }; +#define MODULES_ARE_ELF32 #define Elf_Shdr Elf32_Shdr #define Elf_Sym Elf32_Sym #define Elf_Ehdr Elf32_Ehdr +#define Elf_Rel Elf32_Rel +#define Elf_Rela Elf32_Rela +#define ELF_R_TYPE(X) ELF32_R_TYPE(X) +#define ELF_R_SYM(X) ELF32_R_SYM(X) #endif /* _ASM_CRIS_MODULE_H */ diff -uprN linux-2.6.32/arch/cris/include/asm/socket.h linux-2.6.32.ovz/arch/cris/include/asm/socket.h --- linux-2.6.32/arch/cris/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/cris/include/asm/socket.h 2015-03-10 13:20:58.000000000 -0700 @@ -62,6 +62,7 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 #endif /* _ASM_SOCKET_H */ diff -uprN linux-2.6.32/arch/cris/kernel/sys_cris.c linux-2.6.32.ovz/arch/cris/kernel/sys_cris.c --- linux-2.6.32/arch/cris/kernel/sys_cris.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/cris/kernel/sys_cris.c 2015-03-10 13:21:11.000000000 -0700 @@ -26,31 +26,6 @@ #include #include -/* common code for old and new mmaps */ -static inline long -do_mmap2(unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - asmlinkage unsigned long old_mmap(unsigned long __user *args) { unsigned long buffer[6]; @@ -63,7 +38,7 @@ asmlinkage unsigned long old_mmap(unsign if (buffer[5] & ~PAGE_MASK) /* verify that offset is on page boundary */ goto out; - err = do_mmap2(buffer[0], buffer[1], buffer[2], buffer[3], + err = sys_mmap_pgoff(buffer[0], buffer[1], buffer[2], buffer[3], buffer[4], buffer[5] >> PAGE_SHIFT); out: return err; @@ -73,7 +48,8 @@ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { - return do_mmap2(addr, len, prot, flags, fd, pgoff); + /* bug(?): 8Kb pages here */ + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } /* diff -uprN linux-2.6.32/arch/frv/include/asm/socket.h linux-2.6.32.ovz/arch/frv/include/asm/socket.h --- linux-2.6.32/arch/frv/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/frv/include/asm/socket.h 2015-03-10 13:20:58.000000000 -0700 @@ -60,5 +60,6 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 #endif /* _ASM_SOCKET_H */ diff -uprN linux-2.6.32/arch/frv/kernel/sys_frv.c linux-2.6.32.ovz/arch/frv/kernel/sys_frv.c --- linux-2.6.32/arch/frv/kernel/sys_frv.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/frv/kernel/sys_frv.c 2015-03-10 13:21:11.000000000 -0700 @@ -31,9 +31,6 @@ asmlinkage long sys_mmap2(unsigned long unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { - int error = -EBADF; - struct file * file = NULL; - /* As with sparc32, make sure the shift for mmap2 is constant (12), no matter what PAGE_SIZE we have.... */ @@ -41,69 +38,10 @@ asmlinkage long sys_mmap2(unsigned long trying to map something we can't */ if (pgoff & ((1 << (PAGE_SHIFT - 12)) - 1)) return -EINVAL; - pgoff >>= PAGE_SHIFT - 12; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - -#if 0 /* DAVIDM - do we want this */ -struct mmap_arg_struct64 { - __u32 addr; - __u32 len; - __u32 prot; - __u32 flags; - __u64 offset; /* 64 bits */ - __u32 fd; -}; - -asmlinkage long sys_mmap64(struct mmap_arg_struct64 *arg) -{ - int error = -EFAULT; - struct file * file = NULL; - struct mmap_arg_struct64 a; - unsigned long pgoff; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - - if ((long)a.offset & ~PAGE_MASK) - return -EINVAL; - - pgoff = a.offset >> PAGE_SHIFT; - if ((a.offset >> PAGE_SHIFT) != pgoff) - return -EINVAL; - - if (!(a.flags & MAP_ANONYMOUS)) { - error = -EBADF; - file = fget(a.fd); - if (!file) - goto out; - } - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, pgoff); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); -out: - return error; + return sys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT - 12)); } -#endif /* * sys_ipc() is the de-multiplexer for the SysV IPC calls.. diff -uprN linux-2.6.32/arch/h8300/include/asm/module.h linux-2.6.32.ovz/arch/h8300/include/asm/module.h --- linux-2.6.32/arch/h8300/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/h8300/include/asm/module.h 2015-03-10 13:20:58.000000000 -0700 @@ -4,9 +4,14 @@ * This file contains the H8/300 architecture specific module code. */ struct mod_arch_specific { }; +#define MODULES_ARE_ELF32 #define Elf_Shdr Elf32_Shdr #define Elf_Sym Elf32_Sym #define Elf_Ehdr Elf32_Ehdr +#define Elf_Rel Elf32_Rel +#define Elf_Rela Elf32_Rela +#define ELF_R_TYPE(X) ELF32_R_TYPE(X) +#define ELF_R_SYM(X) ELF32_R_SYM(X) #define MODULE_SYMBOL_PREFIX "_" diff -uprN linux-2.6.32/arch/h8300/include/asm/socket.h linux-2.6.32.ovz/arch/h8300/include/asm/socket.h --- linux-2.6.32/arch/h8300/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/h8300/include/asm/socket.h 2015-03-10 13:20:58.000000000 -0700 @@ -60,4 +60,5 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 #endif /* _ASM_SOCKET_H */ diff -uprN linux-2.6.32/arch/h8300/kernel/syscalls.S linux-2.6.32.ovz/arch/h8300/kernel/syscalls.S --- linux-2.6.32/arch/h8300/kernel/syscalls.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/h8300/kernel/syscalls.S 2015-03-10 13:21:11.000000000 -0700 @@ -206,7 +206,7 @@ SYMBOL_NAME_LABEL(sys_call_table) .long SYMBOL_NAME(sys_ni_syscall) /* streams2 */ .long SYMBOL_NAME(sys_vfork) /* 190 */ .long SYMBOL_NAME(sys_getrlimit) - .long SYMBOL_NAME(sys_mmap2) + .long SYMBOL_NAME(sys_mmap_pgoff) .long SYMBOL_NAME(sys_truncate64) .long SYMBOL_NAME(sys_ftruncate64) .long SYMBOL_NAME(sys_stat64) /* 195 */ diff -uprN linux-2.6.32/arch/h8300/kernel/sys_h8300.c linux-2.6.32.ovz/arch/h8300/kernel/sys_h8300.c --- linux-2.6.32/arch/h8300/kernel/sys_h8300.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/h8300/kernel/sys_h8300.c 2015-03-10 13:21:11.000000000 -0700 @@ -26,39 +26,6 @@ #include #include -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - return do_mmap2(addr, len, prot, flags, fd, pgoff); -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. Linux/m68k cloned Linux/i386, which didn't use to be able to @@ -87,58 +54,12 @@ asmlinkage int old_mmap(struct mmap_arg_ if (a.offset & ~PAGE_MASK) goto out; - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); out: return error; } -#if 0 /* DAVIDM - do we want this */ -struct mmap_arg_struct64 { - __u32 addr; - __u32 len; - __u32 prot; - __u32 flags; - __u64 offset; /* 64 bits */ - __u32 fd; -}; - -asmlinkage long sys_mmap64(struct mmap_arg_struct64 *arg) -{ - int error = -EFAULT; - struct file * file = NULL; - struct mmap_arg_struct64 a; - unsigned long pgoff; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - - if ((long)a.offset & ~PAGE_MASK) - return -EINVAL; - - pgoff = a.offset >> PAGE_SHIFT; - if ((a.offset >> PAGE_SHIFT) != pgoff) - return -EINVAL; - - if (!(a.flags & MAP_ANONYMOUS)) { - error = -EBADF; - file = fget(a.fd); - if (!file) - goto out; - } - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, pgoff); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); -out: - return error; -} -#endif - struct sel_arg_struct { unsigned long n; fd_set *inp, *outp, *exp; diff -uprN linux-2.6.32/arch/ia64/ia32/binfmt_elf32.c linux-2.6.32.ovz/arch/ia64/ia32/binfmt_elf32.c --- linux-2.6.32/arch/ia64/ia32/binfmt_elf32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/ia32/binfmt_elf32.c 2015-03-10 13:21:14.000000000 -0700 @@ -89,6 +89,7 @@ ia64_elf32_init (struct pt_regs *regs) */ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (vma) { + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; vma->vm_start = IA32_GDT_OFFSET; vma->vm_end = vma->vm_start + PAGE_SIZE; @@ -114,6 +115,7 @@ ia64_elf32_init (struct pt_regs *regs) */ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (vma) { + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; vma->vm_start = IA32_GATE_OFFSET; vma->vm_end = vma->vm_start + PAGE_SIZE; @@ -138,6 +140,7 @@ ia64_elf32_init (struct pt_regs *regs) */ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (vma) { + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; vma->vm_start = IA32_LDT_OFFSET; vma->vm_end = vma->vm_start + PAGE_ALIGN(IA32_LDT_ENTRIES*IA32_LDT_ENTRY_SIZE); diff -uprN linux-2.6.32/arch/ia64/ia32/sys_ia32.c linux-2.6.32.ovz/arch/ia64/ia32/sys_ia32.c --- linux-2.6.32/arch/ia64/ia32/sys_ia32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/ia32/sys_ia32.c 2015-03-10 13:23:54.000000000 -0700 @@ -86,7 +86,7 @@ sys32_execve (char __user *name, compat_ struct pt_regs *regs) { long error; - char *filename; + struct filename *filename; unsigned long old_map_base, old_task_size, tssd; filename = getname(name); @@ -104,7 +104,7 @@ sys32_execve (char __user *name, compat_ ia64_set_kr(IA64_KR_IO_BASE, current->thread.old_iob); ia64_set_kr(IA64_KR_TSSD, current->thread.old_k1); - error = compat_do_execve(filename, argv, envp, regs); + error = compat_do_execve(filename->name, argv, envp, regs); putname(filename); if (error < 0) { @@ -858,6 +858,9 @@ ia32_do_mmap (struct file *file, unsigne prot = get_prot32(prot); + if (flags & MAP_HUGETLB) + return -ENOMEM; + #if PAGE_SHIFT > IA32_PAGE_SHIFT mutex_lock(&ia32_mmap_mutex); { diff -uprN linux-2.6.32/arch/ia64/include/asm/acpi.h linux-2.6.32.ovz/arch/ia64/include/asm/acpi.h --- linux-2.6.32/arch/ia64/include/asm/acpi.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/acpi.h 2015-03-10 13:21:25.000000000 -0700 @@ -94,6 +94,7 @@ ia64_acpi_release_global_lock (unsigned #define acpi_noirq 0 /* ACPI always enabled on IA64 */ #define acpi_pci_disabled 0 /* ACPI PCI always enabled on IA64 */ #define acpi_strict 1 /* no ACPI spec workarounds on IA64 */ +#define acpi_ht 0 /* no HT-only mode on IA64 */ #endif #define acpi_processor_cstate_check(x) (x) /* no idle limits on IA64 :) */ static inline void disable_acpi(void) { } diff -uprN linux-2.6.32/arch/ia64/include/asm/cputime.h linux-2.6.32.ovz/arch/ia64/include/asm/cputime.h --- linux-2.6.32/arch/ia64/include/asm/cputime.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/cputime.h 2015-03-10 13:22:21.000000000 -0700 @@ -62,6 +62,12 @@ typedef u64 cputime64_t; #define msecs_to_cputime(__msecs) ((__msecs) * NSEC_PER_MSEC) /* + * Convert cputime <-> microseconds + */ +#define cputime_to_usecs(__ct) ((__ct) / NSEC_PER_USEC) +#define usecs_to_cputime(__usecs) ((__usecs) * NSEC_PER_USEC) + +/* * Convert cputime <-> seconds */ #define cputime_to_secs(__ct) ((__ct) / NSEC_PER_SEC) diff -uprN linux-2.6.32/arch/ia64/include/asm/crash.h linux-2.6.32.ovz/arch/ia64/include/asm/crash.h --- linux-2.6.32/arch/ia64/include/asm/crash.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/crash.h 2015-03-10 13:20:57.000000000 -0700 @@ -0,0 +1,90 @@ +#ifndef _ASM_IA64_CRASH_H +#define _ASM_IA64_CRASH_H + +/* + * linux/include/asm-ia64/crash.h + * + * Copyright (c) 2004 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifdef __KERNEL__ + +#include +#include +#include + +static inline void * +map_virtual(u64 offset, struct page **pp) +{ + struct page *page; + unsigned long pfn; + u32 type; + + if (REGION_NUMBER(offset) == 5) { + char byte; + + if (__get_user(byte, (char *)offset) == 0) + return (void *)offset; + else + return NULL; + } + + switch (type = efi_mem_type(offset)) + { + case EFI_LOADER_CODE: + case EFI_LOADER_DATA: + case EFI_BOOT_SERVICES_CODE: + case EFI_BOOT_SERVICES_DATA: + case EFI_CONVENTIONAL_MEMORY: + break; + + default: + printk(KERN_INFO + "crash memory driver: invalid memory type for %lx: %d\n", + offset, type); + return NULL; + } + + pfn = offset >> PAGE_SHIFT; + + if (!pfn_valid(pfn)) { + printk(KERN_INFO + "crash memory driver: invalid pfn: %lx )\n", pfn); + return NULL; + } + + page = pfn_to_page(pfn); + + if (!page->virtual) { + printk(KERN_INFO + "crash memory driver: offset: %lx page: %lx page->virtual: NULL\n", + offset, (unsigned long)page); + return NULL; + } + + return (page->virtual + (offset & (PAGE_SIZE-1))); +} + +static inline void unmap_virtual(struct page *page) +{ + return; +} + +#endif /* __KERNEL__ */ + +#endif /* _ASM_IA64_CRASH_H */ diff -uprN linux-2.6.32/arch/ia64/include/asm/elf.h linux-2.6.32.ovz/arch/ia64/include/asm/elf.h --- linux-2.6.32/arch/ia64/include/asm/elf.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/elf.h 2015-03-10 13:21:24.000000000 -0700 @@ -218,54 +218,6 @@ do { \ NEW_AUX_ENT(AT_SYSINFO_EHDR, (unsigned long) GATE_EHDR); \ } while (0) - -/* - * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out - * extra segments containing the gate DSO contents. Dumping its - * contents makes post-mortem fully interpretable later without matching up - * the same kernel and hardware config to see what PC values meant. - * Dumping its extra ELF program headers includes all the other information - * a debugger needs to easily find how the gate DSO was being used. - */ -#define ELF_CORE_EXTRA_PHDRS (GATE_EHDR->e_phnum) -#define ELF_CORE_WRITE_EXTRA_PHDRS \ -do { \ - const struct elf_phdr *const gate_phdrs = \ - (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); \ - int i; \ - Elf64_Off ofs = 0; \ - for (i = 0; i < GATE_EHDR->e_phnum; ++i) { \ - struct elf_phdr phdr = gate_phdrs[i]; \ - if (phdr.p_type == PT_LOAD) { \ - phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); \ - phdr.p_filesz = phdr.p_memsz; \ - if (ofs == 0) { \ - ofs = phdr.p_offset = offset; \ - offset += phdr.p_filesz; \ - } \ - else \ - phdr.p_offset = ofs; \ - } \ - else \ - phdr.p_offset += ofs; \ - phdr.p_paddr = 0; /* match other core phdrs */ \ - DUMP_WRITE(&phdr, sizeof(phdr)); \ - } \ -} while (0) -#define ELF_CORE_WRITE_EXTRA_DATA \ -do { \ - const struct elf_phdr *const gate_phdrs = \ - (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); \ - int i; \ - for (i = 0; i < GATE_EHDR->e_phnum; ++i) { \ - if (gate_phdrs[i].p_type == PT_LOAD) { \ - DUMP_WRITE((void *) gate_phdrs[i].p_vaddr, \ - PAGE_ALIGN(gate_phdrs[i].p_memsz)); \ - break; \ - } \ - } \ -} while (0) - /* * format for entries in the Global Offset Table */ diff -uprN linux-2.6.32/arch/ia64/include/asm/io.h linux-2.6.32.ovz/arch/ia64/include/asm/io.h --- linux-2.6.32/arch/ia64/include/asm/io.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/io.h 2015-03-10 13:22:30.000000000 -0700 @@ -424,6 +424,13 @@ __writeq (unsigned long val, volatile vo extern void __iomem * ioremap(unsigned long offset, unsigned long size); extern void __iomem * ioremap_nocache (unsigned long offset, unsigned long size); extern void iounmap (volatile void __iomem *addr); +extern void __iomem * early_ioremap (unsigned long phys_addr, unsigned long size); +extern void early_iounmap (volatile void __iomem *addr, unsigned long size); +static inline void __iomem * ioremap_cache (unsigned long phys_addr, unsigned long size) +{ + return ioremap(phys_addr, size); +} + /* * String version of IO memory access ops: diff -uprN linux-2.6.32/arch/ia64/include/asm/kexec.h linux-2.6.32.ovz/arch/ia64/include/asm/kexec.h --- linux-2.6.32/arch/ia64/include/asm/kexec.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/kexec.h 2015-03-10 13:22:20.000000000 -0700 @@ -1,6 +1,7 @@ #ifndef _ASM_IA64_KEXEC_H #define _ASM_IA64_KEXEC_H +#include /* Maximum physical address we can use pages from */ #define KEXEC_SOURCE_MEMORY_LIMIT (-1UL) @@ -19,6 +20,12 @@ flush_icache_range(page_addr, page_addr + PAGE_SIZE); \ } while(0) +#ifdef CONFIG_KEXEC_AUTO_RESERVE +extern +unsigned long long __init arch_default_crash_size(unsigned long long); +#define arch_default_crash_size arch_default_crash_size +#endif + extern struct kimage *ia64_kimage; extern const unsigned int relocate_new_kernel_size; extern void relocate_new_kernel(unsigned long, unsigned long, diff -uprN linux-2.6.32/arch/ia64/include/asm/kvm.h linux-2.6.32.ovz/arch/ia64/include/asm/kvm.h --- linux-2.6.32/arch/ia64/include/asm/kvm.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/kvm.h 2015-03-10 13:21:08.000000000 -0700 @@ -60,6 +60,7 @@ struct kvm_ioapic_state { #define KVM_IRQCHIP_PIC_MASTER 0 #define KVM_IRQCHIP_PIC_SLAVE 1 #define KVM_IRQCHIP_IOAPIC 2 +#define KVM_NR_IRQCHIPS 3 #define KVM_CONTEXT_SIZE 8*1024 diff -uprN linux-2.6.32/arch/ia64/include/asm/kvm_host.h linux-2.6.32.ovz/arch/ia64/include/asm/kvm_host.h --- linux-2.6.32/arch/ia64/include/asm/kvm_host.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/kvm_host.h 2015-03-10 13:21:08.000000000 -0700 @@ -475,7 +475,6 @@ struct kvm_arch { struct list_head assigned_dev_head; struct iommu_domain *iommu_domain; int iommu_flags; - struct hlist_head irq_ack_notifier_list; unsigned long irq_sources_bitmap; unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; diff -uprN linux-2.6.32/arch/ia64/include/asm/module.h linux-2.6.32.ovz/arch/ia64/include/asm/module.h --- linux-2.6.32/arch/ia64/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/module.h 2015-03-10 13:20:58.000000000 -0700 @@ -29,9 +29,14 @@ struct mod_arch_specific { unsigned int next_got_entry; /* index of next available got entry */ }; +#define MODULES_ARE_ELF64 #define Elf_Shdr Elf64_Shdr #define Elf_Sym Elf64_Sym #define Elf_Ehdr Elf64_Ehdr +#define Elf_Rel Elf64_Rel +#define Elf_Rela Elf64_Rela +#define ELF_R_TYPE(X) ELF64_R_TYPE(X) +#define ELF_R_SYM(X) ELF64_R_SYM(X) #define MODULE_PROC_FAMILY "ia64" #define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY \ diff -uprN linux-2.6.32/arch/ia64/include/asm/numa.h linux-2.6.32.ovz/arch/ia64/include/asm/numa.h --- linux-2.6.32/arch/ia64/include/asm/numa.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/numa.h 2015-03-10 13:21:54.000000000 -0700 @@ -22,8 +22,6 @@ #include -#define NUMA_NO_NODE -1 - extern u16 cpu_to_node_map[NR_CPUS] __cacheline_aligned; extern cpumask_t node_to_cpu_mask[MAX_NUMNODES] __cacheline_aligned; extern pg_data_t *pgdat_list[MAX_NUMNODES]; diff -uprN linux-2.6.32/arch/ia64/include/asm/processor.h linux-2.6.32.ovz/arch/ia64/include/asm/processor.h --- linux-2.6.32/arch/ia64/include/asm/processor.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/processor.h 2015-03-10 13:24:04.000000000 -0700 @@ -361,7 +361,7 @@ struct thread_struct { regs->loadrs = 0; \ regs->r8 = get_dumpable(current->mm); /* set "don't zap registers" flag */ \ regs->r12 = new_sp - 16; /* allocate 16 byte scratch area */ \ - if (unlikely(!get_dumpable(current->mm))) { \ + if (unlikely(get_dumpable(current->mm) != SUID_DUMP_USER)) { \ /* \ * Zap scratch regs to avoid leaking bits between processes with different \ * uid/privileges. \ diff -uprN linux-2.6.32/arch/ia64/include/asm/socket.h linux-2.6.32.ovz/arch/ia64/include/asm/socket.h --- linux-2.6.32/arch/ia64/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/socket.h 2015-03-10 13:24:07.000000000 -0700 @@ -69,4 +69,8 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 + +#define SO_BUSY_POLL 46 + #endif /* _ASM_IA64_SOCKET_H */ diff -uprN linux-2.6.32/arch/ia64/include/asm/unistd.h linux-2.6.32.ovz/arch/ia64/include/asm/unistd.h --- linux-2.6.32/arch/ia64/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/include/asm/unistd.h 2015-03-10 13:21:18.000000000 -0700 @@ -311,11 +311,12 @@ #define __NR_preadv 1319 #define __NR_pwritev 1320 #define __NR_rt_tgsigqueueinfo 1321 +#define __NR_recvmmsg 1322 #ifdef __KERNEL__ -#define NR_syscalls 298 /* length of syscall table */ +#define NR_syscalls 299 /* length of syscall table */ /* * The following defines stop scripts/checksyscalls.sh from complaining about diff -uprN linux-2.6.32/arch/ia64/Kconfig linux-2.6.32.ovz/arch/ia64/Kconfig --- linux-2.6.32/arch/ia64/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/Kconfig 2015-03-10 13:22:20.000000000 -0700 @@ -591,6 +591,20 @@ config KEXEC support. As of this writing the exact hardware interface is strongly in flux, so no good recommendation can be made. +config KEXEC_AUTO_RESERVE + bool "automatically reserve memory for kexec kernel" + depends on KEXEC + default y + ---help--- + Automatically reserve memory for a kexec kernel, so that you don't + need to specify numbers for the "crashkernel=X@Y" boot option, + instead you can use "crashkernel=auto". To make this work, you need + to have more than 4G memory. + + The reserved memory size is different depends on how much memory + you actually have. Please check Documentation/kdump/kdump.txt. + If you doubt, say N. + config CRASH_DUMP bool "kernel crash dumps" depends on IA64_MCA_RECOVERY && !IA64_HP_SIM && (!SMP || HOTPLUG_CPU) diff -uprN linux-2.6.32/arch/ia64/kernel/acpi.c linux-2.6.32.ovz/arch/ia64/kernel/acpi.c --- linux-2.6.32/arch/ia64/kernel/acpi.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/acpi.c 2015-03-10 13:22:27.000000000 -0700 @@ -884,8 +884,8 @@ __init void prefill_possible_map(void) possible = available_cpus + additional_cpus; - if (possible > NR_CPUS) - possible = NR_CPUS; + if (possible > nr_cpu_ids) + possible = nr_cpu_ids; printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", possible, max((possible - available_cpus), 0)); diff -uprN linux-2.6.32/arch/ia64/kernel/elfcore.c linux-2.6.32.ovz/arch/ia64/kernel/elfcore.c --- linux-2.6.32/arch/ia64/kernel/elfcore.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/elfcore.c 2015-03-10 13:21:24.000000000 -0700 @@ -0,0 +1,80 @@ +#include +#include +#include +#include + +#include + + +Elf64_Half elf_core_extra_phdrs(void) +{ + return GATE_EHDR->e_phnum; +} + +int elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, + unsigned long limit) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + Elf64_Off ofs = 0; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + struct elf_phdr phdr = gate_phdrs[i]; + + if (phdr.p_type == PT_LOAD) { + phdr.p_memsz = PAGE_ALIGN(phdr.p_memsz); + phdr.p_filesz = phdr.p_memsz; + if (ofs == 0) { + ofs = phdr.p_offset = offset; + offset += phdr.p_filesz; + } else { + phdr.p_offset = ofs; + } + } else { + phdr.p_offset += ofs; + } + phdr.p_paddr = 0; /* match other core phdrs */ + *size += sizeof(phdr); + if (*size > limit || !dump_write(file, &phdr, sizeof(phdr))) + return 0; + } + return 1; +} + +int elf_core_write_extra_data(struct file *file, size_t *size, + unsigned long limit) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + if (gate_phdrs[i].p_type == PT_LOAD) { + void *addr = (void *)gate_phdrs[i].p_vaddr; + size_t memsz = PAGE_ALIGN(gate_phdrs[i].p_memsz); + + *size += memsz; + if (*size > limit || !dump_write(file, addr, memsz)) + return 0; + break; + } + } + return 1; +} + +size_t elf_core_extra_data_size(void) +{ + const struct elf_phdr *const gate_phdrs = + (const struct elf_phdr *) (GATE_ADDR + GATE_EHDR->e_phoff); + int i; + size_t size = 0; + + for (i = 0; i < GATE_EHDR->e_phnum; ++i) { + if (gate_phdrs[i].p_type == PT_LOAD) { + size += PAGE_ALIGN(gate_phdrs[i].p_memsz); + break; + } + } + return size; +} diff -uprN linux-2.6.32/arch/ia64/kernel/entry.S linux-2.6.32.ovz/arch/ia64/kernel/entry.S --- linux-2.6.32/arch/ia64/kernel/entry.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/entry.S 2015-03-10 13:21:18.000000000 -0700 @@ -1806,6 +1806,7 @@ sys_call_table: data8 sys_preadv data8 sys_pwritev // 1320 data8 sys_rt_tgsigqueueinfo + data8 sys_recvmmsg .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls #endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */ diff -uprN linux-2.6.32/arch/ia64/kernel/ia64_ksyms.c linux-2.6.32.ovz/arch/ia64/kernel/ia64_ksyms.c --- linux-2.6.32/arch/ia64/kernel/ia64_ksyms.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/ia64_ksyms.c 2015-03-10 13:20:57.000000000 -0700 @@ -84,6 +84,9 @@ EXPORT_SYMBOL(ia64_save_scratch_fpregs); #include EXPORT_SYMBOL(unw_init_running); +#include +EXPORT_SYMBOL_GPL(efi_mem_type); + #if defined(CONFIG_IA64_ESI) || defined(CONFIG_IA64_ESI_MODULE) extern void esi_call_phys (void); EXPORT_SYMBOL_GPL(esi_call_phys); diff -uprN linux-2.6.32/arch/ia64/kernel/kprobes.c linux-2.6.32.ovz/arch/ia64/kernel/kprobes.c --- linux-2.6.32/arch/ia64/kernel/kprobes.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/kprobes.c 2015-03-10 13:22:02.000000000 -0700 @@ -870,7 +870,7 @@ static int __kprobes pre_kprobes_handler return 1; ss_probe: -#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER) +#if !defined(CONFIG_PREEMPT) if (p->ainsn.inst_flag == INST_FLAG_BOOSTABLE && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ ia64_psr(regs)->ri = p->ainsn.slot; diff -uprN linux-2.6.32/arch/ia64/kernel/machine_kexec.c linux-2.6.32.ovz/arch/ia64/kernel/machine_kexec.c --- linux-2.6.32/arch/ia64/kernel/machine_kexec.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/machine_kexec.c 2015-03-10 13:22:20.000000000 -0700 @@ -162,6 +162,44 @@ void arch_crash_save_vmcoreinfo(void) #endif } +#ifdef CONFIG_KEXEC_AUTO_RESERVE +#define MBYTES(n) ((n)*1024*1024ULL) +#define GBYTES(n) ((n)*1024*1024*1024ULL) +/* + Memory size Reserved memory + =========== =============== + [4G, 12G) 256M + [12G, 128G) 512M + [128G, 256G) 768M + [256G, 378G) 1024M + [378G, 512G) 1536M + [512G, 768G) 2048M + [768G, ) 3072M + */ +unsigned long long __init arch_default_crash_size(unsigned long long total_size) +{ + unsigned long long ret; + + if (total_size >= GBYTES(4) && total_size < GBYTES(12)) + ret = MBYTES(256); + else if (total_size >= GBYTES(12) && total_size < GBYTES(128)) + ret = MBYTES(512); + else if (total_size >= GBYTES(128) && total_size < GBYTES(256)) + ret = MBYTES(768); + else if (total_size >= GBYTES(256) && total_size < GBYTES(378)) + ret = MBYTES(1024); + else if (total_size >= GBYTES(318) && total_size < GBYTES(512)) + ret = MBYTES(1536); + else if (total_size >= GBYTES(512) && total_size < GBYTES(768)) + ret = MBYTES(2048); + else + ret = MBYTES(3072); + return ret; +} +#undef GBYTES +#undef MBYTES +#endif + unsigned long paddr_vmcoreinfo_note(void) { return ia64_tpa((unsigned long)(char *)&vmcoreinfo_note); diff -uprN linux-2.6.32/arch/ia64/kernel/Makefile linux-2.6.32.ovz/arch/ia64/kernel/Makefile --- linux-2.6.32/arch/ia64/kernel/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/Makefile 2015-03-10 13:21:24.000000000 -0700 @@ -51,6 +51,8 @@ endif obj-$(CONFIG_DMAR) += pci-dma.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o +obj-$(CONFIG_BINFMT_ELF) += elfcore.o + # fp_emulate() expects f2-f5,f16-f31 to contain the user-level state. CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31 diff -uprN linux-2.6.32/arch/ia64/kernel/perfmon.c linux-2.6.32.ovz/arch/ia64/kernel/perfmon.c --- linux-2.6.32/arch/ia64/kernel/perfmon.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/perfmon.c 2015-03-10 13:21:36.000000000 -0700 @@ -2206,7 +2206,7 @@ pfm_alloc_file(pfm_context_t *ctx) { struct file *file; struct inode *inode; - struct dentry *dentry; + struct path path; char name[32]; struct qstr this; @@ -2231,18 +2231,19 @@ pfm_alloc_file(pfm_context_t *ctx) /* * allocate a new dcache entry */ - dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); - if (!dentry) { + path.dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); + if (!path.dentry) { iput(inode); return ERR_PTR(-ENOMEM); } + path.mnt = mntget(pfmfs_mnt); - dentry->d_op = &pfmfs_dentry_operations; - d_add(dentry, inode); + path.dentry->d_op = &pfmfs_dentry_operations; + d_add(path.dentry, inode); - file = alloc_file(pfmfs_mnt, dentry, FMODE_READ, &pfm_file_ops); + file = alloc_file(&path, FMODE_READ, &pfm_file_ops); if (!file) { - dput(dentry); + path_put(&path); return ERR_PTR(-ENFILE); } @@ -2320,6 +2321,7 @@ pfm_smpl_buffer_alloc(struct task_struct DPRINT(("Cannot allocate vma\n")); goto error_kmem; } + INIT_LIST_HEAD(&vma->anon_vma_chain); /* * partially initialize the vma for the sampling buffer diff -uprN linux-2.6.32/arch/ia64/kernel/process.c linux-2.6.32.ovz/arch/ia64/kernel/process.c --- linux-2.6.32/arch/ia64/kernel/process.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/process.c 2015-03-10 13:23:54.000000000 -0700 @@ -662,14 +662,14 @@ long sys_execve (char __user *filename, char __user * __user *argv, char __user * __user *envp, struct pt_regs *regs) { - char *fname; + struct filename *fname; int error; fname = getname(filename); error = PTR_ERR(fname); if (IS_ERR(fname)) goto out; - error = do_execve(fname, argv, envp, regs); + error = do_execve(fname->name, argv, envp, regs); putname(fname); out: return error; diff -uprN linux-2.6.32/arch/ia64/kernel/setup.c linux-2.6.32.ovz/arch/ia64/kernel/setup.c --- linux-2.6.32/arch/ia64/kernel/setup.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/setup.c 2015-06-23 04:16:16.000000000 -0700 @@ -273,7 +273,7 @@ static void __init setup_crashkernel(uns int ret; ret = parse_crashkernel(boot_command_line, total, - &size, &base); + &size, &base, NULL); if (ret == 0 && size > 0) { if (!base) { sort_regions(rsvd_region, *n); diff -uprN linux-2.6.32/arch/ia64/kernel/sys_ia64.c linux-2.6.32.ovz/arch/ia64/kernel/sys_ia64.c --- linux-2.6.32/arch/ia64/kernel/sys_ia64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/sys_ia64.c 2015-03-10 13:21:11.000000000 -0700 @@ -100,51 +100,7 @@ sys_getpagesize (void) asmlinkage unsigned long ia64_brk (unsigned long brk) { - unsigned long rlim, retval, newbrk, oldbrk; - struct mm_struct *mm = current->mm; - - /* - * Most of this replicates the code in sys_brk() except for an additional safety - * check and the clearing of r8. However, we can't call sys_brk() because we need - * to acquire the mmap_sem before we can do the test... - */ - down_write(&mm->mmap_sem); - - if (brk < mm->end_code) - goto out; - newbrk = PAGE_ALIGN(brk); - oldbrk = PAGE_ALIGN(mm->brk); - if (oldbrk == newbrk) - goto set_brk; - - /* Always allow shrinking brk. */ - if (brk <= mm->brk) { - if (!do_munmap(mm, newbrk, oldbrk-newbrk)) - goto set_brk; - goto out; - } - - /* Check against unimplemented/unmapped addresses: */ - if ((newbrk - oldbrk) > RGN_MAP_LIMIT || REGION_OFFSET(newbrk) > RGN_MAP_LIMIT) - goto out; - - /* Check against rlimit.. */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; - if (rlim < RLIM_INFINITY && brk - mm->start_data > rlim) - goto out; - - /* Check against existing mmap mappings. */ - if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE)) - goto out; - - /* Ok, looks good - let it rip. */ - if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk) - goto out; -set_brk: - mm->brk = brk; -out: - retval = mm->brk; - up_write(&mm->mmap_sem); + unsigned long retval = sys_brk(brk); force_successful_syscall_return(); return retval; } @@ -185,39 +141,6 @@ int ia64_mmap_check(unsigned long addr, return 0; } -static inline unsigned long -do_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, unsigned long pgoff) -{ - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - return -EBADF; - - if (!file->f_op || !file->f_op->mmap) { - addr = -ENODEV; - goto out; - } - } - - /* Careful about overflows.. */ - len = PAGE_ALIGN(len); - if (!len || len > TASK_SIZE) { - addr = -EINVAL; - goto out; - } - - down_write(¤t->mm->mmap_sem); - addr = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - -out: if (file) - fput(file); - return addr; -} - /* * mmap2() is like mmap() except that the offset is expressed in units * of PAGE_SIZE (instead of bytes). This allows to mmap2() (pieces @@ -226,7 +149,7 @@ out: if (file) asmlinkage unsigned long sys_mmap2 (unsigned long addr, unsigned long len, int prot, int flags, int fd, long pgoff) { - addr = do_mmap2(addr, len, prot, flags, fd, pgoff); + addr = sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); if (!IS_ERR((void *) addr)) force_successful_syscall_return(); return addr; @@ -238,7 +161,7 @@ sys_mmap (unsigned long addr, unsigned l if (offset_in_page(off) != 0) return -EINVAL; - addr = do_mmap2(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + addr = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); if (!IS_ERR((void *) addr)) force_successful_syscall_return(); return addr; diff -uprN linux-2.6.32/arch/ia64/kernel/topology.c linux-2.6.32.ovz/arch/ia64/kernel/topology.c --- linux-2.6.32/arch/ia64/kernel/topology.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kernel/topology.c 2015-03-10 13:21:34.000000000 -0700 @@ -282,7 +282,7 @@ static ssize_t cache_show(struct kobject return ret; } -static struct sysfs_ops cache_sysfs_ops = { +static const struct sysfs_ops cache_sysfs_ops = { .show = cache_show }; diff -uprN linux-2.6.32/arch/ia64/kvm/Kconfig linux-2.6.32.ovz/arch/ia64/kvm/Kconfig --- linux-2.6.32/arch/ia64/kvm/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kvm/Kconfig 2015-03-10 13:20:59.000000000 -0700 @@ -47,6 +47,7 @@ config KVM_INTEL Provides support for KVM on Itanium 2 processors equipped with the VT extensions. +source drivers/vhost/Kconfig source drivers/virtio/Kconfig endif # VIRTUALIZATION diff -uprN linux-2.6.32/arch/ia64/kvm/kvm-ia64.c linux-2.6.32.ovz/arch/ia64/kvm/kvm-ia64.c --- linux-2.6.32/arch/ia64/kvm/kvm-ia64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/kvm/kvm-ia64.c 2015-03-10 13:23:12.000000000 -0700 @@ -124,7 +124,7 @@ long ia64_pal_vp_create(u64 *vpd, u64 *h static DEFINE_SPINLOCK(vp_lock); -void kvm_arch_hardware_enable(void *garbage) +int kvm_arch_hardware_enable(void *garbage) { long status; long tmp_base; @@ -137,7 +137,7 @@ void kvm_arch_hardware_enable(void *garb slot = ia64_itr_entry(0x3, KVM_VMM_BASE, pte, KVM_VMM_SHIFT); local_irq_restore(saved_psr); if (slot < 0) - return; + return -EINVAL; spin_lock(&vp_lock); status = ia64_pal_vp_init_env(kvm_vsa_base ? @@ -145,7 +145,7 @@ void kvm_arch_hardware_enable(void *garb __pa(kvm_vm_buffer), KVM_VM_BUFFER_BASE, &tmp_base); if (status != 0) { printk(KERN_WARNING"kvm: Failed to Enable VT Support!!!!\n"); - return ; + return -EINVAL; } if (!kvm_vsa_base) { @@ -154,6 +154,8 @@ void kvm_arch_hardware_enable(void *garb } spin_unlock(&vp_lock); ia64_ptr_entry(0x3, slot); + + return 0; } void kvm_arch_hardware_disable(void *garbage) @@ -239,10 +241,10 @@ static int handle_mmio(struct kvm_vcpu * return 0; mmio: if (p->dir) - r = kvm_io_bus_read(&vcpu->kvm->mmio_bus, p->addr, + r = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, p->addr, p->size, &p->data); else - r = kvm_io_bus_write(&vcpu->kvm->mmio_bus, p->addr, + r = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, p->addr, p->size, &p->data); if (r) printk(KERN_ERR"kvm: No iodevice found! addr:%lx\n", p->addr); @@ -634,12 +636,9 @@ static void kvm_vcpu_post_transition(str static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { union context *host_ctx, *guest_ctx; - int r; + int r, idx; - /* - * down_read() may sleep and return with interrupts enabled - */ - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); again: if (signal_pending(current)) { @@ -661,7 +660,7 @@ again: if (r < 0) goto vcpu_run_fail; - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); kvm_guest_enter(); /* @@ -685,7 +684,7 @@ again: kvm_guest_exit(); preempt_enable(); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_handle_exit(kvm_run, vcpu); @@ -695,10 +694,10 @@ again: } out: - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); if (r > 0) { kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); goto again; } @@ -851,8 +850,7 @@ static int kvm_vm_ioctl_get_irqchip(stru r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_IOAPIC: - memcpy(&chip->chip.ioapic, ioapic_irqchip(kvm), - sizeof(struct kvm_ioapic_state)); + r = kvm_get_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; @@ -868,9 +866,7 @@ static int kvm_vm_ioctl_set_irqchip(stru r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_IOAPIC: - memcpy(ioapic_irqchip(kvm), - &chip->chip.ioapic, - sizeof(struct kvm_ioapic_state)); + r = kvm_set_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; @@ -985,10 +981,8 @@ long kvm_arch_vm_ioctl(struct file *filp goto out; if (irqchip_in_kernel(kvm)) { __s32 status; - mutex_lock(&kvm->irq_lock); status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irq_event.irq, irq_event.level); - mutex_unlock(&kvm->irq_lock); if (ioctl == KVM_IRQ_LINE_STATUS) { irq_event.status = status; if (copy_to_user(argp, &irq_event, @@ -1185,6 +1179,11 @@ out: #define PALE_RESET_ENTRY 0x80000000ffffffb0UL +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) +{ + return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct kvm_vcpu *v; @@ -1380,12 +1379,14 @@ static void free_kvm(struct kvm *kvm) static void kvm_release_vm_pages(struct kvm *kvm) { + struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i, j; unsigned long base_gfn; - for (i = 0; i < kvm->nmemslots; i++) { - memslot = &kvm->memslots[i]; + slots = rcu_dereference(kvm->memslots); + for (i = 0; i < slots->nmemslots; i++) { + memslot = &slots->memslots[i]; base_gfn = memslot->base_gfn; for (j = 0; j < memslot->npages; j++) { @@ -1408,6 +1409,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm kfree(kvm->arch.vioapic); kvm_release_vm_pages(kvm); kvm_free_physmem(kvm); + cleanup_srcu_struct(&kvm->srcu); free_kvm(kvm); } @@ -1579,15 +1581,15 @@ out: return r; } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, int user_alloc) { unsigned long i; unsigned long pfn; - int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; + int npages = memslot->npages; unsigned long base_gfn = memslot->base_gfn; if (base_gfn + npages > (KVM_MAX_MEM_SIZE >> PAGE_SHIFT)) @@ -1611,6 +1613,14 @@ int kvm_arch_set_memory_region(struct kv return 0; } +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + return; +} + void kvm_arch_flush_shadow(struct kvm *kvm) { kvm_flush_remote_tlbs(kvm); @@ -1797,7 +1807,8 @@ static int kvm_ia64_sync_dirty_log(struc { struct kvm_memory_slot *memslot; int r, i; - long n, base; + long base; + unsigned long n; unsigned long *dirty_bitmap = (unsigned long *)(kvm->arch.vm_base + offsetof(struct kvm_vm_data, kvm_mem_dirty_log)); @@ -1805,12 +1816,12 @@ static int kvm_ia64_sync_dirty_log(struc if (log->slot >= KVM_MEMORY_SLOTS) goto out; - memslot = &kvm->memslots[log->slot]; + memslot = &kvm->memslots->memslots[log->slot]; r = -ENOENT; if (!memslot->dirty_bitmap) goto out; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + n = kvm_dirty_bitmap_bytes(memslot); base = memslot->base_gfn / BITS_PER_LONG; for (i = 0; i < n/sizeof(long); ++i) { @@ -1826,10 +1837,11 @@ int kvm_vm_ioctl_get_dirty_log(struct kv struct kvm_dirty_log *log) { int r; - int n; + unsigned long n; struct kvm_memory_slot *memslot; int is_dirty = 0; + mutex_lock(&kvm->slots_lock); spin_lock(&kvm->arch.dirty_log_lock); r = kvm_ia64_sync_dirty_log(kvm, log); @@ -1843,12 +1855,13 @@ int kvm_vm_ioctl_get_dirty_log(struct kv /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { kvm_flush_remote_tlbs(kvm); - memslot = &kvm->memslots[log->slot]; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; + memslot = &kvm->memslots->memslots[log->slot]; + n = kvm_dirty_bitmap_bytes(memslot); memset(memslot->dirty_bitmap, 0, n); } r = 0; out: + mutex_unlock(&kvm->slots_lock); spin_unlock(&kvm->arch.dirty_log_lock); return r; } diff -uprN linux-2.6.32/arch/ia64/mm/contig.c linux-2.6.32.ovz/arch/ia64/mm/contig.c --- linux-2.6.32/arch/ia64/mm/contig.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/mm/contig.c 2015-03-10 13:22:29.000000000 -0700 @@ -36,7 +36,7 @@ static unsigned long max_gap; * Shows a simple page count of reserved and used pages in the system. * For discontig machines, it does this on a per-pgdat basis. */ -void show_mem(void) +void show_mem(unsigned int filter) { int i, total_reserved = 0; int total_shared = 0, total_cached = 0; diff -uprN linux-2.6.32/arch/ia64/mm/discontig.c linux-2.6.32.ovz/arch/ia64/mm/discontig.c --- linux-2.6.32/arch/ia64/mm/discontig.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/mm/discontig.c 2015-03-10 13:22:29.000000000 -0700 @@ -514,7 +514,7 @@ void __cpuinit *per_cpu_init(void) * Shows a simple page count of reserved and used pages in the system. * For discontig machines, it does this on a per-pgdat basis. */ -void show_mem(void) +void show_mem(unsigned int filter) { int i, total_reserved = 0; int total_shared = 0, total_cached = 0; diff -uprN linux-2.6.32/arch/ia64/mm/hugetlbpage.c linux-2.6.32.ovz/arch/ia64/mm/hugetlbpage.c --- linux-2.6.32/arch/ia64/mm/hugetlbpage.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/mm/hugetlbpage.c 2015-03-10 13:23:25.000000000 -0700 @@ -26,7 +26,8 @@ unsigned int hpage_shift = HPAGE_SHIFT_D EXPORT_SYMBOL(hpage_shift); pte_t * -huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) +huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz, + bool *shared) { unsigned long taddr = htlbpage_to_page(addr); pgd_t *pgd; diff -uprN linux-2.6.32/arch/ia64/mm/init.c linux-2.6.32.ovz/arch/ia64/mm/init.c --- linux-2.6.32/arch/ia64/mm/init.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/mm/init.c 2015-03-10 13:21:14.000000000 -0700 @@ -118,6 +118,7 @@ ia64_init_addr_space (void) */ vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (vma) { + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; vma->vm_start = current->thread.rbs_bot & PAGE_MASK; vma->vm_end = vma->vm_start + PAGE_SIZE; @@ -136,6 +137,7 @@ ia64_init_addr_space (void) if (!(current->personality & MMAP_PAGE_ZERO)) { vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (vma) { + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = current->mm; vma->vm_end = PAGE_SIZE; vma->vm_page_prot = __pgprot(pgprot_val(PAGE_READONLY) | _PAGE_MA_NAT); diff -uprN linux-2.6.32/arch/ia64/mm/ioremap.c linux-2.6.32.ovz/arch/ia64/mm/ioremap.c --- linux-2.6.32/arch/ia64/mm/ioremap.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/mm/ioremap.c 2015-03-10 13:21:05.000000000 -0700 @@ -22,6 +22,12 @@ __ioremap (unsigned long phys_addr) } void __iomem * +early_ioremap (unsigned long phys_addr, unsigned long size) +{ + return __ioremap(phys_addr); +} + +void __iomem * ioremap (unsigned long phys_addr, unsigned long size) { void __iomem *addr; @@ -102,6 +108,11 @@ ioremap_nocache (unsigned long phys_addr EXPORT_SYMBOL(ioremap_nocache); void +early_iounmap (volatile void __iomem *addr, unsigned long size) +{ +} + +void iounmap (volatile void __iomem *addr) { if (REGION_NUMBER(addr) == RGN_GATE) diff -uprN linux-2.6.32/arch/ia64/pci/pci.c linux-2.6.32.ovz/arch/ia64/pci/pci.c --- linux-2.6.32/arch/ia64/pci/pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/pci/pci.c 2015-03-10 13:22:50.000000000 -0700 @@ -438,13 +438,12 @@ EXPORT_SYMBOL(pcibios_bus_to_resource); static int __devinit is_valid_resource(struct pci_dev *dev, int idx) { unsigned int i, type_mask = IORESOURCE_IO | IORESOURCE_MEM; - struct resource *devr = &dev->resource[idx]; + struct resource *devr = &dev->resource[idx], *busr; if (!dev->bus) return 0; - for (i=0; ibus->resource[i]; + pci_bus_for_each_resource(dev->bus, busr, i) { if (!busr || ((busr->flags ^ devr->flags) & type_mask)) continue; if ((devr->start) && (devr->start >= busr->start) && diff -uprN linux-2.6.32/arch/ia64/scripts/unwcheck.py linux-2.6.32.ovz/arch/ia64/scripts/unwcheck.py --- linux-2.6.32/arch/ia64/scripts/unwcheck.py 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/scripts/unwcheck.py 2015-03-10 13:21:32.000000000 -0700 @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/python # # Usage: unwcheck.py FILE # diff -uprN linux-2.6.32/arch/ia64/xen/irq_xen.c linux-2.6.32.ovz/arch/ia64/xen/irq_xen.c --- linux-2.6.32/arch/ia64/xen/irq_xen.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/xen/irq_xen.c 2015-03-10 13:24:14.000000000 -0700 @@ -63,19 +63,19 @@ xen_free_irq_vector(int vector) } -static DEFINE_PER_CPU(int, timer_irq) = -1; -static DEFINE_PER_CPU(int, ipi_irq) = -1; -static DEFINE_PER_CPU(int, resched_irq) = -1; -static DEFINE_PER_CPU(int, cmc_irq) = -1; -static DEFINE_PER_CPU(int, cmcp_irq) = -1; -static DEFINE_PER_CPU(int, cpep_irq) = -1; +static DEFINE_PER_CPU(int, xen_timer_irq) = -1; +static DEFINE_PER_CPU(int, xen_ipi_irq) = -1; +static DEFINE_PER_CPU(int, xen_resched_irq) = -1; +static DEFINE_PER_CPU(int, xen_cmc_irq) = -1; +static DEFINE_PER_CPU(int, xen_cmcp_irq) = -1; +static DEFINE_PER_CPU(int, xen_cpep_irq) = -1; #define NAME_SIZE 15 -static DEFINE_PER_CPU(char[NAME_SIZE], timer_name); -static DEFINE_PER_CPU(char[NAME_SIZE], ipi_name); -static DEFINE_PER_CPU(char[NAME_SIZE], resched_name); -static DEFINE_PER_CPU(char[NAME_SIZE], cmc_name); -static DEFINE_PER_CPU(char[NAME_SIZE], cmcp_name); -static DEFINE_PER_CPU(char[NAME_SIZE], cpep_name); +static DEFINE_PER_CPU(char[NAME_SIZE], xen_timer_name); +static DEFINE_PER_CPU(char[NAME_SIZE], xen_ipi_name); +static DEFINE_PER_CPU(char[NAME_SIZE], xen_resched_name); +static DEFINE_PER_CPU(char[NAME_SIZE], xen_cmc_name); +static DEFINE_PER_CPU(char[NAME_SIZE], xen_cmcp_name); +static DEFINE_PER_CPU(char[NAME_SIZE], xen_cpep_name); #undef NAME_SIZE struct saved_irq { @@ -144,64 +144,64 @@ __xen_register_percpu_irq(unsigned int c if (xen_slab_ready) { switch (vec) { case IA64_TIMER_VECTOR: - snprintf(per_cpu(timer_name, cpu), - sizeof(per_cpu(timer_name, cpu)), + snprintf(per_cpu(xen_timer_name, cpu), + sizeof(per_cpu(xen_timer_name, cpu)), "%s%d", action->name, cpu); irq = bind_virq_to_irqhandler(VIRQ_ITC, cpu, action->handler, action->flags, - per_cpu(timer_name, cpu), action->dev_id); - per_cpu(timer_irq, cpu) = irq; + per_cpu(xen_timer_name, cpu), action->dev_id); + per_cpu(xen_timer_irq, cpu) = irq; break; case IA64_IPI_RESCHEDULE: - snprintf(per_cpu(resched_name, cpu), - sizeof(per_cpu(resched_name, cpu)), + snprintf(per_cpu(xen_resched_name, cpu), + sizeof(per_cpu(xen_resched_name, cpu)), "%s%d", action->name, cpu); irq = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR, cpu, action->handler, action->flags, - per_cpu(resched_name, cpu), action->dev_id); - per_cpu(resched_irq, cpu) = irq; + per_cpu(xen_resched_name, cpu), action->dev_id); + per_cpu(xen_resched_irq, cpu) = irq; break; case IA64_IPI_VECTOR: - snprintf(per_cpu(ipi_name, cpu), - sizeof(per_cpu(ipi_name, cpu)), + snprintf(per_cpu(xen_ipi_name, cpu), + sizeof(per_cpu(xen_ipi_name, cpu)), "%s%d", action->name, cpu); irq = bind_ipi_to_irqhandler(XEN_IPI_VECTOR, cpu, action->handler, action->flags, - per_cpu(ipi_name, cpu), action->dev_id); - per_cpu(ipi_irq, cpu) = irq; + per_cpu(xen_ipi_name, cpu), action->dev_id); + per_cpu(xen_ipi_irq, cpu) = irq; break; case IA64_CMC_VECTOR: - snprintf(per_cpu(cmc_name, cpu), - sizeof(per_cpu(cmc_name, cpu)), + snprintf(per_cpu(xen_cmc_name, cpu), + sizeof(per_cpu(xen_cmc_name, cpu)), "%s%d", action->name, cpu); irq = bind_virq_to_irqhandler(VIRQ_MCA_CMC, cpu, - action->handler, - action->flags, - per_cpu(cmc_name, cpu), - action->dev_id); - per_cpu(cmc_irq, cpu) = irq; + action->handler, + action->flags, + per_cpu(xen_cmc_name, cpu), + action->dev_id); + per_cpu(xen_cmc_irq, cpu) = irq; break; case IA64_CMCP_VECTOR: - snprintf(per_cpu(cmcp_name, cpu), - sizeof(per_cpu(cmcp_name, cpu)), + snprintf(per_cpu(xen_cmcp_name, cpu), + sizeof(per_cpu(xen_cmcp_name, cpu)), "%s%d", action->name, cpu); irq = bind_ipi_to_irqhandler(XEN_CMCP_VECTOR, cpu, - action->handler, - action->flags, - per_cpu(cmcp_name, cpu), - action->dev_id); - per_cpu(cmcp_irq, cpu) = irq; + action->handler, + action->flags, + per_cpu(xen_cmcp_name, cpu), + action->dev_id); + per_cpu(xen_cmcp_irq, cpu) = irq; break; case IA64_CPEP_VECTOR: - snprintf(per_cpu(cpep_name, cpu), - sizeof(per_cpu(cpep_name, cpu)), + snprintf(per_cpu(xen_cpep_name, cpu), + sizeof(per_cpu(xen_cpep_name, cpu)), "%s%d", action->name, cpu); irq = bind_ipi_to_irqhandler(XEN_CPEP_VECTOR, cpu, - action->handler, - action->flags, - per_cpu(cpep_name, cpu), - action->dev_id); - per_cpu(cpep_irq, cpu) = irq; + action->handler, + action->flags, + per_cpu(xen_cpep_name, cpu), + action->dev_id); + per_cpu(xen_cpep_irq, cpu) = irq; break; case IA64_CPE_VECTOR: case IA64_MCA_RENDEZ_VECTOR: @@ -275,30 +275,33 @@ unbind_evtchn_callback(struct notifier_b if (action == CPU_DEAD) { /* Unregister evtchn. */ - if (per_cpu(cpep_irq, cpu) >= 0) { - unbind_from_irqhandler(per_cpu(cpep_irq, cpu), NULL); - per_cpu(cpep_irq, cpu) = -1; + if (per_cpu(xen_cpep_irq, cpu) >= 0) { + unbind_from_irqhandler(per_cpu(xen_cpep_irq, cpu), + NULL); + per_cpu(xen_cpep_irq, cpu) = -1; } - if (per_cpu(cmcp_irq, cpu) >= 0) { - unbind_from_irqhandler(per_cpu(cmcp_irq, cpu), NULL); - per_cpu(cmcp_irq, cpu) = -1; + if (per_cpu(xen_cmcp_irq, cpu) >= 0) { + unbind_from_irqhandler(per_cpu(xen_cmcp_irq, cpu), + NULL); + per_cpu(xen_cmcp_irq, cpu) = -1; } - if (per_cpu(cmc_irq, cpu) >= 0) { - unbind_from_irqhandler(per_cpu(cmc_irq, cpu), NULL); - per_cpu(cmc_irq, cpu) = -1; + if (per_cpu(xen_cmc_irq, cpu) >= 0) { + unbind_from_irqhandler(per_cpu(xen_cmc_irq, cpu), NULL); + per_cpu(xen_cmc_irq, cpu) = -1; } - if (per_cpu(ipi_irq, cpu) >= 0) { - unbind_from_irqhandler(per_cpu(ipi_irq, cpu), NULL); - per_cpu(ipi_irq, cpu) = -1; + if (per_cpu(xen_ipi_irq, cpu) >= 0) { + unbind_from_irqhandler(per_cpu(xen_ipi_irq, cpu), NULL); + per_cpu(xen_ipi_irq, cpu) = -1; } - if (per_cpu(resched_irq, cpu) >= 0) { - unbind_from_irqhandler(per_cpu(resched_irq, cpu), - NULL); - per_cpu(resched_irq, cpu) = -1; + if (per_cpu(xen_resched_irq, cpu) >= 0) { + unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), + NULL); + per_cpu(xen_resched_irq, cpu) = -1; } - if (per_cpu(timer_irq, cpu) >= 0) { - unbind_from_irqhandler(per_cpu(timer_irq, cpu), NULL); - per_cpu(timer_irq, cpu) = -1; + if (per_cpu(xen_timer_irq, cpu) >= 0) { + unbind_from_irqhandler(per_cpu(xen_timer_irq, cpu), + NULL); + per_cpu(xen_timer_irq, cpu) = -1; } } return NOTIFY_OK; diff -uprN linux-2.6.32/arch/ia64/xen/time.c linux-2.6.32.ovz/arch/ia64/xen/time.c --- linux-2.6.32/arch/ia64/xen/time.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/ia64/xen/time.c 2015-03-10 13:24:14.000000000 -0700 @@ -34,15 +34,15 @@ #include "../kernel/fsyscall_gtod_data.h" -DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); -DEFINE_PER_CPU(unsigned long, processed_stolen_time); -DEFINE_PER_CPU(unsigned long, processed_blocked_time); +static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); +static DEFINE_PER_CPU(unsigned long, xen_stolen_time); +static DEFINE_PER_CPU(unsigned long, xen_blocked_time); /* taken from i386/kernel/time-xen.c */ static void xen_init_missing_ticks_accounting(int cpu) { struct vcpu_register_runstate_memory_area area; - struct vcpu_runstate_info *runstate = &per_cpu(runstate, cpu); + struct vcpu_runstate_info *runstate = &per_cpu(xen_runstate, cpu); int rc; memset(runstate, 0, sizeof(*runstate)); @@ -52,8 +52,8 @@ static void xen_init_missing_ticks_accou &area); WARN_ON(rc && rc != -ENOSYS); - per_cpu(processed_blocked_time, cpu) = runstate->time[RUNSTATE_blocked]; - per_cpu(processed_stolen_time, cpu) = runstate->time[RUNSTATE_runnable] + per_cpu(xen_blocked_time, cpu) = runstate->time[RUNSTATE_blocked]; + per_cpu(xen_stolen_time, cpu) = runstate->time[RUNSTATE_runnable] + runstate->time[RUNSTATE_offline]; } @@ -68,7 +68,7 @@ static void get_runstate_snapshot(struct BUG_ON(preemptible()); - state = &__get_cpu_var(runstate); + state = &__get_cpu_var(xen_runstate); /* * The runstate info is always updated by the hypervisor on @@ -103,12 +103,12 @@ consider_steal_time(unsigned long new_it * This function just checks and reject this effect. */ if (!time_after_eq(runstate.time[RUNSTATE_blocked], - per_cpu(processed_blocked_time, cpu))) + per_cpu(xen_blocked_time, cpu))) blocked = 0; if (!time_after_eq(runstate.time[RUNSTATE_runnable] + runstate.time[RUNSTATE_offline], - per_cpu(processed_stolen_time, cpu))) + per_cpu(xen_stolen_time, cpu))) stolen = 0; if (!time_after(delta_itm + new_itm, ia64_get_itc())) @@ -147,8 +147,8 @@ consider_steal_time(unsigned long new_it } else { local_cpu_data->itm_next = delta_itm + new_itm; } - per_cpu(processed_stolen_time, cpu) += NS_PER_TICK * stolen; - per_cpu(processed_blocked_time, cpu) += NS_PER_TICK * blocked; + per_cpu(xen_stolen_time, cpu) += NS_PER_TICK * stolen; + per_cpu(xen_blocked_time, cpu) += NS_PER_TICK * blocked; } return delta_itm; } diff -uprN linux-2.6.32/arch/Kconfig linux-2.6.32.ovz/arch/Kconfig --- linux-2.6.32/arch/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/Kconfig 2015-03-10 13:24:07.000000000 -0700 @@ -6,8 +6,6 @@ config OPROFILE tristate "OProfile system profiling (EXPERIMENTAL)" depends on PROFILING depends on HAVE_OPROFILE - depends on TRACING_SUPPORT - select TRACING select RING_BUFFER select RING_BUFFER_ALLOW_SWAP help @@ -17,20 +15,6 @@ config OPROFILE If unsure, say N. -config OPROFILE_IBS - bool "OProfile AMD IBS support (EXPERIMENTAL)" - default n - depends on OPROFILE && SMP && X86 - help - Instruction-Based Sampling (IBS) is a new profiling - technique that provides rich, precise program performance - information. IBS is introduced by AMD Family10h processors - (AMD Opteron Quad-Core processor "Barcelona") to overcome - the limitations of conventional performance counter - sampling. - - If unsure, say N. - config OPROFILE_EVENT_MULTIPLEX bool "OProfile multiplexing support (EXPERIMENTAL)" default n @@ -57,6 +41,17 @@ config KPROBES for kernel debugging, non-intrusive instrumentation and testing. If in doubt, say "N". +config OPTPROBES + bool "Kprobes jump optimization support (EXPERIMENTAL)" + default y + depends on KPROBES + depends on !PREEMPT + depends on HAVE_OPTPROBES + select KALLSYMS_ALL + help + This option will allow kprobes to optimize breakpoint to + a jump for reducing its overhead. + config HAVE_EFFICIENT_UNALIGNED_ACCESS bool help @@ -83,6 +78,13 @@ config KRETPROBES def_bool y depends on KPROBES && HAVE_KRETPROBES +config USER_RETURN_NOTIFIER + bool + depends on HAVE_USER_RETURN_NOTIFIER + help + Provide a kernel-internal notification when a cpu is about to + switch to user mode. + config HAVE_IOREMAP_PROT bool @@ -92,6 +94,8 @@ config HAVE_KPROBES config HAVE_KRETPROBES bool +config HAVE_OPTPROBES + bool # # An arch should select this if it provides all these things: # @@ -126,4 +130,33 @@ config HAVE_DMA_API_DEBUG config HAVE_DEFAULT_NO_SPIN_MUTEXES bool +config HAVE_USER_RETURN_NOTIFIER + bool + +config HAVE_PERF_EVENTS_NMI + bool + help + System hardware can generate an NMI using the perf event + subsystem. Also has support for calculating CPU cycle events + to determine how many clock cycles in a given period. + +config HAVE_PERF_REGS + bool + help + Support selective register dumps for perf events. This includes + bit-mapping of each registers and a unique architecture id. + +config HAVE_PERF_USER_STACK_DUMP + bool + help + Support user stack dumps for perf event samples. This needs + access to the user stack pointer which is not unified across + architectures. + +config HAVE_ARCH_MUTEX_CPU_RELAX + bool + +config ARCH_HAVE_NMI_SAFE_CMPXCHG + bool + source "kernel/gcov/Kconfig" diff -uprN linux-2.6.32/arch/m32r/include/asm/mmzone.h linux-2.6.32.ovz/arch/m32r/include/asm/mmzone.h --- linux-2.6.32/arch/m32r/include/asm/mmzone.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m32r/include/asm/mmzone.h 2015-03-10 13:22:35.000000000 -0700 @@ -14,12 +14,6 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) #define node_localnr(pfn, nid) ((pfn) - NODE_DATA(nid)->node_start_pfn) -#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) -#define node_end_pfn(nid) \ -({ \ - pg_data_t *__pgdat = NODE_DATA(nid); \ - __pgdat->node_start_pfn + __pgdat->node_spanned_pages - 1; \ -}) #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) /* @@ -44,7 +38,7 @@ static __inline__ int pfn_to_nid(unsigne int node; for (node = 0 ; node < MAX_NUMNODES ; node++) - if (pfn >= node_start_pfn(node) && pfn <= node_end_pfn(node)) + if (pfn >= node_start_pfn(node) && pfn < node_end_pfn(node)) break; return node; diff -uprN linux-2.6.32/arch/m32r/include/asm/module.h linux-2.6.32.ovz/arch/m32r/include/asm/module.h --- linux-2.6.32/arch/m32r/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m32r/include/asm/module.h 2015-03-10 13:20:58.000000000 -0700 @@ -3,8 +3,13 @@ struct mod_arch_specific { }; +#define MODULES_ARE_ELF32 #define Elf_Shdr Elf32_Shdr #define Elf_Sym Elf32_Sym #define Elf_Ehdr Elf32_Ehdr +#define Elf_Rel Elf32_Rel +#define Elf_Rela Elf32_Rela +#define ELF_R_TYPE(X) ELF32_R_TYPE(X) +#define ELF_R_SYM(X) ELF32_R_SYM(X) #endif /* _ASM_M32R_MODULE_H */ diff -uprN linux-2.6.32/arch/m32r/include/asm/socket.h linux-2.6.32.ovz/arch/m32r/include/asm/socket.h --- linux-2.6.32/arch/m32r/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m32r/include/asm/socket.h 2015-03-10 13:20:58.000000000 -0700 @@ -60,4 +60,5 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 #endif /* _ASM_M32R_SOCKET_H */ diff -uprN linux-2.6.32/arch/m32r/kernel/syscall_table.S linux-2.6.32.ovz/arch/m32r/kernel/syscall_table.S --- linux-2.6.32/arch/m32r/kernel/syscall_table.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m32r/kernel/syscall_table.S 2015-03-10 13:21:11.000000000 -0700 @@ -191,7 +191,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* streams2 */ .long sys_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 + .long sys_mmap_pgoff .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */ diff -uprN linux-2.6.32/arch/m32r/kernel/sys_m32r.c linux-2.6.32.ovz/arch/m32r/kernel/sys_m32r.c --- linux-2.6.32/arch/m32r/kernel/sys_m32r.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m32r/kernel/sys_m32r.c 2015-03-10 13:21:11.000000000 -0700 @@ -76,30 +76,6 @@ asmlinkage int sys_tas(int __user *addr) return oldval; } -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - /* * sys_ipc() is the de-multiplexer for the SysV IPC calls.. * diff -uprN linux-2.6.32/arch/m68k/include/asm/module.h linux-2.6.32.ovz/arch/m68k/include/asm/module.h --- linux-2.6.32/arch/m68k/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m68k/include/asm/module.h 2015-03-10 13:20:58.000000000 -0700 @@ -41,8 +41,13 @@ struct mod_arch_specific { #endif /* CONFIG_MMU */ +#define MODULES_ARE_ELF32 #define Elf_Shdr Elf32_Shdr #define Elf_Sym Elf32_Sym #define Elf_Ehdr Elf32_Ehdr +#define Elf_Rel Elf32_Rel +#define Elf_Rela Elf32_Rela +#define ELF_R_TYPE(X) ELF32_R_TYPE(X) +#define ELF_R_SYM(X) ELF32_R_SYM(X) #endif /* _ASM_M68K_MODULE_H */ diff -uprN linux-2.6.32/arch/m68k/include/asm/socket.h linux-2.6.32.ovz/arch/m68k/include/asm/socket.h --- linux-2.6.32/arch/m68k/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m68k/include/asm/socket.h 2015-03-10 13:20:58.000000000 -0700 @@ -60,4 +60,5 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 #endif /* _ASM_SOCKET_H */ diff -uprN linux-2.6.32/arch/m68k/kernel/sys_m68k.c linux-2.6.32.ovz/arch/m68k/kernel/sys_m68k.c --- linux-2.6.32/arch/m68k/kernel/sys_m68k.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m68k/kernel/sys_m68k.c 2015-03-10 13:21:11.000000000 -0700 @@ -29,37 +29,16 @@ #include #include -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { - return do_mmap2(addr, len, prot, flags, fd, pgoff); + /* + * This is wrong for sun3 - there PAGE_SIZE is 8Kb, + * so we need to shift the argument down by 1; m68k mmap64(3) + * (in libc) expects the last argument of mmap2 in 4Kb units. + */ + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } /* @@ -90,57 +69,11 @@ asmlinkage int old_mmap(struct mmap_arg_ if (a.offset & ~PAGE_MASK) goto out; - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); -out: - return error; -} - -#if 0 -struct mmap_arg_struct64 { - __u32 addr; - __u32 len; - __u32 prot; - __u32 flags; - __u64 offset; /* 64 bits */ - __u32 fd; -}; - -asmlinkage long sys_mmap64(struct mmap_arg_struct64 *arg) -{ - int error = -EFAULT; - struct file * file = NULL; - struct mmap_arg_struct64 a; - unsigned long pgoff; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - - if ((long)a.offset & ~PAGE_MASK) - return -EINVAL; - - pgoff = a.offset >> PAGE_SHIFT; - if ((a.offset >> PAGE_SHIFT) != pgoff) - return -EINVAL; - - if (!(a.flags & MAP_ANONYMOUS)) { - error = -EBADF; - file = fget(a.fd); - if (!file) - goto out; - } - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, pgoff); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); out: return error; } -#endif struct sel_arg_struct { unsigned long n; diff -uprN linux-2.6.32/arch/m68knommu/kernel/syscalltable.S linux-2.6.32.ovz/arch/m68knommu/kernel/syscalltable.S --- linux-2.6.32/arch/m68knommu/kernel/syscalltable.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m68knommu/kernel/syscalltable.S 2015-03-10 13:21:11.000000000 -0700 @@ -210,7 +210,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* streams2 */ .long sys_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 + .long sys_mmap_pgoff .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */ diff -uprN linux-2.6.32/arch/m68knommu/kernel/sys_m68k.c linux-2.6.32.ovz/arch/m68knommu/kernel/sys_m68k.c --- linux-2.6.32/arch/m68knommu/kernel/sys_m68k.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/m68knommu/kernel/sys_m68k.c 2015-03-10 13:21:11.000000000 -0700 @@ -27,39 +27,6 @@ #include #include -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - return do_mmap2(addr, len, prot, flags, fd, pgoff); -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. Linux/m68k cloned Linux/i386, which didn't use to be able to @@ -88,9 +55,8 @@ asmlinkage int old_mmap(struct mmap_arg_ if (a.offset & ~PAGE_MASK) goto out; - a.flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); out: return error; } diff -uprN linux-2.6.32/arch/microblaze/include/asm/unistd.h linux-2.6.32.ovz/arch/microblaze/include/asm/unistd.h --- linux-2.6.32/arch/microblaze/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/microblaze/include/asm/unistd.h 2015-03-10 13:21:18.000000000 -0700 @@ -382,8 +382,9 @@ #define __NR_pwritev 364 /* new */ #define __NR_rt_tgsigqueueinfo 365 /* new */ #define __NR_perf_event_open 366 /* new */ +#define __NR_recvmmsg 367 /* new */ -#define __NR_syscalls 367 +#define __NR_syscalls 368 #ifdef __KERNEL__ #ifndef __ASSEMBLY__ diff -uprN linux-2.6.32/arch/microblaze/kernel/syscall_table.S linux-2.6.32.ovz/arch/microblaze/kernel/syscall_table.S --- linux-2.6.32/arch/microblaze/kernel/syscall_table.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/microblaze/kernel/syscall_table.S 2015-03-10 13:21:18.000000000 -0700 @@ -196,7 +196,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* reserved for streams2 */ .long sys_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 /* mmap2 */ + .long sys_mmap_pgoff /* mmap2 */ .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */ @@ -371,3 +371,4 @@ ENTRY(sys_call_table) .long sys_ni_syscall .long sys_rt_tgsigqueueinfo /* 365 */ .long sys_perf_event_open + .long sys_recvmmsg diff -uprN linux-2.6.32/arch/microblaze/kernel/sys_microblaze.c linux-2.6.32.ovz/arch/microblaze/kernel/sys_microblaze.c --- linux-2.6.32/arch/microblaze/kernel/sys_microblaze.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/microblaze/kernel/sys_microblaze.c 2015-03-10 13:23:54.000000000 -0700 @@ -62,46 +62,14 @@ out: return error; } -asmlinkage long -sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - struct file *file = NULL; - int ret = -EBADF; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) { - printk(KERN_INFO "no fd in mmap\r\n"); - goto out; - } - } - - down_write(¤t->mm->mmap_sem); - ret = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); -out: - return ret; -} - asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, off_t pgoff) { - int err = -EINVAL; - - if (pgoff & ~PAGE_MASK) { - printk(KERN_INFO "no pagemask in mmap\r\n"); - goto out; - } + if (pgoff & ~PAGE_MASK) + return -EINVAL; - err = sys_mmap2(addr, len, prot, flags, fd, pgoff >> PAGE_SHIFT); -out: - return err; + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff >> PAGE_SHIFT); } /* diff -uprN linux-2.6.32/arch/mips/include/asm/errno.h linux-2.6.32.ovz/arch/mips/include/asm/errno.h --- linux-2.6.32/arch/mips/include/asm/errno.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/include/asm/errno.h 2015-03-10 13:24:05.000000000 -0700 @@ -102,7 +102,7 @@ #define EWOULDBLOCK EAGAIN /* Operation would block */ #define EALREADY 149 /* Operation already in progress */ #define EINPROGRESS 150 /* Operation now in progress */ -#define ESTALE 151 /* Stale NFS file handle */ +#define ESTALE 151 /* Stale file handle */ #define ECANCELED 158 /* AIO operation canceled */ /* @@ -121,6 +121,8 @@ #define ERFKILL 167 /* Operation not possible due to RF-kill */ +#define EHWPOISON 168 /* Memory page has hardware error */ + #define EDQUOT 1133 /* Quota exceeded */ #ifdef __KERNEL__ diff -uprN linux-2.6.32/arch/mips/include/asm/mman.h linux-2.6.32.ovz/arch/mips/include/asm/mman.h --- linux-2.6.32/arch/mips/include/asm/mman.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/include/asm/mman.h 2015-06-23 04:16:16.000000000 -0700 @@ -77,6 +77,13 @@ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ #define MADV_HWPOISON 100 /* poison a page for testing */ +#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */ + +#define MADV_DONTDUMP 16 /* Explicity exclude from the core dump, + overrides the coredump filter bits */ +#define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ + /* compatibility flags */ #define MAP_FILE 0 diff -uprN linux-2.6.32/arch/mips/include/asm/module.h linux-2.6.32.ovz/arch/mips/include/asm/module.h --- linux-2.6.32/arch/mips/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/include/asm/module.h 2015-03-10 13:20:58.000000000 -0700 @@ -33,11 +33,15 @@ typedef struct { } Elf64_Mips_Rela; #ifdef CONFIG_32BIT - +#define MODULES_ARE_ELF32 #define Elf_Shdr Elf32_Shdr #define Elf_Sym Elf32_Sym #define Elf_Ehdr Elf32_Ehdr #define Elf_Addr Elf32_Addr +#define Elf_Rel Elf32_Rel +#define Elf_Rela Elf32_Rela +#define ELF_R_TYPE(X) ELF32_R_TYPE(X) +#define ELF_R_SYM(X) ELF32_R_SYM(X) #define Elf_Mips_Rel Elf32_Rel #define Elf_Mips_Rela Elf32_Rela @@ -48,11 +52,15 @@ typedef struct { #endif #ifdef CONFIG_64BIT - +#define MODULES_ARE_ELF64 #define Elf_Shdr Elf64_Shdr #define Elf_Sym Elf64_Sym #define Elf_Ehdr Elf64_Ehdr #define Elf_Addr Elf64_Addr +#define Elf_Rel Elf64_Rel +#define Elf_Rela Elf64_Rela +#define ELF_R_TYPE(X) ELF64_R_TYPE(X) +#define ELF_R_SYM(X) ELF64_R_SYM(X) #define Elf_Mips_Rel Elf64_Mips_Rel #define Elf_Mips_Rela Elf64_Mips_Rela diff -uprN linux-2.6.32/arch/mips/include/asm/socket.h linux-2.6.32.ovz/arch/mips/include/asm/socket.h --- linux-2.6.32/arch/mips/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/include/asm/socket.h 2015-03-10 13:20:58.000000000 -0700 @@ -80,6 +80,8 @@ To add: #define SO_REUSEPORT 0x0200 /* A #define SO_TIMESTAMPING 37 #define SCM_TIMESTAMPING SO_TIMESTAMPING +#define SO_RXQ_OVFL 40 + #ifdef __KERNEL__ /** sock_type - Socket types diff -uprN linux-2.6.32/arch/mips/include/asm/statfs.h linux-2.6.32.ovz/arch/mips/include/asm/statfs.h --- linux-2.6.32/arch/mips/include/asm/statfs.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/include/asm/statfs.h 2015-03-10 13:22:28.000000000 -0700 @@ -33,7 +33,8 @@ struct statfs { /* Linux specials */ __kernel_fsid_t f_fsid; long f_namelen; - long f_spare[6]; + long f_flags; + long f_spare[5]; }; #if (_MIPS_SIM == _MIPS_SIM_ABI32) || (_MIPS_SIM == _MIPS_SIM_NABI32) @@ -53,7 +54,8 @@ struct statfs64 { __u64 f_bavail; __kernel_fsid_t f_fsid; __u32 f_namelen; - __u32 f_spare[6]; + __u32 f_flags; + __u32 f_spare[5]; }; #endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */ @@ -73,7 +75,8 @@ struct statfs64 { /* Same as struct st /* Linux specials */ __kernel_fsid_t f_fsid; long f_namelen; - long f_spare[6]; + long f_flags; + long f_spare[5]; }; struct compat_statfs64 { @@ -88,7 +91,8 @@ struct compat_statfs64 { __u64 f_bavail; __kernel_fsid_t f_fsid; __u32 f_namelen; - __u32 f_spare[6]; + __u32 f_flags; + __u32 f_spare[5]; }; #endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */ diff -uprN linux-2.6.32/arch/mips/include/asm/unistd.h linux-2.6.32.ovz/arch/mips/include/asm/unistd.h --- linux-2.6.32/arch/mips/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/include/asm/unistd.h 2015-03-10 13:21:18.000000000 -0700 @@ -355,16 +355,17 @@ #define __NR_rt_tgsigqueueinfo (__NR_Linux + 332) #define __NR_perf_event_open (__NR_Linux + 333) #define __NR_accept4 (__NR_Linux + 334) +#define __NR_recvmmsg (__NR_Linux + 335) /* * Offset of the last Linux o32 flavoured syscall */ -#define __NR_Linux_syscalls 334 +#define __NR_Linux_syscalls 335 #endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */ #define __NR_O32_Linux 4000 -#define __NR_O32_Linux_syscalls 334 +#define __NR_O32_Linux_syscalls 335 #if _MIPS_SIM == _MIPS_SIM_ABI64 @@ -666,16 +667,17 @@ #define __NR_rt_tgsigqueueinfo (__NR_Linux + 291) #define __NR_perf_event_open (__NR_Linux + 292) #define __NR_accept4 (__NR_Linux + 293) +#define __NR_recvmmsg (__NR_Linux + 294) /* * Offset of the last Linux 64-bit flavoured syscall */ -#define __NR_Linux_syscalls 293 +#define __NR_Linux_syscalls 294 #endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */ #define __NR_64_Linux 5000 -#define __NR_64_Linux_syscalls 293 +#define __NR_64_Linux_syscalls 294 #if _MIPS_SIM == _MIPS_SIM_NABI32 @@ -981,16 +983,17 @@ #define __NR_rt_tgsigqueueinfo (__NR_Linux + 295) #define __NR_perf_event_open (__NR_Linux + 296) #define __NR_accept4 (__NR_Linux + 297) +#define __NR_recvmmsg (__NR_Linux + 298) /* * Offset of the last N32 flavoured syscall */ -#define __NR_Linux_syscalls 297 +#define __NR_Linux_syscalls 298 #endif /* _MIPS_SIM == _MIPS_SIM_NABI32 */ #define __NR_N32_Linux 6000 -#define __NR_N32_Linux_syscalls 297 +#define __NR_N32_Linux_syscalls 298 #ifdef __KERNEL__ diff -uprN linux-2.6.32/arch/mips/kernel/linux32.c linux-2.6.32.ovz/arch/mips/kernel/linux32.c --- linux-2.6.32/arch/mips/kernel/linux32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/kernel/linux32.c 2015-03-10 13:23:54.000000000 -0700 @@ -67,28 +67,13 @@ SYSCALL_DEFINE6(32_mmap2, unsigned long, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, pgoff) { - struct file * file = NULL; unsigned long error; error = -EINVAL; if (pgoff & (~PAGE_MASK >> 12)) goto out; - pgoff >>= PAGE_SHIFT-12; - - if (!(flags & MAP_ANONYMOUS)) { - error = -EBADF; - file = fget(fd); - if (!file) - goto out; - } - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); - + error = sys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT-12)); out: return error; } diff -uprN linux-2.6.32/arch/mips/kernel/scall32-o32.S linux-2.6.32.ovz/arch/mips/kernel/scall32-o32.S --- linux-2.6.32/arch/mips/kernel/scall32-o32.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/kernel/scall32-o32.S 2015-03-10 13:21:18.000000000 -0700 @@ -583,6 +583,7 @@ einval: li v0, -ENOSYS sys sys_rt_tgsigqueueinfo 4 sys sys_perf_event_open 5 sys sys_accept4 4 + sys sys_recvmmsg 5 .endm /* We pre-compute the number of _instruction_ bytes needed to diff -uprN linux-2.6.32/arch/mips/kernel/scall64-64.S linux-2.6.32.ovz/arch/mips/kernel/scall64-64.S --- linux-2.6.32/arch/mips/kernel/scall64-64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/kernel/scall64-64.S 2015-03-10 13:21:18.000000000 -0700 @@ -420,4 +420,5 @@ sys_call_table: PTR sys_rt_tgsigqueueinfo PTR sys_perf_event_open PTR sys_accept4 + PTR sys_recvmmsg .size sys_call_table,.-sys_call_table diff -uprN linux-2.6.32/arch/mips/kernel/scall64-n32.S linux-2.6.32.ovz/arch/mips/kernel/scall64-n32.S --- linux-2.6.32/arch/mips/kernel/scall64-n32.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/kernel/scall64-n32.S 2015-03-10 13:21:18.000000000 -0700 @@ -418,4 +418,5 @@ EXPORT(sysn32_call_table) PTR compat_sys_rt_tgsigqueueinfo /* 5295 */ PTR sys_perf_event_open PTR sys_accept4 + PTR compat_sys_recvmmsg .size sysn32_call_table,.-sysn32_call_table diff -uprN linux-2.6.32/arch/mips/kernel/scall64-o32.S linux-2.6.32.ovz/arch/mips/kernel/scall64-o32.S --- linux-2.6.32/arch/mips/kernel/scall64-o32.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/kernel/scall64-o32.S 2015-03-10 13:21:18.000000000 -0700 @@ -538,4 +538,5 @@ sys_call_table: PTR compat_sys_rt_tgsigqueueinfo PTR sys_perf_event_open PTR sys_accept4 + PTR compat_sys_recvmmsg .size sys_call_table,.-sys_call_table diff -uprN linux-2.6.32/arch/mips/kernel/syscall.c linux-2.6.32.ovz/arch/mips/kernel/syscall.c --- linux-2.6.32/arch/mips/kernel/syscall.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/kernel/syscall.c 2015-03-10 13:23:54.000000000 -0700 @@ -93,7 +93,8 @@ unsigned long arch_get_unmapped_area(str * We do not accept a shared mapping if it would violate * cache aliasing constraints. */ - if ((flags & MAP_SHARED) && (addr & shm_align_mask)) + if ((flags & MAP_SHARED) && + ((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask)) return -EINVAL; return addr; } @@ -129,31 +130,6 @@ unsigned long arch_get_unmapped_area(str } } -/* common code for old and new mmaps */ -static inline unsigned long -do_mmap2(unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, unsigned long pgoff) -{ - unsigned long error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - SYSCALL_DEFINE6(mips_mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, off_t, offset) @@ -164,7 +140,7 @@ SYSCALL_DEFINE6(mips_mmap, unsigned long if (offset & ~PAGE_MASK) goto out; - result = do_mmap2(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); + result = sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); out: return result; @@ -177,7 +153,7 @@ SYSCALL_DEFINE6(mips_mmap2, unsigned lon if (pgoff & (~PAGE_MASK >> 12)) return -EINVAL; - return do_mmap2(addr, len, prot, flags, fd, pgoff >> (PAGE_SHIFT-12)); + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff >> (PAGE_SHIFT-12)); } save_static_function(sys_fork); diff -uprN linux-2.6.32/arch/mips/mm/hugetlbpage.c linux-2.6.32.ovz/arch/mips/mm/hugetlbpage.c --- linux-2.6.32/arch/mips/mm/hugetlbpage.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/mm/hugetlbpage.c 2015-03-10 13:23:25.000000000 -0700 @@ -24,7 +24,7 @@ #include pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, - unsigned long sz) + unsigned long sz, bool *shared) { pgd_t *pgd; pud_t *pud; diff -uprN linux-2.6.32/arch/mips/mm/init.c linux-2.6.32.ovz/arch/mips/mm/init.c --- linux-2.6.32/arch/mips/mm/init.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/mm/init.c 2015-03-10 13:21:18.000000000 -0700 @@ -298,7 +298,7 @@ void __init fixrange_init(unsigned long } #ifndef CONFIG_NEED_MULTIPLE_NODES -static int __init page_is_ram(unsigned long pagenr) +int page_is_ram(unsigned long pagenr) { int i; diff -uprN linux-2.6.32/arch/mips/txx9/generic/setup.c linux-2.6.32.ovz/arch/mips/txx9/generic/setup.c --- linux-2.6.32/arch/mips/txx9/generic/setup.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mips/txx9/generic/setup.c 2015-03-10 13:21:32.000000000 -0700 @@ -930,7 +930,7 @@ struct txx9_sramc_sysdev { void __iomem *base; }; -static ssize_t txx9_sram_read(struct kobject *kobj, +static ssize_t txx9_sram_read(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t size) { @@ -945,7 +945,7 @@ static ssize_t txx9_sram_read(struct kob return size; } -static ssize_t txx9_sram_write(struct kobject *kobj, +static ssize_t txx9_sram_write(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, char *buf, loff_t pos, size_t size) { diff -uprN linux-2.6.32/arch/mn10300/include/asm/mman.h linux-2.6.32.ovz/arch/mn10300/include/asm/mman.h --- linux-2.6.32/arch/mn10300/include/asm/mman.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mn10300/include/asm/mman.h 2015-03-10 13:21:11.000000000 -0700 @@ -1 +1,6 @@ #include + +#define MIN_MAP_ADDR PAGE_SIZE /* minimum fixed mmap address */ + +#define arch_mmap_check(addr, len, flags) \ + (((flags) & MAP_FIXED && (addr) < MIN_MAP_ADDR) ? -EINVAL : 0) diff -uprN linux-2.6.32/arch/mn10300/include/asm/socket.h linux-2.6.32.ovz/arch/mn10300/include/asm/socket.h --- linux-2.6.32/arch/mn10300/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mn10300/include/asm/socket.h 2015-03-10 13:20:58.000000000 -0700 @@ -60,4 +60,5 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 #endif /* _ASM_SOCKET_H */ diff -uprN linux-2.6.32/arch/mn10300/kernel/entry.S linux-2.6.32.ovz/arch/mn10300/kernel/entry.S --- linux-2.6.32/arch/mn10300/kernel/entry.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mn10300/kernel/entry.S 2015-03-10 13:21:11.000000000 -0700 @@ -578,7 +578,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* reserved for streams2 */ .long sys_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 + .long sys_mmap_pgoff .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */ diff -uprN linux-2.6.32/arch/mn10300/kernel/sys_mn10300.c linux-2.6.32.ovz/arch/mn10300/kernel/sys_mn10300.c --- linux-2.6.32/arch/mn10300/kernel/sys_mn10300.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/mn10300/kernel/sys_mn10300.c 2015-03-10 13:21:11.000000000 -0700 @@ -23,47 +23,13 @@ #include -#define MIN_MAP_ADDR PAGE_SIZE /* minimum fixed mmap address */ - -/* - * memory mapping syscall - */ -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - struct file *file = NULL; - long error = -EINVAL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - if (flags & MAP_FIXED && addr < MIN_MAP_ADDR) - goto out; - - error = -EBADF; - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - asmlinkage long old_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) { if (offset & ~PAGE_MASK) return -EINVAL; - return sys_mmap2(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); + return sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); } struct sel_arg_struct { diff -uprN linux-2.6.32/arch/parisc/hpux/sys_hpux.c linux-2.6.32.ovz/arch/parisc/hpux/sys_hpux.c --- linux-2.6.32/arch/parisc/hpux/sys_hpux.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/hpux/sys_hpux.c 2015-03-10 13:23:54.000000000 -0700 @@ -145,7 +145,7 @@ static int hpux_ustat(dev_t dev, struct s = user_get_super(dev); if (s == NULL) goto out; - err = vfs_statfs(s->s_root, &sbuf); + err = statfs_by_dentry(s->s_root, &sbuf); drop_super(s); if (err) goto out; @@ -186,12 +186,12 @@ struct hpux_statfs { int16_t f_pad; }; -static int vfs_statfs_hpux(struct dentry *dentry, struct hpux_statfs *buf) +static int do_statfs_hpux(struct path *path, struct hpux_statfs *buf) { struct kstatfs st; int retval; - retval = vfs_statfs(dentry, &st); + retval = vfs_statfs(path, &st); if (retval) return retval; @@ -219,7 +219,7 @@ asmlinkage long hpux_statfs(const char _ error = user_path(pathname, &path); if (!error) { struct hpux_statfs tmp; - error = vfs_statfs_hpux(path.dentry, &tmp); + error = do_statfs_hpux(&path, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; path_put(&path); @@ -237,7 +237,7 @@ asmlinkage long hpux_fstatfs(unsigned in file = fget(fd); if (!file) goto out; - error = vfs_statfs_hpux(file->f_path.dentry, &tmp); + error = do_statfs_hpux(&file->f_path, &tmp); if (!error && copy_to_user(buf, &tmp, sizeof(tmp))) error = -EFAULT; fput(file); diff -uprN linux-2.6.32/arch/parisc/include/asm/errno.h linux-2.6.32.ovz/arch/parisc/include/asm/errno.h --- linux-2.6.32/arch/parisc/include/asm/errno.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/include/asm/errno.h 2015-03-10 13:24:05.000000000 -0700 @@ -37,7 +37,7 @@ #define EBADMSG 67 /* Not a data message */ #define EUSERS 68 /* Too many users */ #define EDQUOT 69 /* Quota exceeded */ -#define ESTALE 70 /* Stale NFS file handle */ +#define ESTALE 70 /* Stale file handle */ #define EREMOTE 71 /* Object is remote */ #define EOVERFLOW 72 /* Value too large for defined data type */ @@ -122,4 +122,6 @@ #define ERFKILL 256 /* Operation not possible due to RF-kill */ +#define EHWPOISON 257 /* Memory page has hardware error */ + #endif diff -uprN linux-2.6.32/arch/parisc/include/asm/mman.h linux-2.6.32.ovz/arch/parisc/include/asm/mman.h --- linux-2.6.32/arch/parisc/include/asm/mman.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/include/asm/mman.h 2015-06-23 04:16:16.000000000 -0700 @@ -59,6 +59,13 @@ #define MADV_MERGEABLE 65 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 66 /* KSM may not merge identical pages */ +#define MADV_HUGEPAGE 67 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 68 /* Not worth backing with hugepages */ + +#define MADV_DONTDUMP 69 /* Explicity exclude from the core dump, + overrides the coredump filter bits */ +#define MADV_DODUMP 70 /* Clear the MADV_NODUMP flag */ + /* compatibility flags */ #define MAP_FILE 0 #define MAP_VARIABLE 0 diff -uprN linux-2.6.32/arch/parisc/include/asm/mmzone.h linux-2.6.32.ovz/arch/parisc/include/asm/mmzone.h --- linux-2.6.32/arch/parisc/include/asm/mmzone.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/include/asm/mmzone.h 2015-03-10 13:22:35.000000000 -0700 @@ -14,13 +14,6 @@ extern struct node_map_data node_data[]; #define NODE_DATA(nid) (&node_data[nid].pg_data) -#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) -#define node_end_pfn(nid) \ -({ \ - pg_data_t *__pgdat = NODE_DATA(nid); \ - __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ -}) - /* We have these possible memory map layouts: * Astro: 0-3.75, 67.75-68, 4-64 * zx1: 0-1, 257-260, 4-256 diff -uprN linux-2.6.32/arch/parisc/include/asm/module.h linux-2.6.32.ovz/arch/parisc/include/asm/module.h --- linux-2.6.32/arch/parisc/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/include/asm/module.h 2015-03-10 13:20:58.000000000 -0700 @@ -4,17 +4,25 @@ * This file contains the parisc architecture specific module code. */ #ifdef CONFIG_64BIT +#define MODULES_ARE_ELF64 #define Elf_Shdr Elf64_Shdr #define Elf_Sym Elf64_Sym #define Elf_Ehdr Elf64_Ehdr #define Elf_Addr Elf64_Addr +#define Elf_Rel Elf64_Rel #define Elf_Rela Elf64_Rela +#define ELF_R_TYPE(X) ELF64_R_TYPE(X) +#define ELF_R_SYM(X) ELF64_R_SYM(X) #else +#define MODULES_ARE_ELF32 #define Elf_Shdr Elf32_Shdr #define Elf_Sym Elf32_Sym #define Elf_Ehdr Elf32_Ehdr #define Elf_Addr Elf32_Addr +#define Elf_Rel Elf32_Rel #define Elf_Rela Elf32_Rela +#define ELF_R_TYPE(X) ELF32_R_TYPE(X) +#define ELF_R_SYM(X) ELF32_R_SYM(X) #endif struct unwind_table; diff -uprN linux-2.6.32/arch/parisc/include/asm/socket.h linux-2.6.32.ovz/arch/parisc/include/asm/socket.h --- linux-2.6.32/arch/parisc/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/include/asm/socket.h 2015-03-10 13:20:58.000000000 -0700 @@ -59,6 +59,8 @@ #define SO_TIMESTAMPING 0x4020 #define SCM_TIMESTAMPING SO_TIMESTAMPING +#define SO_RXQ_OVFL 0x4021 + /* O_NONBLOCK clashes with the bits used for socket types. Therefore we * have to define SOCK_NONBLOCK to a different value here. */ diff -uprN linux-2.6.32/arch/parisc/include/asm/unistd.h linux-2.6.32.ovz/arch/parisc/include/asm/unistd.h --- linux-2.6.32/arch/parisc/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/include/asm/unistd.h 2015-03-10 13:21:18.000000000 -0700 @@ -811,8 +811,9 @@ #define __NR_pwritev (__NR_Linux + 316) #define __NR_rt_tgsigqueueinfo (__NR_Linux + 317) #define __NR_perf_event_open (__NR_Linux + 318) +#define __NR_recvmmsg (__NR_Linux + 319) -#define __NR_Linux_syscalls (__NR_perf_event_open + 1) +#define __NR_Linux_syscalls (__NR_recvmmsg + 1) #define __IGNORE_select /* newselect */ diff -uprN linux-2.6.32/arch/parisc/kernel/syscall_table.S linux-2.6.32.ovz/arch/parisc/kernel/syscall_table.S --- linux-2.6.32/arch/parisc/kernel/syscall_table.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/kernel/syscall_table.S 2015-03-10 13:21:18.000000000 -0700 @@ -417,6 +417,7 @@ ENTRY_COMP(pwritev) ENTRY_COMP(rt_tgsigqueueinfo) ENTRY_SAME(perf_event_open) + ENTRY_COMP(recvmmsg) /* Nothing yet */ diff -uprN linux-2.6.32/arch/parisc/kernel/sys_parisc32.c linux-2.6.32.ovz/arch/parisc/kernel/sys_parisc32.c --- linux-2.6.32/arch/parisc/kernel/sys_parisc32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/kernel/sys_parisc32.c 2015-03-10 13:23:54.000000000 -0700 @@ -26,13 +26,7 @@ #include #include #include -#include #include -#include -#include -#include -#include -#include #include #include #include diff -uprN linux-2.6.32/arch/parisc/kernel/sys_parisc.c linux-2.6.32.ovz/arch/parisc/kernel/sys_parisc.c --- linux-2.6.32/arch/parisc/kernel/sys_parisc.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/kernel/sys_parisc.c 2015-03-10 13:21:11.000000000 -0700 @@ -110,37 +110,14 @@ unsigned long arch_get_unmapped_area(str return addr; } -static unsigned long do_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, unsigned long fd, - unsigned long pgoff) -{ - struct file * file = NULL; - unsigned long error = -EBADF; - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file != NULL) - fput(file); -out: - return error; -} - asmlinkage unsigned long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { /* Make sure the shift for mmap2 is constant (12), no matter what PAGE_SIZE we have. */ - return do_mmap2(addr, len, prot, flags, fd, pgoff >> (PAGE_SHIFT - 12)); + return sys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT - 12)); } asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len, @@ -148,7 +125,8 @@ asmlinkage unsigned long sys_mmap(unsign unsigned long offset) { if (!(offset & ~PAGE_MASK)) { - return do_mmap2(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); + return sys_mmap_pgoff(addr, len, prot, flags, fd, + offset >> PAGE_SHIFT); } else { return -EINVAL; } diff -uprN linux-2.6.32/arch/parisc/mm/init.c linux-2.6.32.ovz/arch/parisc/mm/init.c --- linux-2.6.32/arch/parisc/mm/init.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/parisc/mm/init.c 2015-03-10 13:22:29.000000000 -0700 @@ -543,7 +543,7 @@ void __init mem_init(void) unsigned long *empty_zero_page __read_mostly; EXPORT_SYMBOL(empty_zero_page); -void show_mem(void) +void show_mem(unsigned int filter) { int i,free = 0,total = 0,reserved = 0; int shared = 0, cached = 0; diff -uprN linux-2.6.32/arch/powerpc/include/asm/asm-compat.h linux-2.6.32.ovz/arch/powerpc/include/asm/asm-compat.h --- linux-2.6.32/arch/powerpc/include/asm/asm-compat.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/asm-compat.h 2015-03-10 13:21:31.000000000 -0700 @@ -2,6 +2,7 @@ #define _ASM_POWERPC_ASM_COMPAT_H #include +#include #ifdef __ASSEMBLY__ # define stringify_in_c(...) __VA_ARGS__ @@ -24,7 +25,7 @@ #define PPC_LONG stringify_in_c(.llong) #define PPC_LONG_ALIGN stringify_in_c(.balign 8) #define PPC_TLNEI stringify_in_c(tdnei) -#define PPC_LLARX stringify_in_c(ldarx) +#define PPC_LLARX(t, a, b, eh) PPC_LDARX(t, a, b, eh) #define PPC_STLCX stringify_in_c(stdcx.) #define PPC_CNTLZL stringify_in_c(cntlzd) @@ -46,7 +47,7 @@ #define PPC_LONG stringify_in_c(.long) #define PPC_LONG_ALIGN stringify_in_c(.balign 4) #define PPC_TLNEI stringify_in_c(twnei) -#define PPC_LLARX stringify_in_c(lwarx) +#define PPC_LLARX(t, a, b, eh) PPC_LWARX(t, a, b, eh) #define PPC_STLCX stringify_in_c(stwcx.) #define PPC_CNTLZL stringify_in_c(cntlzw) #define PPC_MTOCRF stringify_in_c(mtcrf) diff -uprN linux-2.6.32/arch/powerpc/include/asm/bitops.h linux-2.6.32.ovz/arch/powerpc/include/asm/bitops.h --- linux-2.6.32/arch/powerpc/include/asm/bitops.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/bitops.h 2015-03-10 13:24:00.000000000 -0700 @@ -65,7 +65,7 @@ static __inline__ void fn(unsigned long unsigned long *p = (unsigned long *)_p; \ __asm__ __volatile__ ( \ prefix \ -"1:" PPC_LLARX "%0,0,%3\n" \ +"1:" PPC_LLARX(%0,0,%3,0) "\n" \ stringify_in_c(op) "%0,%0,%2\n" \ PPC405_ERR77(0,%3) \ PPC_STLCX "%0,0,%3\n" \ @@ -103,31 +103,31 @@ static __inline__ void change_bit(int nr /* Like DEFINE_BITOP(), with changes to the arguments to 'op' and the output * operands. */ -#define DEFINE_TESTOP(fn, op, prefix, postfix) \ -static __inline__ unsigned long fn( \ - unsigned long mask, \ - volatile unsigned long *_p) \ -{ \ - unsigned long old, t; \ - unsigned long *p = (unsigned long *)_p; \ - __asm__ __volatile__ ( \ - prefix \ -"1:" PPC_LLARX "%0,0,%3\n" \ - stringify_in_c(op) "%1,%0,%2\n" \ - PPC405_ERR77(0,%3) \ - PPC_STLCX "%1,0,%3\n" \ - "bne- 1b\n" \ - postfix \ - : "=&r" (old), "=&r" (t) \ - : "r" (mask), "r" (p) \ - : "cc", "memory"); \ - return (old & mask); \ -} - -DEFINE_TESTOP(test_and_set_bits, or, LWSYNC_ON_SMP, ISYNC_ON_SMP) -DEFINE_TESTOP(test_and_set_bits_lock, or, "", ISYNC_ON_SMP) -DEFINE_TESTOP(test_and_clear_bits, andc, LWSYNC_ON_SMP, ISYNC_ON_SMP) -DEFINE_TESTOP(test_and_change_bits, xor, LWSYNC_ON_SMP, ISYNC_ON_SMP) +#define DEFINE_TESTOP(fn, op, prefix, postfix, eh) \ +static __inline__ unsigned long fn( \ + unsigned long mask, \ + volatile unsigned long *_p) \ +{ \ + unsigned long old, t; \ + unsigned long *p = (unsigned long *)_p; \ + __asm__ __volatile__ ( \ + prefix \ +"1:" PPC_LLARX(%0,0,%3,eh) "\n" \ + stringify_in_c(op) "%1,%0,%2\n" \ + PPC405_ERR77(0,%3) \ + PPC_STLCX "%1,0,%3\n" \ + "bne- 1b\n" \ + postfix \ + : "=&r" (old), "=&r" (t) \ + : "r" (mask), "r" (p) \ + : "cc", "memory"); \ + return (old & mask); \ +} + +DEFINE_TESTOP(test_and_set_bits, or, LWSYNC_ON_SMP, ISYNC_ON_SMP, 0) +DEFINE_TESTOP(test_and_set_bits_lock, or, "", ISYNC_ON_SMP, 1) +DEFINE_TESTOP(test_and_clear_bits, andc, LWSYNC_ON_SMP, ISYNC_ON_SMP, 0) +DEFINE_TESTOP(test_and_change_bits, xor, LWSYNC_ON_SMP, ISYNC_ON_SMP, 0) static __inline__ int test_and_set_bit(unsigned long nr, volatile unsigned long *addr) @@ -333,6 +333,33 @@ unsigned long generic_find_next_le_bit(c #include +/* Compat macros for RHEL6 */ +#define find_next_zero_bit_le(addr, size, offset) \ + generic_find_next_zero_le_bit(addr, size, offset) +#define find_next_bit_le(addr, size, offset) \ + generic_find_next_le_bit(addr, size, offset) +#define __set_bit_le(nr,addr) \ + __set_le_bit(nr, addr) +#define __clear_bit_le(nr, addr) \ + __clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr)) + +#define test_bit_le(nr, addr) \ + test_le_bit((nr), (addr)) +#define set_bit_le(nr, addr) \ + set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr)) +#define clear_bit_le(nr, addr) \ + clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr)) + +#define test_and_set_bit_le(nr, addr) \ + test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr)) +#define test_and_clear_bit_le(nr, addr) \ + test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr)) + +#define __test_and_set_bit_le(nr, addr) \ + __test_and_set_bit((nr) ^ BITOP_LE_SWIZZLE, (addr)) +#define __test_and_clear_bit_le(nr, addr) \ + __test_and_clear_bit((nr) ^ BITOP_LE_SWIZZLE, (addr)) + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_BITOPS_H */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/bug.h linux-2.6.32.ovz/arch/powerpc/include/asm/bug.h --- linux-2.6.32/arch/powerpc/include/asm/bug.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/bug.h 2015-03-10 13:21:40.000000000 -0700 @@ -85,12 +85,12 @@ } \ } while (0) -#define __WARN() do { \ +#define __WARN_TAINT(taint) do { \ __asm__ __volatile__( \ "1: twi 31,0,0\n" \ _EMIT_BUG_ENTRY \ : : "i" (__FILE__), "i" (__LINE__), \ - "i" (BUGFLAG_WARNING), \ + "i" (BUGFLAG_TAINT(taint)), \ "i" (sizeof(struct bug_entry))); \ } while (0) @@ -104,7 +104,7 @@ "1: "PPC_TLNEI" %4,0\n" \ _EMIT_BUG_ENTRY \ : : "i" (__FILE__), "i" (__LINE__), \ - "i" (BUGFLAG_WARNING), \ + "i" (BUGFLAG_TAINT(TAINT_WARN)), \ "i" (sizeof(struct bug_entry)), \ "r" (__ret_warn_on)); \ } \ diff -uprN linux-2.6.32/arch/powerpc/include/asm/checksum.h linux-2.6.32.ovz/arch/powerpc/include/asm/checksum.h --- linux-2.6.32/arch/powerpc/include/asm/checksum.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/checksum.h 2015-03-10 13:23:55.000000000 -0700 @@ -52,12 +52,22 @@ extern __wsum csum_partial(const void *b extern __wsum csum_partial_copy_generic(const void *src, void *dst, int len, __wsum sum, int *src_err, int *dst_err); + +#ifdef __powerpc64__ +#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER +extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, + int len, __wsum sum, int *err_ptr); +#define HAVE_CSUM_COPY_USER +extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, + int len, __wsum sum, int *err_ptr); +#else /* * the same as csum_partial, but copies from src to dst while it * checksums. */ #define csum_partial_copy_from_user(src, dst, len, sum, errp) \ csum_partial_copy_generic((__force const void *)(src), (dst), (len), (sum), (errp), NULL) +#endif #define csum_partial_copy_nocheck(src, dst, len, sum) \ csum_partial_copy_generic((src), (dst), (len), (sum), NULL, NULL) diff -uprN linux-2.6.32/arch/powerpc/include/asm/compat.h linux-2.6.32.ovz/arch/powerpc/include/asm/compat.h --- linux-2.6.32/arch/powerpc/include/asm/compat.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/compat.h 2015-03-10 13:21:44.000000000 -0700 @@ -133,7 +133,7 @@ static inline compat_uptr_t ptr_to_compa return (u32)(unsigned long)uptr; } -static inline void __user *compat_alloc_user_space(long len) +static inline void __user *arch_compat_alloc_user_space(long len) { struct pt_regs *regs = current->thread.regs; unsigned long usp = regs->gpr[1]; diff -uprN linux-2.6.32/arch/powerpc/include/asm/cputable.h linux-2.6.32.ovz/arch/powerpc/include/asm/cputable.h --- linux-2.6.32/arch/powerpc/include/asm/cputable.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/cputable.h 2015-03-10 13:22:52.000000000 -0700 @@ -195,6 +195,8 @@ extern const char *powerpc_base_platform #define CPU_FTR_SAO LONG_ASM_CONST(0x0020000000000000) #define CPU_FTR_CP_USE_DCBTZ LONG_ASM_CONST(0x0040000000000000) #define CPU_FTR_UNALIGNED_LD_STD LONG_ASM_CONST(0x0080000000000000) +#define CPU_FTR_ASYM_SMT LONG_ASM_CONST(0x0100000000000000) +#define CPU_FTR_VMX_COPY LONG_ASM_CONST(0x2000000000000000) #ifndef __ASSEMBLY__ @@ -409,7 +411,7 @@ extern const char *powerpc_base_platform CPU_FTR_MMCRA | CPU_FTR_SMT | \ CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \ CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \ - CPU_FTR_DSCR | CPU_FTR_SAO) + CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT | CPU_FTR_VMX_COPY) #define CPU_FTRS_CELL (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \ CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \ CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \ diff -uprN linux-2.6.32/arch/powerpc/include/asm/cputime.h linux-2.6.32.ovz/arch/powerpc/include/asm/cputime.h --- linux-2.6.32/arch/powerpc/include/asm/cputime.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/cputime.h 2015-03-10 13:23:17.000000000 -0700 @@ -124,10 +124,11 @@ static inline u64 cputime64_to_jiffies64 return mulhdu(ct, __cputime_jiffies_factor); } +extern u64 __cputime_msec_factor; + /* * Convert cputime <-> milliseconds */ -extern u64 __cputime_msec_factor; static inline unsigned long cputime_to_msecs(const cputime_t ct) { @@ -152,6 +153,36 @@ static inline cputime_t msecs_to_cputime } /* + * Convert cputime <-> microseconds + */ + +extern u64 __cputime_usec_factor; + +static inline unsigned long cputime_to_usecs(const cputime_t ct) +{ + return mulhdu(ct, __cputime_usec_factor); +} + +static inline cputime_t usecs_to_cputime(const unsigned long us) +{ + cputime_t ct; + unsigned long sec; + + /* have to be a little careful about overflow */ + ct = us % 1000000; + sec = us / 1000000; + if (ct) { + ct *= tb_ticks_per_sec; + do_div(ct, 1000000); + } + if (sec) + ct += (cputime_t) sec * tb_ticks_per_sec; + return ct; +} + +#define usecs_to_cputime64(us) usecs_to_cputime(us) + +/* * Convert cputime <-> seconds */ extern u64 __cputime_sec_factor; diff -uprN linux-2.6.32/arch/powerpc/include/asm/crash.h linux-2.6.32.ovz/arch/powerpc/include/asm/crash.h --- linux-2.6.32/arch/powerpc/include/asm/crash.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/crash.h 2015-03-10 13:23:09.000000000 -0700 @@ -0,0 +1,51 @@ +#ifndef _PPC64_CRASH_H +#define _PPC64_CRASH_H + +#ifdef __KERNEL__ + +#include +#include + + +static inline void *map_virtual(u64 offset, struct page **pp) +{ + struct page *page; + unsigned long pfn; + void *vaddr; + + pfn = (unsigned long)(offset >> PAGE_SHIFT); + + if (!page_is_ram(pfn)) { + printk(KERN_INFO + "crash memory driver: !page_is_ram(pfn: %lx)\n", pfn); + return NULL; + } + + if (!pfn_valid(pfn)) { + printk(KERN_INFO + "crash memory driver: invalid pfn: %lx )\n", pfn); + return NULL; + } + + page = pfn_to_page(pfn); + + vaddr = kmap(page); + if (!vaddr) { + printk(KERN_INFO + "crash memory driver: pfn: %lx kmap(page: %lx) failed\n", + pfn, (unsigned long) page); + return NULL; + } + + *pp = page; + return (vaddr + (offset & (PAGE_SIZE - 1))); +} + +static inline void unmap_virtual(struct page *page) +{ + kunmap(page); +} + +#endif /* __KERNEL__ */ + +#endif /* _PPC64_CRASH_H */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/dma-mapping.h linux-2.6.32.ovz/arch/powerpc/include/asm/dma-mapping.h --- linux-2.6.32/arch/powerpc/include/asm/dma-mapping.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/dma-mapping.h 2015-03-10 13:22:05.000000000 -0700 @@ -130,19 +130,7 @@ static inline int dma_supported(struct d /* We have our own implementation of pci_set_dma_mask() */ #define HAVE_ARCH_PCI_SET_DMA_MASK -static inline int dma_set_mask(struct device *dev, u64 dma_mask) -{ - struct dma_map_ops *dma_ops = get_dma_ops(dev); - - if (unlikely(dma_ops == NULL)) - return -EIO; - if (dma_ops->set_dma_mask != NULL) - return dma_ops->set_dma_mask(dev, dma_mask); - if (!dev->dma_mask || !dma_supported(dev, dma_mask)) - return -EIO; - *dev->dma_mask = dma_mask; - return 0; -} +extern int dma_set_mask(struct device *dev, u64 dma_mask); static inline void *dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) diff -uprN linux-2.6.32/arch/powerpc/include/asm/elf.h linux-2.6.32.ovz/arch/powerpc/include/asm/elf.h --- linux-2.6.32/arch/powerpc/include/asm/elf.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/elf.h 2015-06-23 04:16:16.000000000 -0700 @@ -236,14 +236,10 @@ typedef elf_vrregset_t elf_fpxregset_t; #ifdef __powerpc64__ # define SET_PERSONALITY(ex) \ do { \ - unsigned long new_flags = 0; \ if ((ex).e_ident[EI_CLASS] == ELFCLASS32) \ - new_flags = _TIF_32BIT; \ - if ((current_thread_info()->flags & _TIF_32BIT) \ - != new_flags) \ - set_thread_flag(TIF_ABI_PENDING); \ + set_thread_flag(TIF_32BIT); \ else \ - clear_thread_flag(TIF_ABI_PENDING); \ + clear_thread_flag(TIF_32BIT); \ if (personality(current->personality) != PER_LINUX32) \ set_personality(PER_LINUX | \ (current->personality & (~PER_MASK))); \ @@ -270,6 +266,7 @@ extern int ucache_bsize; /* vDSO has arch_setup_additional_pages */ #define ARCH_HAS_SETUP_ADDITIONAL_PAGES struct linux_binprm; +export struct page *vdso32_pages[1]; extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); #define VDSO_AUX_ENT(a,b) NEW_AUX_ENT(a,b); diff -uprN linux-2.6.32/arch/powerpc/include/asm/exception-64s.h linux-2.6.32.ovz/arch/powerpc/include/asm/exception-64s.h --- linux-2.6.32/arch/powerpc/include/asm/exception-64s.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/exception-64s.h 2015-03-10 13:21:47.000000000 -0700 @@ -137,7 +137,8 @@ li r10,0; \ ld r11,exception_marker@toc(r2); \ std r10,RESULT(r1); /* clear regs->result */ \ - std r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ + std r11,STACK_FRAME_OVERHEAD-16(r1); /* mark the frame */ \ + ACCOUNT_STOLEN_TIME /* * Exception vectors. diff -uprN linux-2.6.32/arch/powerpc/include/asm/fadump.h linux-2.6.32.ovz/arch/powerpc/include/asm/fadump.h --- linux-2.6.32/arch/powerpc/include/asm/fadump.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/fadump.h 2015-03-10 13:23:44.000000000 -0700 @@ -0,0 +1,223 @@ +/* + * Firmware Assisted dump header file. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright 2011 IBM Corporation + * Author: Mahesh Salgaonkar + */ + +#ifndef __PPC64_FA_DUMP_H__ +#define __PPC64_FA_DUMP_H__ + +#ifdef CONFIG_FA_DUMP + +/* + * The RMA region will be saved for later dumping when kernel crashes. + * RMA is Real Mode Area, the first block of logical memory address owned + * by logical partition, containing the storage that may be accessed with + * translate off. + */ +#define RMA_START 0x0 +#define RMA_END (lmb.rmo_size) + +/* + * On some Power systems where RMO is 128MB, it still requires minimum of + * 256MB for kernel to boot successfully. When kdump infrastructure is + * configured to save vmcore over network, we run into OOM issue while + * loading modules related to network setup. Hence we need aditional 64M + * of memory to avoid OOM issue. + */ +#define MIN_BOOT_MEM (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \ + + (0x1UL << 26)) + +#define lmb_num_regions(lmb_type) (lmb.lmb_type.cnt) + +#define for_each_lmb(region_iter, lmb_type, reg) \ + for (region_iter = 0, reg = &lmb.lmb_type.region[region_iter]; \ + region_iter < lmb.lmb_type.cnt; \ + region_iter++, reg = &lmb.lmb_type.region[region_iter]) + +#ifndef ELF_CORE_EFLAGS +#define ELF_CORE_EFLAGS 0 +#endif + +/* Firmware provided dump sections */ +#define FADUMP_CPU_STATE_DATA 0x0001 +#define FADUMP_HPTE_REGION 0x0002 +#define FADUMP_REAL_MODE_REGION 0x0011 + +/* Dump request flag */ +#define FADUMP_REQUEST_FLAG 0x00000001 + +/* FAD commands */ +#define FADUMP_REGISTER 1 +#define FADUMP_UNREGISTER 2 +#define FADUMP_INVALIDATE 3 + +/* Dump status flag */ +#define FADUMP_ERROR_FLAG 0x2000 + +#define FADUMP_CPU_ID_MASK ((1UL << 32) - 1) + +#define CPU_UNKNOWN (~((u32)0)) + +/* Utility macros */ +#define SKIP_TO_NEXT_CPU(reg_entry) \ +({ \ + while (reg_entry->reg_id != REG_ID("CPUEND")) \ + reg_entry++; \ + reg_entry++; \ +}) + +/* Kernel Dump section info */ +struct fadump_section { + u32 request_flag; + u16 source_data_type; + u16 error_flags; + u64 source_address; + u64 source_len; + u64 bytes_dumped; + u64 destination_address; +}; + +/* ibm,configure-kernel-dump header. */ +struct fadump_section_header { + u32 dump_format_version; + u16 dump_num_sections; + u16 dump_status_flag; + u32 offset_first_dump_section; + + /* Fields for disk dump option. */ + u32 dd_block_size; + u64 dd_block_offset; + u64 dd_num_blocks; + u32 dd_offset_disk_path; + + /* Maximum time allowed to prevent an automatic dump-reboot. */ + u32 max_time_auto; +}; + +/* + * Firmware Assisted dump memory structure. This structure is required for + * registering future kernel dump with power firmware through rtas call. + * + * No disk dump option. Hence disk dump path string section is not included. + */ +struct fadump_mem_struct { + struct fadump_section_header header; + + /* Kernel dump sections */ + struct fadump_section cpu_state_data; + struct fadump_section hpte_region; + struct fadump_section rmr_region; +}; + +/* Firmware-assisted dump configuration details. */ +struct fw_dump { + unsigned long cpu_state_data_size; + unsigned long hpte_region_size; + unsigned long boot_memory_size; + unsigned long reserve_dump_area_start; + unsigned long reserve_dump_area_size; + /* cmd line option during boot */ + unsigned long reserve_bootvar; + + unsigned long fadumphdr_addr; + unsigned long cpu_notes_buf; + unsigned long cpu_notes_buf_size; + + int ibm_configure_kernel_dump; + + unsigned long fadump_enabled:1; + unsigned long fadump_supported:1; + unsigned long dump_active:1; + unsigned long dump_registered:1; +}; + +/* + * Copy the ascii values for first 8 characters from a string into u64 + * variable at their respective indexes. + * e.g. + * The string "FADMPINF" will be converted into 0x4641444d50494e46 + */ +static inline u64 str_to_u64(const char *str) +{ + u64 val = 0; + int i; + + for (i = 0; i < sizeof(val); i++) + val = (*str) ? (val << 8) | *str++ : val << 8; + return val; +} +#define STR_TO_HEX(x) str_to_u64(x) +#define REG_ID(x) str_to_u64(x) + +#define FADUMP_CRASH_INFO_MAGIC STR_TO_HEX("FADMPINF") +#define REGSAVE_AREA_MAGIC STR_TO_HEX("REGSAVE") + +/* The firmware-assisted dump format. + * + * The register save area is an area in the partition's memory used to preserve + * the register contents (CPU state data) for the active CPUs during a firmware + * assisted dump. The dump format contains register save area header followed + * by register entries. Each list of registers for a CPU starts with + * "CPUSTRT" and ends with "CPUEND". + */ + +/* Register save area header. */ +struct fadump_reg_save_area_header { + u64 magic_number; + u32 version; + u32 num_cpu_offset; +}; + +/* Register entry. */ +struct fadump_reg_entry { + u64 reg_id; + u64 reg_value; +}; + +/* fadump crash info structure */ +struct fadump_crash_info_header { + u64 magic_number; + u64 elfcorehdr_addr; + u32 crashing_cpu; + struct pt_regs regs; + struct cpumask cpu_online_mask; +}; + +/* Crash memory ranges */ +#define INIT_CRASHMEM_RANGES (MAX_LMB_REGIONS + 2) + +struct fad_crash_memory_ranges { + unsigned long long base; + unsigned long long size; +}; + +extern int early_init_dt_scan_fw_dump(unsigned long node, + const char *uname, int depth, void *data); +extern int fadump_reserve_mem(void); +extern int setup_fadump(void); +extern int is_fadump_active(void); +extern void crash_fadump(struct pt_regs *, const char *); +extern void fadump_cleanup(void); + +extern void vmcore_cleanup(void); +#else /* CONFIG_FA_DUMP */ +static inline int is_fadump_active(void) { return 0; } +static inline void crash_fadump(struct pt_regs *regs, const char *str) { } +#endif +#endif diff -uprN linux-2.6.32/arch/powerpc/include/asm/firmware.h linux-2.6.32.ovz/arch/powerpc/include/asm/firmware.h --- linux-2.6.32/arch/powerpc/include/asm/firmware.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/firmware.h 2015-03-10 13:22:22.000000000 -0700 @@ -46,6 +46,8 @@ #define FW_FEATURE_PS3_LV1 ASM_CONST(0x0000000000800000) #define FW_FEATURE_BEAT ASM_CONST(0x0000000001000000) #define FW_FEATURE_CMO ASM_CONST(0x0000000002000000) +#define FW_FEATURE_VPHN ASM_CONST(0x0000000004000000) +#define FW_FEATURE_XCMO ASM_CONST(0x0000000008000000) #ifndef __ASSEMBLY__ @@ -59,7 +61,7 @@ enum { FW_FEATURE_VIO | FW_FEATURE_RDMA | FW_FEATURE_LLAN | FW_FEATURE_BULK_REMOVE | FW_FEATURE_XDABR | FW_FEATURE_MULTITCE | FW_FEATURE_SPLPAR | FW_FEATURE_LPAR | - FW_FEATURE_CMO, + FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO, FW_FEATURE_PSERIES_ALWAYS = 0, FW_FEATURE_ISERIES_POSSIBLE = FW_FEATURE_ISERIES | FW_FEATURE_LPAR, FW_FEATURE_ISERIES_ALWAYS = FW_FEATURE_ISERIES | FW_FEATURE_LPAR, diff -uprN linux-2.6.32/arch/powerpc/include/asm/hvcall.h linux-2.6.32.ovz/arch/powerpc/include/asm/hvcall.h --- linux-2.6.32/arch/powerpc/include/asm/hvcall.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/hvcall.h 2015-03-10 13:23:53.000000000 -0700 @@ -73,7 +73,27 @@ #define H_MR_CONDITION -43 #define H_NOT_ENOUGH_RESOURCES -44 #define H_R_STATE -45 -#define H_RESCINDEND -46 +#define H_RESCINDED -46 +#define H_P2 -55 +#define H_P3 -56 +#define H_P4 -57 +#define H_P5 -58 +#define H_P6 -59 +#define H_P7 -60 +#define H_P8 -61 +#define H_P9 -62 +#define H_TOO_BIG -64 +#define H_OVERLAP -68 +#define H_INTERRUPT -69 +#define H_BAD_DATA -70 +#define H_NOT_ACTIVE -71 +#define H_SG_LIST -72 +#define H_OP_MODE -73 +#define H_COP_HW -74 +#define H_UNSUPPORTED_FLAG_START -256 +#define H_UNSUPPORTED_FLAG_END -511 +#define H_MULTI_THREADS_ACTIVE -9005 +#define H_OUTSTANDING_COP_OPS -9006 /* Long Busy is a condition that can be returned by the firmware @@ -101,6 +121,7 @@ #define H_ANDCOND (1UL<<(63-33)) #define H_ICACHE_INVALIDATE (1UL<<(63-40)) /* icbi, etc. (ignored for IO pages) */ #define H_ICACHE_SYNCHRONIZE (1UL<<(63-41)) /* dcbst, icbi, etc (ignored for IO pages */ +#define H_COALESCE_CAND (1UL<<(63-42)) /* page is a good candidate for coalescing */ #define H_ZERO_PAGE (1UL<<(63-48)) /* zero the page before mapping (ignored for IO pages) */ #define H_COPY_PAGE (1UL<<(63-49)) #define H_N (1UL<<(63-61)) @@ -215,9 +236,14 @@ #define H_JOIN 0x298 #define H_VASI_STATE 0x2A4 #define H_ENABLE_CRQ 0x2B0 +#define H_GET_EM_PARMS 0x2B8 #define H_SET_MPP 0x2D0 #define H_GET_MPP 0x2D4 -#define MAX_HCALL_OPCODE H_GET_MPP +#define H_HOME_NODE_ASSOCIATIVITY 0x2EC +#define H_RANDOM 0x300 +#define H_COP 0x304 +#define H_GET_MPP_X 0x314 +#define MAX_HCALL_OPCODE H_GET_MPP_X #ifndef __ASSEMBLY__ @@ -292,6 +318,16 @@ struct hvcall_mpp_data { int h_get_mpp(struct hvcall_mpp_data *); +struct hvcall_mpp_x_data { + unsigned long coalesced_bytes; + unsigned long pool_coalesced_bytes; + unsigned long pool_purr_cycles; + unsigned long pool_spurr_cycles; + unsigned long reserved[3]; +}; + +int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data); + #ifdef CONFIG_PPC_PSERIES extern int CMO_PrPSP; extern int CMO_SecPSP; diff -uprN linux-2.6.32/arch/powerpc/include/asm/hw_irq.h linux-2.6.32.ovz/arch/powerpc/include/asm/hw_irq.h --- linux-2.6.32/arch/powerpc/include/asm/hw_irq.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/hw_irq.h 2015-03-10 13:21:27.000000000 -0700 @@ -135,43 +135,5 @@ static inline int irqs_disabled_flags(un */ struct irq_chip; -#ifdef CONFIG_PERF_EVENTS - -#ifdef CONFIG_PPC64 -static inline unsigned long test_perf_event_pending(void) -{ - unsigned long x; - - asm volatile("lbz %0,%1(13)" - : "=r" (x) - : "i" (offsetof(struct paca_struct, perf_event_pending))); - return x; -} - -static inline void set_perf_event_pending(void) -{ - asm volatile("stb %0,%1(13)" : : - "r" (1), - "i" (offsetof(struct paca_struct, perf_event_pending))); -} - -static inline void clear_perf_event_pending(void) -{ - asm volatile("stb %0,%1(13)" : : - "r" (0), - "i" (offsetof(struct paca_struct, perf_event_pending))); -} -#endif /* CONFIG_PPC64 */ - -#else /* CONFIG_PERF_EVENTS */ - -static inline unsigned long test_perf_event_pending(void) -{ - return 0; -} - -static inline void clear_perf_event_pending(void) {} -#endif /* CONFIG_PERF_EVENTS */ - #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HW_IRQ_H */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/iommu.h linux-2.6.32.ovz/arch/powerpc/include/asm/iommu.h --- linux-2.6.32/arch/powerpc/include/asm/iommu.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/iommu.h 2015-03-10 13:23:58.000000000 -0700 @@ -53,6 +53,16 @@ static __inline__ __attribute_const__ in */ #define IOMAP_MAX_ORDER 13 +#define IOMMU_POOL_HASHBITS 2 +#define IOMMU_NR_POOLS (1 << IOMMU_POOL_HASHBITS) + +struct iommu_pool { + unsigned long start; + unsigned long end; + unsigned long hint; + spinlock_t lock; +} ____cacheline_aligned_in_smp; + struct iommu_table { unsigned long it_busno; /* Bus number this table belongs to */ unsigned long it_size; /* Size of iommu table in entries */ @@ -61,10 +71,10 @@ struct iommu_table { unsigned long it_index; /* which iommu table this is */ unsigned long it_type; /* type: PCI or Virtual Bus */ unsigned long it_blocksize; /* Entries in each block (cacheline) */ - unsigned long it_hint; /* Hint for next alloc */ - unsigned long it_largehint; /* Hint for large allocs */ - unsigned long it_halfpoint; /* Breaking point for small/large allocs */ - spinlock_t it_lock; /* Protects it_map */ + unsigned long poolsize; + unsigned long nr_pools; + struct iommu_pool large_pool; + struct iommu_pool pools[IOMMU_NR_POOLS]; unsigned long *it_map; /* A simple allocation bitmap for now */ }; diff -uprN linux-2.6.32/arch/powerpc/include/asm/Kbuild linux-2.6.32.ovz/arch/powerpc/include/asm/Kbuild --- linux-2.6.32/arch/powerpc/include/asm/Kbuild 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/Kbuild 2015-03-10 13:24:09.000000000 -0700 @@ -22,6 +22,8 @@ header-y += sigcontext.h header-y += statfs.h header-y += ps3fb.h +generic-y += trace_clock.h + unifdef-y += bootx.h unifdef-y += byteorder.h unifdef-y += cputable.h diff -uprN linux-2.6.32/arch/powerpc/include/asm/kdump.h linux-2.6.32.ovz/arch/powerpc/include/asm/kdump.h --- linux-2.6.32/arch/powerpc/include/asm/kdump.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/kdump.h 2015-03-10 13:21:35.000000000 -0700 @@ -3,8 +3,17 @@ #include -/* Kdump kernel runs at 32 MB, change at your peril. */ +/* + * If CONFIG_RELOCATABLE is enabled we can place the kdump kernel anywhere. + * To keep enough space in the RMO for the first stage kernel on 64bit, we + * place it at 64MB. If CONFIG_RELOCATABLE is not enabled we must place + * the second stage at 32MB. + */ +#if defined(CONFIG_RELOCATABLE) && defined(CONFIG_PPC64) +#define KDUMP_KERNELBASE 0x4000000 +#else #define KDUMP_KERNELBASE 0x2000000 +#endif /* How many bytes to reserve at zero for kdump. The reserve limit should * be greater or equal to the trampoline's end address. diff -uprN linux-2.6.32/arch/powerpc/include/asm/kexec.h linux-2.6.32.ovz/arch/powerpc/include/asm/kexec.h --- linux-2.6.32/arch/powerpc/include/asm/kexec.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/kexec.h 2015-03-10 13:22:20.000000000 -0700 @@ -31,6 +31,10 @@ #define KEXEC_ARCH KEXEC_ARCH_PPC #endif +#define KEXEC_STATE_NONE 0 +#define KEXEC_STATE_IRQS_OFF 1 +#define KEXEC_STATE_REAL_MODE 2 + #ifndef __ASSEMBLY__ #include #include @@ -39,6 +43,34 @@ typedef void (*crash_shutdown_t)(void); #ifdef CONFIG_KEXEC +#ifdef CONFIG_KEXEC_AUTO_RESERVE + +#if PAGE_SHIFT==12 +#ifndef KEXEC_AUTO_THRESHOLD +#define KEXEC_AUTO_THRESHOLD (1ULL<<31) /* 2G */ +#endif +#else +#ifndef KEXEC_AUTO_THRESHOLD +#define KEXEC_AUTO_THRESHOLD (1ULL<<33) /* 8G */ +#endif +#endif /*PAGE_SHIFT == 12 */ + +#ifndef arch_default_crash_base +extern +unsigned long long __init arch_default_crash_base(void); +#define arch_default_crash_base arch_default_crash_base +#endif + +#ifndef arch_default_crash_size +extern +unsigned long long __init arch_default_crash_size(unsigned long long); +#define arch_default_crash_size arch_default_crash_size +#endif + +#endif + +#include + /* * This function is responsible for capturing register states if coming * via panic or invoking dump using sysrq-trigger. diff -uprN linux-2.6.32/arch/powerpc/include/asm/kvm_para.h linux-2.6.32.ovz/arch/powerpc/include/asm/kvm_para.h --- linux-2.6.32/arch/powerpc/include/asm/kvm_para.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/kvm_para.h 2015-03-10 13:23:40.000000000 -0700 @@ -32,6 +32,11 @@ static inline unsigned int kvm_arch_para return 0; } +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} + #endif /* __KERNEL__ */ #endif /* __POWERPC_KVM_PARA_H__ */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/local64.h linux-2.6.32.ovz/arch/powerpc/include/asm/local64.h --- linux-2.6.32/arch/powerpc/include/asm/local64.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/local64.h 2015-03-10 13:22:06.000000000 -0700 @@ -0,0 +1 @@ +#include diff -uprN linux-2.6.32/arch/powerpc/include/asm/local.h linux-2.6.32.ovz/arch/powerpc/include/asm/local.h --- linux-2.6.32/arch/powerpc/include/asm/local.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/local.h 2015-03-10 13:21:31.000000000 -0700 @@ -4,6 +4,8 @@ #include #include +#define ARCH_USES_RELOC_ENTRIES + typedef struct { atomic_long_t a; @@ -24,7 +26,7 @@ static __inline__ long local_add_return( long t; __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%2 # local_add_return\n\ +"1:" PPC_LLARX(%0,0,%2,0) " # local_add_return\n\ add %0,%1,%0\n" PPC405_ERR77(0,%2) PPC_STLCX "%0,0,%2 \n\ @@ -43,7 +45,7 @@ static __inline__ long local_sub_return( long t; __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%2 # local_sub_return\n\ +"1:" PPC_LLARX(%0,0,%2,0) " # local_sub_return\n\ subf %0,%1,%0\n" PPC405_ERR77(0,%2) PPC_STLCX "%0,0,%2 \n\ @@ -60,7 +62,7 @@ static __inline__ long local_inc_return( long t; __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%1 # local_inc_return\n\ +"1:" PPC_LLARX(%0,0,%1,0) " # local_inc_return\n\ addic %0,%0,1\n" PPC405_ERR77(0,%1) PPC_STLCX "%0,0,%1 \n\ @@ -87,7 +89,7 @@ static __inline__ long local_dec_return( long t; __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%1 # local_dec_return\n\ +"1:" PPC_LLARX(%0,0,%1,0) " # local_dec_return\n\ addic %0,%0,-1\n" PPC405_ERR77(0,%1) PPC_STLCX "%0,0,%1\n\ @@ -117,7 +119,7 @@ static __inline__ int local_add_unless(l long t; __asm__ __volatile__ ( -"1:" PPC_LLARX "%0,0,%1 # local_add_unless\n\ +"1:" PPC_LLARX(%0,0,%1,0) " # local_add_unless\n\ cmpw 0,%0,%3 \n\ beq- 2f \n\ add %0,%2,%0 \n" @@ -147,7 +149,7 @@ static __inline__ long local_dec_if_posi long t; __asm__ __volatile__( -"1:" PPC_LLARX "%0,0,%1 # local_dec_if_positive\n\ +"1:" PPC_LLARX(%0,0,%1,0) " # local_dec_if_positive\n\ cmpwi %0,1\n\ addi %0,%0,-1\n\ blt- 2f\n" diff -uprN linux-2.6.32/arch/powerpc/include/asm/lppaca.h linux-2.6.32.ovz/arch/powerpc/include/asm/lppaca.h --- linux-2.6.32/arch/powerpc/include/asm/lppaca.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/lppaca.h 2015-03-10 13:22:11.000000000 -0700 @@ -62,7 +62,10 @@ struct lppaca { volatile u32 dyn_pir; // Dynamic ProcIdReg value x20-x23 u32 dsei_data; // DSEI data x24-x27 u64 sprg3; // SPRG3 value x28-x2F - u8 reserved3[80]; // Reserved x30-x7F + u8 reserved3[40]; // Reserved x30-x57 + volatile u8 vphn_assoc_counts[8]; // Virtual processor home node + // associativity change counters x58-x5F + u8 reserved4[32]; // Reserved x60-x7F //============================================================================= // CACHE_LINE_2 0x0080 - 0x00FF Contains local read-write data @@ -100,7 +103,14 @@ struct lppaca { // Used to pass parms from the OS to PLIC for SetAsrAndRfid u64 saved_gpr3; // Saved GPR3 x20-x27 u64 saved_gpr4; // Saved GPR4 x28-x2F - u64 saved_gpr5; // Saved GPR5 x30-x37 + union { + u64 saved_gpr5; /* Saved GPR5 x30-x37 */ + struct { + u8 cede_latency_hint; /* x30 */ + u8 reserved[7]; /* x31-x36 */ + } fields; + } gpr5_dword; + u8 dtl_enable_mask; // Dispatch Trace Log mask x38-x38 u8 donate_dedicated_cpu; // Donate dedicated CPU cycles x39-x39 @@ -163,6 +173,33 @@ struct slb_shadow { extern struct slb_shadow slb_shadow[]; +/* + * Layout of entries in the hypervisor's dispatch trace log buffer. + */ +struct dtl_entry { + u8 dispatch_reason; + u8 preempt_reason; + u16 processor_id; + u32 enqueue_to_dispatch_time; + u32 ready_to_enqueue_time; + u32 waiting_to_ready_time; + u64 timebase; + u64 fault_addr; + u64 srr0; + u64 srr1; +}; + +#define DISPATCH_LOG_BYTES 4096 /* bytes per cpu */ +#define N_DISPATCH_LOG (DISPATCH_LOG_BYTES / sizeof(struct dtl_entry)) + +/* + * When CONFIG_VIRT_CPU_ACCOUNTING = y, the cpu accounting code controls + * reading from the dispatch trace log. If other code wants to consume + * DTL entries, it can set this pointer to a function that will get + * called once for each DTL entry that gets processed. + */ +extern void (*dtl_consumer)(struct dtl_entry *entry, u64 index); + #endif /* CONFIG_PPC_BOOK3S */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_LPPACA_H */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/machdep.h linux-2.6.32.ovz/arch/powerpc/include/asm/machdep.h --- linux-2.6.32/arch/powerpc/include/asm/machdep.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/machdep.h 2015-03-10 13:22:05.000000000 -0700 @@ -102,6 +102,9 @@ struct machdep_calls { void (*pci_dma_dev_setup)(struct pci_dev *dev); void (*pci_dma_bus_setup)(struct pci_bus *bus); + /* Platform set_dma_mask override */ + int (*dma_set_mask)(struct device *dev, u64 dma_mask); + int (*probe)(void); void (*setup_arch)(void); /* Optional, may be NULL */ void (*init_early)(void); @@ -266,6 +269,11 @@ struct machdep_calls { void (*suspend_disable_irqs)(void); void (*suspend_enable_irqs)(void); #endif + +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE + ssize_t (*cpu_probe)(const char *, size_t); + ssize_t (*cpu_release)(const char *, size_t); +#endif }; extern void e500_idle(void); diff -uprN linux-2.6.32/arch/powerpc/include/asm/mmu-hash64.h linux-2.6.32.ovz/arch/powerpc/include/asm/mmu-hash64.h --- linux-2.6.32/arch/powerpc/include/asm/mmu-hash64.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/mmu-hash64.h 2015-03-10 13:21:41.000000000 -0700 @@ -257,7 +257,9 @@ extern int hash_page(unsigned long ea, u extern int hash_huge_page(struct mm_struct *mm, unsigned long access, unsigned long ea, unsigned long vsid, int local, unsigned long trap); - +extern void hash_failure_debug(unsigned long ea, unsigned long access, + unsigned long vsid, unsigned long trap, + int ssize, int psize, unsigned long pte); extern int htab_bolt_mapping(unsigned long vstart, unsigned long vend, unsigned long pstart, unsigned long prot, int psize, int ssize); diff -uprN linux-2.6.32/arch/powerpc/include/asm/mmzone.h linux-2.6.32.ovz/arch/powerpc/include/asm/mmzone.h --- linux-2.6.32/arch/powerpc/include/asm/mmzone.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/mmzone.h 2015-03-10 13:22:35.000000000 -0700 @@ -33,15 +33,13 @@ extern int numa_cpu_lookup_table[]; extern cpumask_t numa_cpumask_lookup_table[]; #ifdef CONFIG_MEMORY_HOTPLUG extern unsigned long max_pfn; +u64 memory_hotplug_max(void); +#else +#define memory_hotplug_max() lmb_end_of_DRAM() #endif -/* - * Following are macros that each numa implmentation must define. - */ - -#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) -#define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) - +#else +#define memory_hotplug_max() lmb_end_of_DRAM() #endif /* CONFIG_NEED_MULTIPLE_NODES */ #endif /* __KERNEL__ */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/module.h linux-2.6.32.ovz/arch/powerpc/include/asm/module.h --- linux-2.6.32/arch/powerpc/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/module.h 2015-03-10 13:21:12.000000000 -0700 @@ -60,16 +60,26 @@ struct mod_arch_specific { */ #ifdef __powerpc64__ +# define MODULES_ARE_ELF64 # define Elf_Shdr Elf64_Shdr # define Elf_Sym Elf64_Sym # define Elf_Ehdr Elf64_Ehdr +# define Elf_Rel Elf64_Rel +# define Elf_Rela Elf64_Rela +# define ELF_R_TYPE(X) ELF64_R_TYPE(X) +# define ELF_R_SYM(X) ELF64_R_SYM(X) # ifdef MODULE asm(".section .stubs,\"ax\",@nobits; .align 3; .previous"); # endif #else +# define MODULES_ARE_ELF32 # define Elf_Shdr Elf32_Shdr # define Elf_Sym Elf32_Sym # define Elf_Ehdr Elf32_Ehdr +# define Elf_Rel Elf32_Rel +# define Elf_Rela Elf32_Rela +# define ELF_R_TYPE(X) ELF32_R_TYPE(X) +# define ELF_R_SYM(X) ELF32_R_SYM(X) # ifdef MODULE asm(".section .plt,\"ax\",@nobits; .align 3; .previous"); asm(".section .init.plt,\"ax\",@nobits; .align 3; .previous"); @@ -87,5 +97,10 @@ struct exception_table_entry; void sort_ex_table(struct exception_table_entry *start, struct exception_table_entry *finish); +#ifdef CONFIG_MODVERSIONS +#define ARCH_RELOCATES_KCRCTAB + +extern const unsigned long reloc_start[]; +#endif #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_MODULE_H */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/mutex.h linux-2.6.32.ovz/arch/powerpc/include/asm/mutex.h --- linux-2.6.32/arch/powerpc/include/asm/mutex.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/mutex.h 2015-03-10 13:24:10.000000000 -0700 @@ -82,17 +82,15 @@ __mutex_fastpath_lock(atomic_t *count, v * __mutex_fastpath_lock_retval - try to take the lock by moving the count * from 1 to a 0 value * @count: pointer of type atomic_t - * @fail_fn: function to call if the original value was not 1 * - * Change the count from 1 to a value lower than 1, and call if - * it wasn't 1 originally. This function returns 0 if the fastpath succeeds, - * or anything the slow path function returns. + * Change the count from 1 to a value lower than 1. This function returns 0 + * if the fastpath succeeds, or -1 otherwise. */ static inline int -__mutex_fastpath_lock_retval(atomic_t *count, int (*fail_fn)(atomic_t *)) +__mutex_fastpath_lock_retval(atomic_t *count) { if (unlikely(__mutex_dec_return_lock(count) < 0)) - return fail_fn(count); + return -1; return 0; } diff -uprN linux-2.6.32/arch/powerpc/include/asm/nvram.h linux-2.6.32.ovz/arch/powerpc/include/asm/nvram.h --- linux-2.6.32/arch/powerpc/include/asm/nvram.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/nvram.h 2015-03-10 13:22:27.000000000 -0700 @@ -12,6 +12,7 @@ #include +/* Max bytes to read/write in one go */ #define NVRW_CNT 0x20 #define NVRAM_HEADER_LEN 16 /* sizeof(struct nvram_header) */ #define NVRAM_BLOCK_LEN 16 @@ -35,6 +36,7 @@ #define MOTO_RTC_CONTROLA 0x1FF8 #define MOTO_RTC_CONTROLB 0x1FF9 +/* Signatures for nvram partitions */ #define NVRAM_SIG_SP 0x02 /* support processor */ #define NVRAM_SIG_OF 0x50 /* open firmware config */ #define NVRAM_SIG_FW 0x51 /* general firmware */ @@ -54,6 +56,7 @@ struct nvram_header { unsigned char signature; unsigned char checksum; unsigned short length; + /* Terminating null required only for names < 12 chars. */ char name[12]; }; @@ -68,14 +71,15 @@ struct nvram_partition { }; +#ifdef CONFIG_PPC_PSERIES extern int nvram_write_error_log(char * buff, int length, unsigned int err_type, unsigned int err_seq); extern int nvram_read_error_log(char * buff, int length, unsigned int * err_type, unsigned int *err_seq); extern int nvram_clear_error_log(void); -extern struct nvram_partition *nvram_find_partition(int sig, const char *name); extern int pSeries_nvram_init(void); +#endif #ifdef CONFIG_MMIO_NVRAM extern int mmio_nvram_init(void); @@ -86,6 +90,20 @@ static inline int mmio_nvram_init(void) } #endif +/* + * This function is no longer used internally, but it's exported, so we + * retain it to preserve the kABI. + */ +extern struct nvram_partition *nvram_find_partition(int sig, const char *name); + +extern int __init nvram_scan_partitions(void); +extern loff_t nvram_create_partition(const char *name, int sig, + int req_size, int min_size); +extern int nvram_remove_partition(const char *name, int sig, + const char *const exceptions[]); +extern int nvram_get_partition_size(loff_t data_index); +extern loff_t nvram_find_partition2(const char *name, int sig, int *out_size); + #endif /* __KERNEL__ */ /* PowerMac specific nvram stuffs */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/paca.h linux-2.6.32.ovz/arch/powerpc/include/asm/paca.h --- linux-2.6.32/arch/powerpc/include/asm/paca.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/paca.h 2015-03-10 13:23:07.000000000 -0700 @@ -78,6 +78,8 @@ struct paca_struct { /* this becomes non-zero. */ #ifdef CONFIG_PPC_STD_MMU_64 struct slb_shadow *slb_shadow_ptr; + struct dtl_entry *dispatch_log; + struct dtl_entry *dispatch_log_end; /* * Now, starting in cacheline 2, the exception save areas @@ -122,13 +124,23 @@ struct paca_struct { u8 soft_enabled; /* irq soft-enable flag */ u8 hard_enabled; /* set if irqs are enabled in MSR */ u8 io_sync; /* writel() needs spin_unlock sync */ +#ifndef __GENKSYMS__ + u8 irq_work_pending; /* IRQ_WORK interrupt while soft-disable */ +#else u8 perf_event_pending; /* PM interrupt while soft-disabled */ +#endif /* Stuff for accurate time accounting */ u64 user_time; /* accumulated usermode TB ticks */ u64 system_time; /* accumulated system TB ticks */ - u64 startpurr; /* PURR/TB value snapshot */ + u64 user_time_scaled; /* accumulated usermode SPURR ticks */ + u64 starttime; /* TB value snapshot */ + u64 starttime_user; /* TB value on exit to usermode */ u64 startspurr; /* SPURR value snapshot */ + u64 utime_sspurr; /* ->user_time when ->startspurr set */ + u64 stolen_time; /* TB ticks taken by hypervisor */ + u64 dtl_ridx; /* read index in dispatch log */ + struct dtl_entry *dtl_curr; /* pointer corresponding to dtl_ridx */ }; extern struct paca_struct paca[]; diff -uprN linux-2.6.32/arch/powerpc/include/asm/page_64.h linux-2.6.32.ovz/arch/powerpc/include/asm/page_64.h --- linux-2.6.32/arch/powerpc/include/asm/page_64.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/page_64.h 2015-03-10 13:23:55.000000000 -0700 @@ -59,24 +59,7 @@ static __inline__ void clear_page(void * : "ctr", "memory"); } -extern void copy_4K_page(void *to, void *from); - -#ifdef CONFIG_PPC_64K_PAGES -static inline void copy_page(void *to, void *from) -{ - unsigned int i; - for (i=0; i < (1 << (PAGE_SHIFT - 12)); i++) { - copy_4K_page(to, from); - to += 4096; - from += 4096; - } -} -#else /* CONFIG_PPC_64K_PAGES */ -static inline void copy_page(void *to, void *from) -{ - copy_4K_page(to, from); -} -#endif /* CONFIG_PPC_64K_PAGES */ +extern void copy_page(void *to, void *from); /* Log 2 of page table size */ extern u64 ppc64_pft_size; diff -uprN linux-2.6.32/arch/powerpc/include/asm/page.h linux-2.6.32.ovz/arch/powerpc/include/asm/page.h --- linux-2.6.32/arch/powerpc/include/asm/page.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/page.h 2015-03-10 13:23:09.000000000 -0700 @@ -234,6 +234,7 @@ extern void clear_user_page(void *page, extern void copy_user_page(void *to, void *from, unsigned long vaddr, struct page *p); extern int page_is_ram(unsigned long pfn); +extern int devmem_is_allowed(unsigned long pfn); #ifdef CONFIG_PPC_SMLPAR void arch_free_page(struct page *page, int order); diff -uprN linux-2.6.32/arch/powerpc/include/asm/pci-bridge.h linux-2.6.32.ovz/arch/powerpc/include/asm/pci-bridge.h --- linux-2.6.32/arch/powerpc/include/asm/pci-bridge.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/pci-bridge.h 2015-03-10 13:23:53.000000000 -0700 @@ -10,58 +10,10 @@ #include #include #include +#include struct device_node; -enum { - /* Force re-assigning all resources (ignore firmware - * setup completely) - */ - PPC_PCI_REASSIGN_ALL_RSRC = 0x00000001, - - /* Re-assign all bus numbers */ - PPC_PCI_REASSIGN_ALL_BUS = 0x00000002, - - /* Do not try to assign, just use existing setup */ - PPC_PCI_PROBE_ONLY = 0x00000004, - - /* Don't bother with ISA alignment unless the bridge has - * ISA forwarding enabled - */ - PPC_PCI_CAN_SKIP_ISA_ALIGN = 0x00000008, - - /* Enable domain numbers in /proc */ - PPC_PCI_ENABLE_PROC_DOMAINS = 0x00000010, - /* ... except for domain 0 */ - PPC_PCI_COMPAT_DOMAIN_0 = 0x00000020, -}; -#ifdef CONFIG_PCI -extern unsigned int ppc_pci_flags; - -static inline void ppc_pci_set_flags(int flags) -{ - ppc_pci_flags = flags; -} - -static inline void ppc_pci_add_flags(int flags) -{ - ppc_pci_flags |= flags; -} - -static inline int ppc_pci_has_flag(int flag) -{ - return (ppc_pci_flags & flag); -} -#else -static inline void ppc_pci_set_flags(int flags) { } -static inline void ppc_pci_add_flags(int flags) { } -static inline int ppc_pci_has_flag(int flag) -{ - return 0; -} -#endif - - /* * Structure of a PCI controller (host bridge) */ @@ -300,7 +252,6 @@ extern void pci_process_bridge_OF_ranges /* Allocate & free a PCI host bridge structure */ extern struct pci_controller *pcibios_alloc_controller(struct device_node *dev); extern void pcibios_free_controller(struct pci_controller *phb); -extern void pcibios_setup_phb_resources(struct pci_controller *hose); #ifdef CONFIG_PCI extern unsigned long pci_address_to_pio(phys_addr_t address); diff -uprN linux-2.6.32/arch/powerpc/include/asm/pci.h linux-2.6.32.ovz/arch/powerpc/include/asm/pci.h --- linux-2.6.32/arch/powerpc/include/asm/pci.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/pci.h 2015-03-10 13:23:53.000000000 -0700 @@ -44,7 +44,7 @@ struct pci_dev; * bus numbers (don't do that on ppc64 yet !) */ #define pcibios_assign_all_busses() \ - (ppc_pci_has_flag(PPC_PCI_REASSIGN_ALL_BUS)) + (pci_has_flag(PCI_REASSIGN_ALL_BUS)) static inline void pcibios_set_master(struct pci_dev *dev) { @@ -141,37 +141,7 @@ extern int pci_mmap_legacy_page_range(st #define HAVE_PCI_LEGACY 1 -#if defined(CONFIG_PPC64) || defined(CONFIG_NOT_COHERENT_CACHE) -/* - * For 64-bit kernels, pci_unmap_{single,page} is not a nop. - * For 32-bit non-coherent kernels, pci_dma_sync_single_for_cpu() and - * so on are not nops. - * and thus... - */ -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ - dma_addr_t ADDR_NAME; -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \ - __u32 LEN_NAME; -#define pci_unmap_addr(PTR, ADDR_NAME) \ - ((PTR)->ADDR_NAME) -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ - (((PTR)->ADDR_NAME) = (VAL)) -#define pci_unmap_len(PTR, LEN_NAME) \ - ((PTR)->LEN_NAME) -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ - (((PTR)->LEN_NAME) = (VAL)) - -#else /* 32-bit && coherent */ - -/* pci_unmap_{page,single} is a nop so... */ -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) -#define pci_unmap_addr(PTR, ADDR_NAME) (0) -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) -#define pci_unmap_len(PTR, LEN_NAME) (0) -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) - -#endif /* CONFIG_PPC64 || CONFIG_NOT_COHERENT_CACHE */ +#include #ifdef CONFIG_PPC64 @@ -191,14 +161,6 @@ extern int pci_mmap_legacy_page_range(st #endif /* CONFIG_PPC64 */ -extern void pcibios_resource_to_bus(struct pci_dev *dev, - struct pci_bus_region *region, - struct resource *res); - -extern void pcibios_bus_to_resource(struct pci_dev *dev, - struct resource *res, - struct pci_bus_region *region); - extern void pcibios_claim_one_bus(struct pci_bus *b); extern void pcibios_finish_adding_to_bus(struct pci_bus *bus); diff -uprN linux-2.6.32/arch/powerpc/include/asm/perf_event_fsl_emb.h linux-2.6.32.ovz/arch/powerpc/include/asm/perf_event_fsl_emb.h --- linux-2.6.32/arch/powerpc/include/asm/perf_event_fsl_emb.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/perf_event_fsl_emb.h 2015-03-10 13:22:06.000000000 -0700 @@ -0,0 +1,50 @@ +/* + * Performance event support - Freescale embedded specific definitions. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * Copyright 2010 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#define MAX_HWEVENTS 4 + +/* event flags */ +#define FSL_EMB_EVENT_VALID 1 +#define FSL_EMB_EVENT_RESTRICTED 2 + +/* upper half of event flags is PMLCb */ +#define FSL_EMB_EVENT_THRESHMUL 0x0000070000000000ULL +#define FSL_EMB_EVENT_THRESH 0x0000003f00000000ULL + +struct fsl_emb_pmu { + const char *name; + int n_counter; /* total number of counters */ + + /* + * The number of contiguous counters starting at zero that + * can hold restricted events, or zero if there are no + * restricted events. + * + * This isn't a very flexible method of expressing constraints, + * but it's very simple and is adequate for existing chips. + */ + int n_restricted; + + /* Returns event flags and PMLCb (FSL_EMB_EVENT_*) */ + u64 (*xlate_event)(u64 event_id); + + int n_generic; + int *generic_events; + int (*cache_events)[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +}; + +int register_fsl_emb_pmu(struct fsl_emb_pmu *); diff -uprN linux-2.6.32/arch/powerpc/include/asm/perf_event.h linux-2.6.32.ovz/arch/powerpc/include/asm/perf_event.h --- linux-2.6.32/arch/powerpc/include/asm/perf_event.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/perf_event.h 2015-03-10 13:23:24.000000000 -0700 @@ -1,110 +1,40 @@ /* - * Performance event support - PowerPC-specific definitions. + * Performance event support - hardware-specific disambiguation * - * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * For now this is a compile-time decision, but eventually it should be + * runtime. This would allow multiplatform perf event support for e300 (fsl + * embedded perf counters) plus server/classic, and would accommodate + * devices other than the core which provide their own performance counters. + * + * Copyright 2010 Freescale Semiconductor, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ -#include - -#include -#define MAX_HWEVENTS 8 -#define MAX_EVENT_ALTERNATIVES 8 -#define MAX_LIMITED_HWCOUNTERS 2 - -/* - * This struct provides the constants and functions needed to - * describe the PMU on a particular POWER-family CPU. - */ -struct power_pmu { - const char *name; - int n_counter; - int max_alternatives; - unsigned long add_fields; - unsigned long test_adder; - int (*compute_mmcr)(u64 events[], int n_ev, - unsigned int hwc[], unsigned long mmcr[]); - int (*get_constraint)(u64 event_id, unsigned long *mskp, - unsigned long *valp); - int (*get_alternatives)(u64 event_id, unsigned int flags, - u64 alt[]); - void (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]); - int (*limited_pmc_event)(u64 event_id); - u32 flags; - int n_generic; - int *generic_events; - int (*cache_events)[PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; -}; - -/* - * Values for power_pmu.flags - */ -#define PPMU_LIMITED_PMC5_6 1 /* PMC5/6 have limited function */ -#define PPMU_ALT_SIPR 2 /* uses alternate posn for SIPR/HV */ - -/* - * Values for flags to get_alternatives() - */ -#define PPMU_LIMITED_PMC_OK 1 /* can put this on a limited PMC */ -#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ -#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ - -extern int register_power_pmu(struct power_pmu *); +#ifdef CONFIG_PPC_PERF_CTRS +#include +#endif -struct pt_regs; -extern unsigned long perf_misc_flags(struct pt_regs *regs); -extern unsigned long perf_instruction_pointer(struct pt_regs *regs); +#ifdef CONFIG_FSL_EMB_PERF_EVENT +#include +#endif -#define PERF_EVENT_INDEX_OFFSET 1 +#ifdef CONFIG_PERF_EVENTS +#include +#include /* - * Only override the default definitions in include/linux/perf_event.h - * if we have hardware PMU support. - */ -#ifdef CONFIG_PPC_PERF_CTRS -#define perf_misc_flags(regs) perf_misc_flags(regs) + * Overload regs->result to specify whether we should use the MSR (result + * is zero) or the SIAR (result is non zero). + */ +#define perf_arch_fetch_caller_regs(regs, __ip) \ + do { \ + (regs)->result = 0; \ + (regs)->nip = __ip; \ + (regs)->gpr[1] = *(unsigned long *)__get_SP(); \ + asm volatile("mfmsr %0" : "=r" ((regs)->msr)); \ + } while (0) #endif - -/* - * The power_pmu.get_constraint function returns a 32/64-bit value and - * a 32/64-bit mask that express the constraints between this event_id and - * other events. - * - * The value and mask are divided up into (non-overlapping) bitfields - * of three different types: - * - * Select field: this expresses the constraint that some set of bits - * in MMCR* needs to be set to a specific value for this event_id. For a - * select field, the mask contains 1s in every bit of the field, and - * the value contains a unique value for each possible setting of the - * MMCR* bits. The constraint checking code will ensure that two events - * that set the same field in their masks have the same value in their - * value dwords. - * - * Add field: this expresses the constraint that there can be at most - * N events in a particular class. A field of k bits can be used for - * N <= 2^(k-1) - 1. The mask has the most significant bit of the field - * set (and the other bits 0), and the value has only the least significant - * bit of the field set. In addition, the 'add_fields' and 'test_adder' - * in the struct power_pmu for this processor come into play. The - * add_fields value contains 1 in the LSB of the field, and the - * test_adder contains 2^(k-1) - 1 - N in the field. - * - * NAND field: this expresses the constraint that you may not have events - * in all of a set of classes. (For example, on PPC970, you can't select - * events from the FPU, ISU and IDU simultaneously, although any two are - * possible.) For N classes, the field is N+1 bits wide, and each class - * is assigned one bit from the least-significant N bits. The mask has - * only the most-significant bit set, and the value has only the bit - * for the event_id's class set. The test_adder has the least significant - * bit set in the field. - * - * If an event_id is not subject to the constraint expressed by a particular - * field, then it will have 0 in both the mask and value for that field. - */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/perf_event_server.h linux-2.6.32.ovz/arch/powerpc/include/asm/perf_event_server.h --- linux-2.6.32/arch/powerpc/include/asm/perf_event_server.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/perf_event_server.h 2015-03-10 13:24:00.000000000 -0700 @@ -0,0 +1,113 @@ +/* + * Performance event support - PowerPC classic/server specific definitions. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#define MAX_HWEVENTS 8 +#define MAX_EVENT_ALTERNATIVES 8 +#define MAX_LIMITED_HWCOUNTERS 2 + +/* + * This struct provides the constants and functions needed to + * describe the PMU on a particular POWER-family CPU. + */ +struct power_pmu { + const char *name; + int n_counter; + int max_alternatives; + unsigned long add_fields; + unsigned long test_adder; + int (*compute_mmcr)(u64 events[], int n_ev, + unsigned int hwc[], unsigned long mmcr[]); + int (*get_constraint)(u64 event_id, unsigned long *mskp, + unsigned long *valp); + int (*get_alternatives)(u64 event_id, unsigned int flags, + u64 alt[]); + void (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]); + int (*limited_pmc_event)(u64 event_id); + u32 flags; + int n_generic; + int *generic_events; + int (*cache_events)[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +}; + +/* + * Values for power_pmu.flags + */ +#define PPMU_LIMITED_PMC5_6 1 /* PMC5/6 have limited function */ +#define PPMU_ALT_SIPR 2 /* uses alternate posn for SIPR/HV */ +#define PPMU_NO_SIPR 4 /* no SIPR/HV in MMCRA at all */ +#define PPMU_NO_CONT_SAMPLING 8 /* no continuous sampling */ +#define PPMU_SIAR_VALID 16 /* Processor has SIAR Valid bit */ +#define PPMU_HAS_SSLOT 0x00000020 /* Has sampled slot in MMCRA */ +#define PPMU_HAS_SIER 0x00000040 /* Has SIER */ + +/* + * Values for flags to get_alternatives() + */ +#define PPMU_LIMITED_PMC_OK 1 /* can put this on a limited PMC */ +#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ +#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ + +extern int register_power_pmu(struct power_pmu *); + +struct pt_regs; +extern unsigned long perf_misc_flags(struct pt_regs *regs); +extern unsigned long perf_instruction_pointer(struct pt_regs *regs); + +/* + * Only override the default definitions in include/linux/perf_event.h + * if we have hardware PMU support. + */ +#ifdef CONFIG_PPC_PERF_CTRS +#define perf_misc_flags(regs) perf_misc_flags(regs) +#endif + +/* + * The power_pmu.get_constraint function returns a 32/64-bit value and + * a 32/64-bit mask that express the constraints between this event_id and + * other events. + * + * The value and mask are divided up into (non-overlapping) bitfields + * of three different types: + * + * Select field: this expresses the constraint that some set of bits + * in MMCR* needs to be set to a specific value for this event_id. For a + * select field, the mask contains 1s in every bit of the field, and + * the value contains a unique value for each possible setting of the + * MMCR* bits. The constraint checking code will ensure that two events + * that set the same field in their masks have the same value in their + * value dwords. + * + * Add field: this expresses the constraint that there can be at most + * N events in a particular class. A field of k bits can be used for + * N <= 2^(k-1) - 1. The mask has the most significant bit of the field + * set (and the other bits 0), and the value has only the least significant + * bit of the field set. In addition, the 'add_fields' and 'test_adder' + * in the struct power_pmu for this processor come into play. The + * add_fields value contains 1 in the LSB of the field, and the + * test_adder contains 2^(k-1) - 1 - N in the field. + * + * NAND field: this expresses the constraint that you may not have events + * in all of a set of classes. (For example, on PPC970, you can't select + * events from the FPU, ISU and IDU simultaneously, although any two are + * possible.) For N classes, the field is N+1 bits wide, and each class + * is assigned one bit from the least-significant N bits. The mask has + * only the most-significant bit set, and the value has only the bit + * for the event_id's class set. The test_adder has the least significant + * bit set in the field. + * + * If an event_id is not subject to the constraint expressed by a particular + * field, then it will have 0 in both the mask and value for that field. + */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/pgalloc-64.h linux-2.6.32.ovz/arch/powerpc/include/asm/pgalloc-64.h --- linux-2.6.32/arch/powerpc/include/asm/pgalloc-64.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/pgalloc-64.h 2015-03-10 13:21:20.000000000 -0700 @@ -11,6 +11,12 @@ #include #include +struct vmemmap_backing { + struct vmemmap_backing *list; + unsigned long phys; + unsigned long virt_addr; +}; + #ifndef CONFIG_PPC_SUBPAGE_PROT static inline void subpage_prot_free(pgd_t *pgd) {} #endif diff -uprN linux-2.6.32/arch/powerpc/include/asm/pgtable.h linux-2.6.32.ovz/arch/powerpc/include/asm/pgtable.h --- linux-2.6.32/arch/powerpc/include/asm/pgtable.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/pgtable.h 2015-03-10 13:22:21.000000000 -0700 @@ -170,6 +170,7 @@ extern int ptep_set_access_flags(struct #define pgprot_cached_wthru(prot) (__pgprot((pgprot_val(prot) & ~_PAGE_CACHE_CTL) | \ _PAGE_COHERENT | _PAGE_WRITETHRU)) +#define pgprot_writecombine pgprot_noncached_wc struct file; extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, diff -uprN linux-2.6.32/arch/powerpc/include/asm/phyp_dump.h linux-2.6.32.ovz/arch/powerpc/include/asm/phyp_dump.h --- linux-2.6.32/arch/powerpc/include/asm/phyp_dump.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/phyp_dump.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,47 +0,0 @@ -/* - * Hypervisor-assisted dump - * - * Linas Vepstas, Manish Ahuja 2008 - * Copyright 2008 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _PPC64_PHYP_DUMP_H -#define _PPC64_PHYP_DUMP_H - -#ifdef CONFIG_PHYP_DUMP - -/* The RMR region will be saved for later dumping - * whenever the kernel crashes. Set this to 256MB. */ -#define PHYP_DUMP_RMR_START 0x0 -#define PHYP_DUMP_RMR_END (1UL<<28) - -struct phyp_dump { - /* Memory that is reserved during very early boot. */ - unsigned long init_reserve_start; - unsigned long init_reserve_size; - /* cmd line options during boot */ - unsigned long reserve_bootvar; - unsigned long phyp_dump_at_boot; - /* Check status during boot if dump supported, active & present*/ - unsigned long phyp_dump_configured; - unsigned long phyp_dump_is_active; - /* store cpu & hpte size */ - unsigned long cpu_state_size; - unsigned long hpte_region_size; - /* previous scratch area values */ - unsigned long reserved_scratch_addr; - unsigned long reserved_scratch_size; -}; - -extern struct phyp_dump *phyp_dump_info; - -int early_init_dt_scan_phyp_dump(unsigned long node, - const char *uname, int depth, void *data); - -#endif /* CONFIG_PHYP_DUMP */ -#endif /* _PPC64_PHYP_DUMP_H */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/ppc_asm.h linux-2.6.32.ovz/arch/powerpc/include/asm/ppc_asm.h --- linux-2.6.32/arch/powerpc/include/asm/ppc_asm.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/ppc_asm.h 2015-03-10 13:21:47.000000000 -0700 @@ -9,6 +9,7 @@ #include #include #include +#include #ifndef __ASSEMBLY__ #error __FILE__ should only be used in assembler files @@ -26,17 +27,13 @@ #ifndef CONFIG_VIRT_CPU_ACCOUNTING #define ACCOUNT_CPU_USER_ENTRY(ra, rb) #define ACCOUNT_CPU_USER_EXIT(ra, rb) +#define ACCOUNT_STOLEN_TIME #else #define ACCOUNT_CPU_USER_ENTRY(ra, rb) \ beq 2f; /* if from kernel mode */ \ -BEGIN_FTR_SECTION; \ - mfspr ra,SPRN_PURR; /* get processor util. reg */ \ -END_FTR_SECTION_IFSET(CPU_FTR_PURR); \ -BEGIN_FTR_SECTION; \ - MFTB(ra); /* or get TB if no PURR */ \ -END_FTR_SECTION_IFCLR(CPU_FTR_PURR); \ - ld rb,PACA_STARTPURR(r13); \ - std ra,PACA_STARTPURR(r13); \ + MFTB(ra); /* get timebase */ \ + ld rb,PACA_STARTTIME_USER(r13); \ + std ra,PACA_STARTTIME(r13); \ subf rb,rb,ra; /* subtract start value */ \ ld ra,PACA_USER_TIME(r13); \ add ra,ra,rb; /* add on to user time */ \ @@ -44,19 +41,34 @@ END_FTR_SECTION_IFCLR(CPU_FTR_PURR); 2: #define ACCOUNT_CPU_USER_EXIT(ra, rb) \ -BEGIN_FTR_SECTION; \ - mfspr ra,SPRN_PURR; /* get processor util. reg */ \ -END_FTR_SECTION_IFSET(CPU_FTR_PURR); \ -BEGIN_FTR_SECTION; \ - MFTB(ra); /* or get TB if no PURR */ \ -END_FTR_SECTION_IFCLR(CPU_FTR_PURR); \ - ld rb,PACA_STARTPURR(r13); \ - std ra,PACA_STARTPURR(r13); \ + MFTB(ra); /* get timebase */ \ + ld rb,PACA_STARTTIME(r13); \ + std ra,PACA_STARTTIME_USER(r13); \ subf rb,rb,ra; /* subtract start value */ \ ld ra,PACA_SYSTEM_TIME(r13); \ - add ra,ra,rb; /* add on to user time */ \ - std ra,PACA_SYSTEM_TIME(r13); -#endif + add ra,ra,rb; /* add on to system time */ \ + std ra,PACA_SYSTEM_TIME(r13) + +#ifdef CONFIG_PPC_SPLPAR +#define ACCOUNT_STOLEN_TIME \ +BEGIN_FW_FTR_SECTION; \ + beq 33f; \ + /* from user - see if there are any DTL entries to process */ \ + ld r10,PACALPPACAPTR(r13); /* get ptr to VPA */ \ + ld r11,PACA_DTL_RIDX(r13); /* get log read index */ \ + ld r10,LPPACA_DTLIDX(r10); /* get log write index */ \ + cmpd cr1,r11,r10; \ + beq+ cr1,33f; \ + bl .accumulate_stolen_time; \ +33: \ +END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) + +#else /* CONFIG_PPC_SPLPAR */ +#define ACCOUNT_STOLEN_TIME + +#endif /* CONFIG_PPC_SPLPAR */ + +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ /* * Macros for storing registers into and loading registers from diff -uprN linux-2.6.32/arch/powerpc/include/asm/ppc-opcode.h linux-2.6.32.ovz/arch/powerpc/include/asm/ppc-opcode.h --- linux-2.6.32/arch/powerpc/include/asm/ppc-opcode.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/ppc-opcode.h 2015-03-10 13:21:31.000000000 -0700 @@ -22,8 +22,10 @@ #define PPC_INST_DCBZL 0x7c2007ec #define PPC_INST_ISEL 0x7c00001e #define PPC_INST_ISEL_MASK 0xfc00003e +#define PPC_INST_LDARX 0x7c0000a8 #define PPC_INST_LSWI 0x7c0004aa #define PPC_INST_LSWX 0x7c00042a +#define PPC_INST_LWARX 0x7c000028 #define PPC_INST_LWSYNC 0x7c2004ac #define PPC_INST_LXVD2X 0x7c000698 #define PPC_INST_MCRXR 0x7c000400 @@ -55,15 +57,31 @@ #define __PPC_RA(a) (((a) & 0x1f) << 16) #define __PPC_RB(b) (((b) & 0x1f) << 11) #define __PPC_RS(s) (((s) & 0x1f) << 21) +#define __PPC_RT(s) __PPC_RS(s) #define __PPC_XS(s) ((((s) & 0x1f) << 21) | (((s) & 0x20) >> 5)) #define __PPC_T_TLB(t) (((t) & 0x3) << 21) #define __PPC_WC(w) (((w) & 0x3) << 21) +/* + * Only use the larx hint bit on 64bit CPUs. e500v1/v2 based CPUs will treat a + * larx with EH set as an illegal instruction. + */ +#ifdef CONFIG_PPC64 +#define __PPC_EH(eh) (((eh) & 0x1) << 0) +#else +#define __PPC_EH(eh) 0 +#endif /* Deal with instructions that older assemblers aren't aware of */ #define PPC_DCBAL(a, b) stringify_in_c(.long PPC_INST_DCBAL | \ __PPC_RA(a) | __PPC_RB(b)) #define PPC_DCBZL(a, b) stringify_in_c(.long PPC_INST_DCBZL | \ __PPC_RA(a) | __PPC_RB(b)) +#define PPC_LDARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LDARX | \ + __PPC_RT(t) | __PPC_RA(a) | \ + __PPC_RB(b) | __PPC_EH(eh)) +#define PPC_LWARX(t, a, b, eh) stringify_in_c(.long PPC_INST_LWARX | \ + __PPC_RT(t) | __PPC_RA(a) | \ + __PPC_RB(b) | __PPC_EH(eh)) #define PPC_MSGSND(b) stringify_in_c(.long PPC_INST_MSGSND | \ __PPC_RB(b)) #define PPC_RFCI stringify_in_c(.long PPC_INST_RFCI) diff -uprN linux-2.6.32/arch/powerpc/include/asm/ppc-pci.h linux-2.6.32.ovz/arch/powerpc/include/asm/ppc-pci.h --- linux-2.6.32/arch/powerpc/include/asm/ppc-pci.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/ppc-pci.h 2015-03-10 13:23:53.000000000 -0700 @@ -45,8 +45,6 @@ extern void init_pci_config_tokens (void extern unsigned long get_phb_buid (struct device_node *); extern int rtas_setup_phb(struct pci_controller *phb); -extern unsigned long pci_probe_only; - /* ---- EEH internal-use-only related routines ---- */ #ifdef CONFIG_EEH @@ -137,6 +135,16 @@ struct device_node * find_device_pe(stru void eeh_sysfs_add_device(struct pci_dev *pdev); void eeh_sysfs_remove_device(struct pci_dev *pdev); +static inline const char *eeh_pci_name(struct pci_dev *pdev) +{ + return pdev ? pci_name(pdev) : ""; +} + +static inline const char *eeh_driver_name(struct pci_dev *pdev) +{ + return (pdev && pdev->driver) ? pdev->driver->name : ""; +} + #endif /* CONFIG_EEH */ #else /* CONFIG_PCI */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/pSeries_reconfig.h linux-2.6.32.ovz/arch/powerpc/include/asm/pSeries_reconfig.h --- linux-2.6.32/arch/powerpc/include/asm/pSeries_reconfig.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/pSeries_reconfig.h 2015-03-10 13:24:13.000000000 -0700 @@ -13,16 +13,33 @@ #define PSERIES_RECONFIG_REMOVE 0x0002 #define PSERIES_DRCONF_MEM_ADD 0x0003 #define PSERIES_DRCONF_MEM_REMOVE 0x0004 +#define PSERIES_UPDATE_PROPERTY 0x0005 + +/** + * pSeries_reconfig_notify - Notifier value structure for OFDT property updates + * + * @node: Device tree node which owns the property being updated + * @property: Updated property + */ +struct pSeries_reconfig_prop_update { + struct device_node *node; + struct property *property; +}; #ifdef CONFIG_PPC_PSERIES extern int pSeries_reconfig_notifier_register(struct notifier_block *); extern void pSeries_reconfig_notifier_unregister(struct notifier_block *); +extern struct blocking_notifier_head pSeries_reconfig_chain; +/* Not the best place to put this, will be fixed when we move some + * of the rtas suspend-me stuff to pseries */ +extern void pSeries_coalesce_init(void); #else /* !CONFIG_PPC_PSERIES */ static inline int pSeries_reconfig_notifier_register(struct notifier_block *nb) { return 0; } static inline void pSeries_reconfig_notifier_unregister(struct notifier_block *nb) { } +static inline void pSeries_coalesce_init(void) { } #endif /* CONFIG_PPC_PSERIES */ #endif /* __KERNEL__ */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/ptrace.h linux-2.6.32.ovz/arch/powerpc/include/asm/ptrace.h --- linux-2.6.32/arch/powerpc/include/asm/ptrace.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/ptrace.h 2015-03-10 13:22:23.000000000 -0700 @@ -83,7 +83,20 @@ struct pt_regs { #define instruction_pointer(regs) ((regs)->nip) #define user_stack_pointer(regs) ((regs)->gpr[1]) -#define regs_return_value(regs) ((regs)->gpr[3]) + +static inline int is_syscall_success(struct pt_regs *regs) +{ + return !(regs->ccr & 0x10000000); +} + +static inline long regs_return_value(struct pt_regs *regs) +{ + if (is_syscall_success(regs)) + return regs->gpr[3]; + else + return -regs->gpr[3]; +} + #ifdef CONFIG_SMP extern unsigned long profile_pc(struct pt_regs *regs); @@ -122,8 +135,10 @@ extern int ptrace_put_reg(struct task_st #endif /* ! __powerpc64__ */ #define TRAP(regs) ((regs)->trap & ~0xF) #ifdef __powerpc64__ +#define NV_REG_POISON 0xdeadbeefdeadbeefUL #define CHECK_FULL_REGS(regs) BUG_ON(regs->trap & 1) #else +#define NV_REG_POISON 0xdeadbeef #define CHECK_FULL_REGS(regs) \ do { \ if ((regs)->trap & 1) \ @@ -140,6 +155,8 @@ extern void user_enable_single_step(stru extern void user_enable_block_step(struct task_struct *); extern void user_disable_single_step(struct task_struct *); +#define ARCH_HAS_USER_SINGLE_STEP_INFO + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/reg.h linux-2.6.32.ovz/arch/powerpc/include/asm/reg.h --- linux-2.6.32/arch/powerpc/include/asm/reg.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/reg.h 2015-03-10 13:24:00.000000000 -0700 @@ -413,6 +413,7 @@ #define SPRN_SPRG1 0x111 /* Special Purpose Register General 1 */ #define SPRN_SPRG2 0x112 /* Special Purpose Register General 2 */ #define SPRN_SPRG3 0x113 /* Special Purpose Register General 3 */ +#define SPRN_USPRG3 0x103 /* SPRG3 userspace read */ #define SPRN_SPRG4 0x114 /* Special Purpose Register General 4 */ #define SPRN_SPRG5 0x115 /* Special Purpose Register General 5 */ #define SPRN_SPRG6 0x116 /* Special Purpose Register General 6 */ @@ -489,6 +490,8 @@ #define SPRN_MMCR1 798 #define SPRN_MMCRA 0x312 #define MMCRA_SDSYNC 0x80000000UL /* SDAR synced with SIAR */ +#define MMCRA_SDAR_DCACHE_MISS 0x40000000UL +#define MMCRA_SDAR_ERAT_MISS 0x20000000UL #define MMCRA_SIHV 0x10000000UL /* state of MSR HV when SIAR set */ #define MMCRA_SIPR 0x08000000UL /* state of MSR PR when SIAR set */ #define MMCRA_SLOT 0x07000000UL /* SLOT bits (37-39) */ @@ -499,6 +502,10 @@ #define POWER6_MMCRA_SIPR 0x0000020000000000ULL #define POWER6_MMCRA_THRM 0x00000020UL #define POWER6_MMCRA_OTHER 0x0000000EUL + +#define POWER7P_MMCRA_SIAR_VALID 0x10000000 /* P7+ SIAR contents valid */ +#define POWER7P_MMCRA_SDAR_VALID 0x08000000 /* P7+ SDAR contents valid */ + #define SPRN_PMC1 787 #define SPRN_PMC2 788 #define SPRN_PMC3 789 @@ -509,6 +516,11 @@ #define SPRN_PMC8 794 #define SPRN_SIAR 780 #define SPRN_SDAR 781 +#define SPRN_SIER 784 +#define SIER_SIPR 0x2000000 /* Sampled MSR_PR */ +#define SIER_SIHV 0x1000000 /* Sampled MSR_HV */ +#define SIER_SIAR_VALID 0x0400000 /* SIAR contents valid */ +#define SIER_SDAR_VALID 0x0200000 /* SDAR contents valid */ #define SPRN_PA6T_MMCR0 795 #define PA6T_MMCR0_EN0 0x0000000000000001UL @@ -650,12 +662,12 @@ * 64-bit server: * - SPRG0 unused (reserved for HV on Power4) * - SPRG2 scratch for exception vectors - * - SPRG3 unused (user visible) + * - SPRG3 CPU and NUMA node for VDSO getcpu (user visible) * * 64-bit embedded * - SPRG0 generic exception scratch * - SPRG2 TLB exception stack - * - SPRG3 unused (user visible) + * - SPRG3 CPU and NUMA node for VDSO getcpu (user visible) * - SPRG4 unused (user visible) * - SPRG6 TLB miss scratch (user visible, sorry !) * - SPRG7 critical exception scratch @@ -858,11 +870,13 @@ #define PV_970 0x0039 #define PV_POWER5 0x003A #define PV_POWER5p 0x003B +#define PV_POWER7 0x003F #define PV_970FX 0x003C #define PV_630 0x0040 #define PV_630p 0x0041 #define PV_970MP 0x0044 #define PV_970GX 0x0045 +#define PVR_POWER7p 0x004A #define PV_BE 0x0070 #define PV_PA6T 0x0090 diff -uprN linux-2.6.32/arch/powerpc/include/asm/rtas.h linux-2.6.32.ovz/arch/powerpc/include/asm/rtas.h --- linux-2.6.32/arch/powerpc/include/asm/rtas.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/rtas.h 2015-03-10 13:24:05.000000000 -0700 @@ -63,6 +63,14 @@ struct rtas_t { struct device_node *dev; /* virtual address pointer */ }; +struct rtas_suspend_me_data { + atomic_t working; /* number of cpus accessing this struct */ + int done; + int token; /* ibm,suspend-me */ + int error; + struct completion *complete; /* wait on this until working == 0 */ +}; + /* RTAS event classes */ #define RTAS_INTERNAL_ERROR 0x80000000 /* set bit 0 */ #define RTAS_EPOW_WARNING 0x40000000 /* set bit 1 */ @@ -147,9 +155,85 @@ struct rtas_error_log { unsigned long target:4; /* Target of failed operation */ unsigned long type:8; /* General event or error*/ unsigned long extended_log_length:32; /* length in bytes */ - unsigned char buffer[1]; + unsigned char buffer[1]; /* Start of extended log */ + /* Variable length. */ +}; + +#define RTAS_V6EXT_LOG_FORMAT_EVENT_LOG 14 + +#define RTAS_V6EXT_COMPANY_ID_IBM (('I' << 24) | ('B' << 16) | ('M' << 8)) + +/* RTAS general extended event log, Version 6. The extended log starts + * from "buffer" field of struct rtas_error_log defined above. + */ +struct rtas_ext_event_log_v6 { + /* Byte 0 */ + uint32_t log_valid:1; /* 1:Log valid */ + uint32_t unrecoverable_error:1; /* 1:Unrecoverable error */ + uint32_t recoverable_error:1; /* 1:recoverable (correctable */ + /* or successfully retried) */ + uint32_t degraded_operation:1; /* 1:Unrecoverable err, bypassed*/ + /* - degraded operation (e.g. */ + /* CPU or mem taken off-line) */ + uint32_t predictive_error:1; + uint32_t new_log:1; /* 1:"New" log (Always 1 for */ + /* data returned from RTAS */ + uint32_t big_endian:1; /* 1: Big endian */ + uint32_t :1; /* reserved */ + /* Byte 1 */ + uint32_t :8; /* reserved */ + /* Byte 2 */ + uint32_t powerpc_format:1; /* Set to 1 (indicating log is */ + /* in PowerPC format */ + uint32_t :3; /* reserved */ + uint32_t log_format:4; /* Log format indicator. Define */ + /* format used for byte 12-2047 */ + /* Byte 3 */ + uint32_t :8; /* reserved */ + /* Byte 4-11 */ + uint8_t reserved[8]; /* reserved */ + /* Byte 12-15 */ + uint32_t company_id; /* Company ID of the company */ + /* that defines the format for */ + /* the vendor specific log type */ + /* Byte 16-end of log */ + uint8_t vendor_log[1]; /* Start of vendor specific log */ + /* Variable length. */ }; +/* pSeries event log format */ + +/* Two bytes ASCII section IDs */ +#define PSERIES_ELOG_SECT_ID_PRIV_HDR (('P' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_USER_HDR (('U' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_PRIMARY_SRC (('P' << 8) | 'S') +#define PSERIES_ELOG_SECT_ID_EXTENDED_UH (('E' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_FAILING_MTMS (('M' << 8) | 'T') +#define PSERIES_ELOG_SECT_ID_SECONDARY_SRC (('S' << 8) | 'S') +#define PSERIES_ELOG_SECT_ID_DUMP_LOCATOR (('D' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_FW_ERROR (('S' << 8) | 'W') +#define PSERIES_ELOG_SECT_ID_IMPACT_PART_ID (('L' << 8) | 'P') +#define PSERIES_ELOG_SECT_ID_LOGIC_RESOURCE_ID (('L' << 8) | 'R') +#define PSERIES_ELOG_SECT_ID_HMC_ID (('H' << 8) | 'M') +#define PSERIES_ELOG_SECT_ID_EPOW (('E' << 8) | 'P') +#define PSERIES_ELOG_SECT_ID_IO_EVENT (('I' << 8) | 'E') +#define PSERIES_ELOG_SECT_ID_MANUFACT_INFO (('M' << 8) | 'I') +#define PSERIES_ELOG_SECT_ID_CALL_HOME (('C' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_USER_DEF (('U' << 8) | 'D') + +/* Vendor specific Platform Event Log Format, Version 6, section header */ +struct pseries_errorlog { + uint16_t id; /* 0x00 2-byte ASCII section ID */ + uint16_t length; /* 0x02 Section length in bytes */ + uint8_t version; /* 0x04 Section version */ + uint8_t subtype; /* 0x05 Section subtype */ + uint16_t creator_component; /* 0x06 Creator component ID */ + uint8_t data[]; /* 0x08 Start of section data */ +}; + +struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, + uint16_t section_id); + /* * This can be set by the rtas_flash module so that it can get called * as the absolutely last thing before the kernel terminates. @@ -174,6 +258,10 @@ extern int rtas_set_indicator(int indica extern int rtas_set_indicator_fast(int indicator, int index, int new_value); extern void rtas_progress(char *s, unsigned short hex); extern void rtas_initialize(void); +extern int rtas_suspend_cpu(struct rtas_suspend_me_data *data); +extern int rtas_suspend_last_cpu(struct rtas_suspend_me_data *data); +extern int rtas_online_cpus_mask(cpumask_var_t cpus); +extern int rtas_offline_cpus_mask(cpumask_var_t cpus); struct rtc_time; extern unsigned long rtas_get_boot_time(void); @@ -188,6 +276,12 @@ extern int early_init_dt_scan_rtas(unsig extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); +#ifdef CONFIG_PPC_PSERIES +extern void rtas_cancel_event_scan(void); +#else +static inline void rtas_cancel_event_scan(void) { } +#endif + /* Error types logged. */ #define ERR_FLAG_ALREADY_LOGGED 0x0 #define ERR_FLAG_BOOT 0x1 /* log was pulled from NVRAM on boot */ @@ -248,5 +342,17 @@ static inline u32 rtas_config_addr(int b extern void __cpuinit rtas_give_timebase(void); extern void __cpuinit rtas_take_timebase(void); +#ifdef CONFIG_PPC_RTAS +static inline int page_is_rtas_user_buf(unsigned long pfn) +{ + unsigned long paddr = (pfn << PAGE_SHIFT); + if (paddr >= rtas_rmo_buf && paddr < (rtas_rmo_buf + RTAS_RMOBUF_MAX)) + return 1; + return 0; +} +#else +static inline int page_is_rtas_user_buf(unsigned long pfn) { return 0;} +#endif + #endif /* __KERNEL__ */ #endif /* _POWERPC_RTAS_H */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/socket.h linux-2.6.32.ovz/arch/powerpc/include/asm/socket.h --- linux-2.6.32/arch/powerpc/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/socket.h 2015-03-10 13:24:07.000000000 -0700 @@ -29,7 +29,7 @@ #define SO_PRIORITY 12 #define SO_LINGER 13 #define SO_BSDCOMPAT 14 -/* To add :#define SO_REUSEPORT 15 */ +#define SO_REUSEPORT 15 #define SO_RCVLOWAT 16 #define SO_SNDLOWAT 17 #define SO_RCVTIMEO 18 @@ -67,4 +67,9 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 + +#define SO_BUSY_POLL 46 +#define SO_BPF_EXTENSIONS 48 + #endif /* _ASM_POWERPC_SOCKET_H */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/sparsemem.h linux-2.6.32.ovz/arch/powerpc/include/asm/sparsemem.h --- linux-2.6.32/arch/powerpc/include/asm/sparsemem.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/sparsemem.h 2015-03-10 13:22:51.000000000 -0700 @@ -16,7 +16,7 @@ #endif /* CONFIG_SPARSEMEM */ #ifdef CONFIG_MEMORY_HOTPLUG -extern void create_section_mapping(unsigned long start, unsigned long end); +extern int create_section_mapping(unsigned long start, unsigned long end); extern int remove_section_mapping(unsigned long start, unsigned long end); #ifdef CONFIG_NUMA extern int hot_add_scn_to_nid(unsigned long scn_addr); diff -uprN linux-2.6.32/arch/powerpc/include/asm/spinlock.h linux-2.6.32.ovz/arch/powerpc/include/asm/spinlock.h --- linux-2.6.32/arch/powerpc/include/asm/spinlock.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/spinlock.h 2015-03-10 13:24:15.000000000 -0700 @@ -27,8 +27,7 @@ #endif #include #include - -#define __raw_spin_is_locked(x) ((x)->slock != 0) +#include #ifdef CONFIG_PPC64 /* use 0x800000yy when locked, where yy == CPU number */ @@ -50,6 +49,12 @@ #define SYNC_IO #endif +static inline int __raw_spin_is_locked(raw_spinlock_t *lock) +{ + smp_mb(); + return lock->slock != 0; +} + /* * This returns the old value in the lock, so we succeeded * in getting the lock if the return value is 0. @@ -60,7 +65,7 @@ static inline unsigned long arch_spin_tr token = LOCK_TOKEN; __asm__ __volatile__( -"1: lwarx %0,0,%2\n\ +"1: " PPC_LWARX(%0,0,%2,1) "\n\ cmpwi 0,%0,0\n\ bne- 2f\n\ stwcx. %1,0,%2\n\ @@ -186,7 +191,7 @@ static inline long arch_read_trylock(raw long tmp; __asm__ __volatile__( -"1: lwarx %0,0,%1\n" +"1: " PPC_LWARX(%0,0,%1,1) "\n" __DO_SIGN_EXTEND " addic. %0,%0,1\n\ ble- 2f\n" @@ -211,7 +216,7 @@ static inline long arch_write_trylock(ra token = WRLOCK_TOKEN; __asm__ __volatile__( -"1: lwarx %0,0,%2\n\ +"1: " PPC_LWARX(%0,0,%2,1) "\n\ cmpwi 0,%0,0\n\ bne- 2f\n" PPC405_ERR77(0,%1) diff -uprN linux-2.6.32/arch/powerpc/include/asm/syscall.h linux-2.6.32.ovz/arch/powerpc/include/asm/syscall.h --- linux-2.6.32/arch/powerpc/include/asm/syscall.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/syscall.h 2015-03-10 13:22:30.000000000 -0700 @@ -30,7 +30,7 @@ static inline void syscall_rollback(stru static inline long syscall_get_error(struct task_struct *task, struct pt_regs *regs) { - return (regs->ccr & 0x1000) ? -regs->gpr[3] : 0; + return (regs->ccr & 0x10000000) ? -regs->gpr[3] : 0; } static inline long syscall_get_return_value(struct task_struct *task, @@ -44,10 +44,10 @@ static inline void syscall_set_return_va int error, long val) { if (error) { - regs->ccr |= 0x1000L; + regs->ccr |= 0x10000000L; regs->gpr[3] = -error; } else { - regs->ccr &= ~0x1000L; + regs->ccr &= ~0x10000000L; regs->gpr[3] = val; } } diff -uprN linux-2.6.32/arch/powerpc/include/asm/systbl.h linux-2.6.32.ovz/arch/powerpc/include/asm/systbl.h --- linux-2.6.32/arch/powerpc/include/asm/systbl.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/systbl.h 2015-03-10 13:24:10.000000000 -0700 @@ -326,3 +326,33 @@ SYSCALL_SPU(perf_event_open) COMPAT_SYS_SPU(preadv) COMPAT_SYS_SPU(pwritev) COMPAT_SYS(rt_tgsigqueueinfo) +SYSCALL_SPU(ni_syscall) /* fanotify_init */ +SYSCALL_SPU(ni_syscall) /* fanotify_mark */ +SYSCALL_SPU(ni_syscall) /* prlimit64 */ +SYSCALL_SPU(ni_syscall) /* socket */ +SYSCALL_SPU(ni_syscall) /* bind */ +SYSCALL_SPU(ni_syscall) /* connect */ +SYSCALL_SPU(ni_syscall) /* listen */ +SYSCALL_SPU(ni_syscall) /* accept */ +SYSCALL_SPU(ni_syscall) /* getsockname */ +SYSCALL_SPU(ni_syscall) /* getpeername */ +SYSCALL_SPU(ni_syscall) /* socketpair */ +SYSCALL_SPU(ni_syscall) /* send */ +SYSCALL_SPU(ni_syscall) /* sendto */ +SYSCALL_SPU(ni_syscall) /* recv */ +SYSCALL_SPU(ni_syscall) /* recvfrom */ +SYSCALL_SPU(ni_syscall) /* shutdown */ +SYSCALL_SPU(ni_syscall) /* setsockopt */ +SYSCALL_SPU(ni_syscall) /* getsockopt */ +SYSCALL_SPU(ni_syscall) /* sendmsg */ +SYSCALL_SPU(ni_syscall) /* recvmsg */ +SYSCALL_SPU(ni_syscall) /* recvmmsg */ +SYSCALL_SPU(ni_syscall) /* accept4 */ +SYSCALL_SPU(ni_syscall) /* name_to_handle_at */ +SYSCALL_SPU(ni_syscall) /* open_by_handle_at */ +COMPAT_SYS_SPU(clock_adjtime) +SYSCALL_SPU(syncfs) +COMPAT_SYS_SPU(sendmmsg) +SYSCALL_SPU(setns) /* setns */ +COMPAT_SYS(process_vm_readv) +COMPAT_SYS(process_vm_writev) diff -uprN linux-2.6.32/arch/powerpc/include/asm/system.h linux-2.6.32.ovz/arch/powerpc/include/asm/system.h --- linux-2.6.32/arch/powerpc/include/asm/system.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/system.h 2015-03-10 13:23:48.000000000 -0700 @@ -212,7 +212,7 @@ extern struct task_struct *_switch(struc extern unsigned int rtas_data; extern int mem_init_done; /* set on boot once kmalloc can be called */ extern int init_bootmem_done; /* set once bootmem is available */ -extern phys_addr_t memory_limit; +extern unsigned long long memory_limit; extern unsigned long klimit; extern void *alloc_maybe_bootmem(size_t size, gfp_t mask); diff -uprN linux-2.6.32/arch/powerpc/include/asm/thread_info.h linux-2.6.32.ovz/arch/powerpc/include/asm/thread_info.h --- linux-2.6.32/arch/powerpc/include/asm/thread_info.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/thread_info.h 2015-03-10 13:24:12.000000000 -0700 @@ -72,7 +72,7 @@ struct thread_info { #define __HAVE_ARCH_THREAD_INFO_ALLOCATOR -extern struct thread_info *alloc_thread_info(struct task_struct *tsk); +extern struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node); extern void free_thread_info(struct thread_info *ti); #endif /* THREAD_SHIFT < PAGE_SHIFT */ @@ -111,7 +111,6 @@ static inline struct thread_info *curren #define TIF_NOTIFY_RESUME 13 /* callback before returning to user */ #define TIF_FREEZE 14 /* Freezing for suspend */ #define TIF_RUNLATCH 15 /* Is the runlatch enabled? */ -#define TIF_ABI_PENDING 16 /* 32/64 bit switch needed */ /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1< static inline int cpu_to_node(int cpu) @@ -57,13 +67,31 @@ static inline int pcibus_to_node(struct .last_balance = jiffies, \ .balance_interval = 1, \ .nr_balance_failed = 0, \ + .max_newidle_lb_cost = 0, \ + .next_decay_max_lb_cost = jiffies, \ } +extern int __node_distance(int, int); +#define node_distance(a, b) __node_distance(a, b) + extern void __init dump_numa_cpu_topology(void); extern int sysfs_add_device_to_node(struct sys_device *dev, int nid); extern void sysfs_remove_device_from_node(struct sys_device *dev, int nid); +#ifdef CONFIG_PPC_SPLPAR +extern int start_topology_update(void); +extern int stop_topology_update(void); +#else +static inline int start_topology_update(void) +{ + return 0; +} +static inline int stop_topology_update(void) +{ + return 0; +} +#endif /* CONFIG_PPC_SPLPAR */ #else static inline int of_node_to_nid(struct device_node *device) @@ -82,7 +110,6 @@ static inline void sysfs_remove_device_f int nid) { } - #endif /* CONFIG_NUMA */ #include diff -uprN linux-2.6.32/arch/powerpc/include/asm/trace_clock.h linux-2.6.32.ovz/arch/powerpc/include/asm/trace_clock.h --- linux-2.6.32/arch/powerpc/include/asm/trace_clock.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/trace_clock.h 2015-03-10 13:23:58.000000000 -0700 @@ -0,0 +1 @@ +#include diff -uprN linux-2.6.32/arch/powerpc/include/asm/types.h linux-2.6.32.ovz/arch/powerpc/include/asm/types.h --- linux-2.6.32/arch/powerpc/include/asm/types.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/types.h 2015-03-10 13:23:18.000000000 -0700 @@ -44,11 +44,6 @@ typedef struct { typedef __vector128 vector128; -#if defined(__powerpc64__) || defined(CONFIG_PHYS_64BIT) -typedef u64 dma_addr_t; -#else -typedef u32 dma_addr_t; -#endif typedef u64 dma64_addr_t; typedef struct { diff -uprN linux-2.6.32/arch/powerpc/include/asm/unistd.h linux-2.6.32.ovz/arch/powerpc/include/asm/unistd.h --- linux-2.6.32/arch/powerpc/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/unistd.h 2015-03-10 13:24:10.000000000 -0700 @@ -345,10 +345,40 @@ #define __NR_preadv 320 #define __NR_pwritev 321 #define __NR_rt_tgsigqueueinfo 322 +/* #define __NR_fanotify_init 323 */ +/* #define __NR_fanotify_mark 324 */ +/* #define __NR_prlimit64 325 */ +/* #define __NR_socket 326 */ +/* #define __NR_bind 327 */ +/* #define __NR_connect 328 */ +/* #define __NR_listen 329 */ +/* #define __NR_accept 330 */ +/* #define __NR_getsockname 331 */ +/* #define __NR_getpeername 332 */ +/* #define __NR_socketpair 333 */ +/* #define __NR_send 334 */ +/* #define __NR_sendto 335 */ +/* #define __NR_recv 336 */ +/* #define __NR_recvfrom 337 */ +/* #define __NR_shutdown 338 */ +/* #define __NR_setsockopt 339 */ +/* #define __NR_getsockopt 340 */ +/* #define __NR_sendmsg 341 */ +/* #define __NR_recvmsg 342 */ +/* #define __NR_recvmmsg 343 */ +/* #define __NR_accept4 344 */ +/* #define __NR_name_to_handle_at 345 */ +/* #define __NR_open_by_handle_at 346 */ +#define __NR_clock_adjtime 347 +#define __NR_syncfs 348 +#define __NR_sendmmsg 349 +#define __NR_setns 350 +#define __NR_process_vm_readv 351 +#define __NR_process_vm_writev 352 #ifdef __KERNEL__ -#define __NR_syscalls 323 +#define __NR_syscalls 353 #define __NR__exit __NR_exit #define NR_syscalls __NR_syscalls diff -uprN linux-2.6.32/arch/powerpc/include/asm/vdso_datapage.h linux-2.6.32.ovz/arch/powerpc/include/asm/vdso_datapage.h --- linux-2.6.32/arch/powerpc/include/asm/vdso_datapage.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/vdso_datapage.h 2015-03-10 13:21:35.000000000 -0700 @@ -85,6 +85,7 @@ struct vdso_data { __s32 wtom_clock_sec; /* Wall to monotonic clock */ __s32 wtom_clock_nsec; struct timespec stamp_xtime; /* xtime as at tb_orig_stamp */ + __u32 stamp_sec_fraction; /* fractional seconds of stamp_xtime */ __u32 syscall_map_64[SYSCALL_MAP_SIZE]; /* map of syscalls */ __u32 syscall_map_32[SYSCALL_MAP_SIZE]; /* map of syscalls */ }; @@ -105,6 +106,7 @@ struct vdso_data { __s32 wtom_clock_sec; /* Wall to monotonic clock */ __s32 wtom_clock_nsec; struct timespec stamp_xtime; /* xtime as at tb_orig_stamp */ + __u32 stamp_sec_fraction; /* fractional seconds of stamp_xtime */ __u32 syscall_map_32[SYSCALL_MAP_SIZE]; /* map of syscalls */ __u32 dcache_block_size; /* L1 d-cache block size */ __u32 icache_block_size; /* L1 i-cache block size */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/vdso.h linux-2.6.32.ovz/arch/powerpc/include/asm/vdso.h --- linux-2.6.32/arch/powerpc/include/asm/vdso.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/vdso.h 2015-03-10 13:23:56.000000000 -0700 @@ -21,6 +21,7 @@ extern unsigned long vdso64_rt_sigtramp; extern unsigned long vdso32_sigtramp; extern unsigned long vdso32_rt_sigtramp; +int __cpuinit vdso_getcpu_init(void); #else /* __ASSEMBLY__ */ diff -uprN linux-2.6.32/arch/powerpc/include/asm/vio.h linux-2.6.32.ovz/arch/powerpc/include/asm/vio.h --- linux-2.6.32/arch/powerpc/include/asm/vio.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/include/asm/vio.h 2015-03-10 13:23:53.000000000 -0700 @@ -46,6 +46,48 @@ struct iommu_table; +/* + * Platform Facilities Option (PFO)-specific data + */ + +/* Starting unit address for PFO devices on the VIO BUS */ +#define VIO_BASE_PFO_UA 0x50000000 + +/** + * vio_pfo_op - PFO operation parameters + * + * @flags: h_call subfunctions and modifiers + * @in: Input data block logical real address + * @inlen: If non-negative, the length of the input data block. If negative, + * the length of the input data descriptor list in bytes. + * @out: Output data block logical real address + * @outlen: If non-negative, the length of the input data block. If negative, + * the length of the input data descriptor list in bytes. + * @csbcpb: Logical real address of the 4k naturally-aligned storage block + * containing the CSB & optional FC field specific CPB + * @timeout: # of milliseconds to retry h_call, 0 for no timeout. + * @hcall_err: pointer to return the h_call return value, else NULL + */ +struct vio_pfo_op { + u64 flags; + s64 in; + s64 inlen; + s64 out; + s64 outlen; + u64 csbcpb; + void *done; + unsigned long handle; + unsigned int timeout; + long hcall_err; +}; + +/* End PFO specific data */ + +enum vio_dev_family { + VDEVICE, /* The OF node is a child of /vdevice */ + PFO, /* The OF node is a child of /ibm,platform-facilities */ +}; + /** * vio_dev - This structure is used to describe virtual I/O devices. * @@ -58,6 +100,7 @@ struct vio_dev { const char *name; const char *type; uint32_t unit_address; + uint32_t resource_id; unsigned int irq; struct { size_t desired; @@ -65,6 +108,7 @@ struct vio_dev { size_t allocated; atomic_t allocs_failed; } cmo; + enum vio_dev_family family; struct device dev; }; @@ -87,6 +131,8 @@ extern void vio_cmo_set_dev_desired(stru extern void __devinit vio_unregister_device(struct vio_dev *dev); +extern int vio_h_cop_sync(struct vio_dev *vdev, struct vio_pfo_op *op); + struct device_node; extern struct vio_dev *vio_register_device_node( diff -uprN linux-2.6.32/arch/powerpc/Kconfig linux-2.6.32.ovz/arch/powerpc/Kconfig --- linux-2.6.32/arch/powerpc/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/Kconfig 2015-03-10 13:23:44.000000000 -0700 @@ -22,6 +22,9 @@ config WORD_SIZE config ARCH_PHYS_ADDR_T_64BIT def_bool PPC64 || PHYS_64BIT +config ARCH_DMA_ADDR_T_64BIT + def_bool ARCH_PHYS_ADDR_T_64BIT + config MMU bool default y @@ -29,9 +32,6 @@ config MMU config GENERIC_CMOS_UPDATE def_bool y -config GENERIC_TIME - def_bool y - config GENERIC_TIME_VSYSCALL def_bool y @@ -129,7 +129,9 @@ config PPC select HAVE_OPROFILE select HAVE_SYSCALL_WRAPPERS if PPC64 select GENERIC_ATOMIC64 if PPC32 + select HAVE_IRQ_WORK select HAVE_PERF_EVENTS + select ARCH_HAVE_NMI_SAFE_CMPXCHG config EARLY_PRINTK bool @@ -216,7 +218,7 @@ config ARCH_HIBERNATION_POSSIBLE config ARCH_SUSPEND_POSSIBLE def_bool y - depends on ADB_PMU || PPC_EFIKA || PPC_LITE5200 || PPC_83xx + depends on ADB_PMU || PPC_EFIKA || PPC_LITE5200 || PPC_83xx || PPC_PSERIES config PPC_DCR_NATIVE bool @@ -320,6 +322,10 @@ config HOTPLUG_CPU Say N if you are unsure. +config ARCH_CPU_PROBE_RELEASE + def_bool y + depends on HOTPLUG_CPU + config ARCH_ENABLE_MEMORY_HOTPLUG def_bool y @@ -346,6 +352,17 @@ config KEXEC support. As of this writing the exact hardware interface is strongly in flux, so no good recommendation can be made. +config KEXEC_AUTO_RESERVE + bool "automatically reserve memory for kexec kernel" + depends on KEXEC + default y + ---help--- + Automatically reserve memory for a kexec kernel, so that you don't + need to specify numbers for the "crashkernel=X@Y" boot option, + instead you can use "crashkernel=auto". To make this work, you need + to have more than 2G/8G memory. On PPC, 128M or 256M is reserved, on + PPC64 1/32 of your physical memory, but it will not exceed 4G. + config CRASH_DUMP bool "Build a kdump crash kernel" depends on PPC64 || 6xx @@ -355,13 +372,16 @@ config CRASH_DUMP The same kernel binary can be used as production kernel and dump capture kernel. -config PHYP_DUMP - bool "Hypervisor-assisted dump (EXPERIMENTAL)" - depends on PPC_PSERIES && EXPERIMENTAL - help - Hypervisor-assisted dump is meant to be a kdump replacement - offering robustness and speed not possible without system - hypervisor assistance. +config FA_DUMP + bool "Firmware-assisted dump" + depends on PPC64 && PPC_RTAS && CRASH_DUMP + help + A robust mechanism to get reliable kernel crash dump with + assistance from firmware. This approach does not use kexec, + instead firmware assists in booting the kdump kernel + while preserving memory contents. Firmware-assisted dump + is meant to be a kdump replacement offering robustness and + speed not possible without system firmware assistance. If unsure, say "N" @@ -627,6 +647,9 @@ config ZONE_DMA bool default y +config NEED_DMA_MAP_STATE + def_bool (PPC64 || NOT_COHERENT_CACHE) + config GENERIC_ISA_DMA bool depends on PPC64 || POWER4 || 6xx && !CPM2 diff -uprN linux-2.6.32/arch/powerpc/Kconfig.debug linux-2.6.32.ovz/arch/powerpc/Kconfig.debug --- linux-2.6.32/arch/powerpc/Kconfig.debug 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/Kconfig.debug 2015-03-10 13:23:09.000000000 -0700 @@ -286,4 +286,16 @@ config PPC_EARLY_DEBUG_CPM_ADDR platform probing is done, all platforms selected must share the same address. +config STRICT_DEVMEM + def_bool y + prompt "Filter access to /dev/mem" + help + This option restricts access to /dev/mem. If this option is + disabled, you allow userspace access to all memory, including + kernel and userspace memory. Accidental memory access is likely + to be disastrous. + Memory access is required for experts who want to debug the kernel. + + If you are unsure, say Y. + endmenu diff -uprN linux-2.6.32/arch/powerpc/kernel/align.c linux-2.6.32.ovz/arch/powerpc/kernel/align.c --- linux-2.6.32/arch/powerpc/kernel/align.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/align.c 2015-03-10 13:21:05.000000000 -0700 @@ -642,10 +642,14 @@ static int emulate_spe(struct pt_regs *r */ static int emulate_vsx(unsigned char __user *addr, unsigned int reg, unsigned int areg, struct pt_regs *regs, - unsigned int flags, unsigned int length) + unsigned int flags, unsigned int length, + unsigned int elsize) { char *ptr; + unsigned long *lptr; int ret = 0; + int sw = 0; + int i, j; flush_vsx_to_thread(current); @@ -654,19 +658,35 @@ static int emulate_vsx(unsigned char __u else ptr = (char *) ¤t->thread.vr[reg - 32]; - if (flags & ST) - ret = __copy_to_user(addr, ptr, length); - else { - if (flags & SPLT){ - ret = __copy_from_user(ptr, addr, length); - ptr += length; + lptr = (unsigned long *) ptr; + + if (flags & SW) + sw = elsize-1; + + for (j = 0; j < length; j += elsize) { + for (i = 0; i < elsize; ++i) { + if (flags & ST) + ret |= __put_user(ptr[i^sw], addr + i); + else + ret |= __get_user(ptr[i^sw], addr + i); } - ret |= __copy_from_user(ptr, addr, length); + ptr += elsize; + addr += elsize; } - if (flags & U) - regs->gpr[areg] = regs->dar; - if (ret) + + if (!ret) { + if (flags & U) + regs->gpr[areg] = regs->dar; + + /* Splat load copies the same data to top and bottom 8 bytes */ + if (flags & SPLT) + lptr[1] = lptr[0]; + /* For 8 byte loads, zero the top 8 bytes */ + else if (!(flags & ST) && (8 == length)) + lptr[1] = 0; + } else return -EFAULT; + return 1; } #endif @@ -767,16 +787,25 @@ int fix_alignment(struct pt_regs *regs) #ifdef CONFIG_VSX if ((instruction & 0xfc00003e) == 0x7c000018) { - /* Additional register addressing bit (64 VSX vs 32 FPR/GPR */ + unsigned int elsize; + + /* Additional register addressing bit (64 VSX vs 32 FPR/GPR) */ reg |= (instruction & 0x1) << 5; /* Simple inline decoder instead of a table */ + /* VSX has only 8 and 16 byte memory accesses */ + nb = 8; if (instruction & 0x200) nb = 16; - else if (instruction & 0x080) - nb = 8; - else - nb = 4; + + /* Vector stores in little-endian mode swap individual + elements, so process them separately */ + elsize = 4; + if (instruction & 0x80) + elsize = 8; + flags = 0; + if (regs->msr & MSR_LE) + flags |= SW; if (instruction & 0x100) flags |= ST; if (instruction & 0x040) @@ -787,7 +816,7 @@ int fix_alignment(struct pt_regs *regs) nb = 8; } PPC_WARN_EMULATED(vsx); - return emulate_vsx(addr, reg, areg, regs, flags, nb); + return emulate_vsx(addr, reg, areg, regs, flags, nb, elsize); } #endif /* A size of 0 indicates an instruction we don't support, with diff -uprN linux-2.6.32/arch/powerpc/kernel/asm-offsets.c linux-2.6.32.ovz/arch/powerpc/kernel/asm-offsets.c --- linux-2.6.32/arch/powerpc/kernel/asm-offsets.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/asm-offsets.c 2015-03-10 13:21:47.000000000 -0700 @@ -133,7 +133,6 @@ int main(void) DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); - DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_event_pending)); DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); #ifdef CONFIG_PPC_MM_SLICES DEFINE(PACALOWSLICESPSIZE, offsetof(struct paca_struct, @@ -181,11 +180,13 @@ int main(void) DEFINE(LPPACAANYINT, offsetof(struct lppaca, int_dword.any_int)); DEFINE(LPPACADECRINT, offsetof(struct lppaca, int_dword.fields.decr_int)); DEFINE(SLBSHADOW_SAVEAREA, offsetof(struct slb_shadow, save_area)); + DEFINE(LPPACA_DTLIDX, offsetof(struct lppaca, dtl_idx)); + DEFINE(PACA_DTL_RIDX, offsetof(struct paca_struct, dtl_ridx)); #endif /* CONFIG_PPC_STD_MMU_64 */ DEFINE(PACAEMERGSP, offsetof(struct paca_struct, emergency_sp)); DEFINE(PACAHWCPUID, offsetof(struct paca_struct, hw_cpu_id)); - DEFINE(PACA_STARTPURR, offsetof(struct paca_struct, startpurr)); - DEFINE(PACA_STARTSPURR, offsetof(struct paca_struct, startspurr)); + DEFINE(PACA_STARTTIME, offsetof(struct paca_struct, starttime)); + DEFINE(PACA_STARTTIME_USER, offsetof(struct paca_struct, starttime_user)); DEFINE(PACA_USER_TIME, offsetof(struct paca_struct, user_time)); DEFINE(PACA_SYSTEM_TIME, offsetof(struct paca_struct, system_time)); DEFINE(PACA_DATA_OFFSET, offsetof(struct paca_struct, data_offset)); @@ -331,6 +332,7 @@ int main(void) DEFINE(WTOM_CLOCK_SEC, offsetof(struct vdso_data, wtom_clock_sec)); DEFINE(WTOM_CLOCK_NSEC, offsetof(struct vdso_data, wtom_clock_nsec)); DEFINE(STAMP_XTIME, offsetof(struct vdso_data, stamp_xtime)); + DEFINE(STAMP_SEC_FRAC, offsetof(struct vdso_data, stamp_sec_fraction)); DEFINE(CFG_ICACHE_BLOCKSZ, offsetof(struct vdso_data, icache_block_size)); DEFINE(CFG_DCACHE_BLOCKSZ, offsetof(struct vdso_data, dcache_block_size)); DEFINE(CFG_ICACHE_LOGBLOCKSZ, offsetof(struct vdso_data, icache_log_block_size)); diff -uprN linux-2.6.32/arch/powerpc/kernel/cacheinfo.c linux-2.6.32.ovz/arch/powerpc/kernel/cacheinfo.c --- linux-2.6.32/arch/powerpc/kernel/cacheinfo.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/cacheinfo.c 2015-03-10 13:21:34.000000000 -0700 @@ -642,7 +642,7 @@ static struct kobj_attribute *cache_inde &cache_assoc_attr, }; -static struct sysfs_ops cache_index_ops = { +static const struct sysfs_ops cache_index_ops = { .show = cache_index_show, }; diff -uprN linux-2.6.32/arch/powerpc/kernel/cputable.c linux-2.6.32.ovz/arch/powerpc/kernel/cputable.c --- linux-2.6.32/arch/powerpc/kernel/cputable.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/cputable.c 2015-03-10 13:23:57.000000000 -0700 @@ -440,6 +440,14 @@ static struct cpu_spec __initdata cpu_sp .oprofile_cpu_type = "ppc64/ibm-compat-v1", .platform = "power7", }, + { /* 2.07-compliant processor, i.e. Power8 "architected" mode */ + .pvr_mask = 0xffffffff, + .pvr_value = 0x0f000004, + .cpu_name = "POWER8 (architected)", + .oprofile_type = PPC_OPROFILE_INVALID, + .oprofile_cpu_type = "ppc64/ibm-compat-v1", + .platform = "power8", + }, { /* Power7 */ .pvr_mask = 0xffff0000, .pvr_value = 0x003f0000, @@ -462,6 +470,31 @@ static struct cpu_spec __initdata cpu_sp POWER6_MMCRA_OTHER, .platform = "power7", }, + { /* Power7+ */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004A0000, + .cpu_name = "POWER7+ (raw)", + .cpu_features = CPU_FTRS_POWER7, + .cpu_user_features = COMMON_USER_POWER7, + .mmu_features = MMU_FTR_HPTE_TABLE | + MMU_FTR_TLBIE_206, + .icache_bsize = 128, + .dcache_bsize = 128, + .num_pmcs = 6, + .pmc_type = PPC_PMC_IBM, + .oprofile_cpu_type = "ppc64/power7", + .oprofile_type = PPC_OPROFILE_POWER4, + .platform = "power7+", + }, + { /* Power8 */ + .pvr_mask = 0xffff0000, + .pvr_value = 0x004b0000, + .cpu_name = "POWER8 (raw)", + .num_pmcs = 6, + .oprofile_cpu_type = "ppc64/power8", + .oprofile_type = PPC_OPROFILE_INVALID, + .platform = "power8", + }, { /* Cell Broadband Engine */ .pvr_mask = 0xffff0000, .pvr_value = 0x00700000, diff -uprN linux-2.6.32/arch/powerpc/kernel/crash.c linux-2.6.32.ovz/arch/powerpc/kernel/crash.c --- linux-2.6.32/arch/powerpc/kernel/crash.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/crash.c 2015-03-10 13:22:36.000000000 -0700 @@ -162,6 +162,33 @@ static void crash_kexec_prepare_cpus(int /* Leave the IPI callback set */ } +/* wait for all the CPUs to hit real mode but timeout if they don't come in */ +static void crash_kexec_wait_realmode(int cpu) +{ + extern u8 kexec_state[NR_CPUS]; + unsigned int msecs; + int i; + + msecs = 10000; + for (i=0; i < NR_CPUS && msecs > 0; i++) { + if (i == cpu) + continue; + + while (kexec_state[i] < KEXEC_STATE_REAL_MODE) { + barrier(); + if (!cpu_possible(i)) { + break; + } + if (!cpu_online(i)) { + break; + } + msecs--; + mdelay(1); + } + } + mb(); +} + /* * This function will be called by secondary cpus or by kexec cpu * if soft-reset is activated to stop some CPUs. @@ -347,10 +374,12 @@ int crash_shutdown_unregister(crash_shut EXPORT_SYMBOL(crash_shutdown_unregister); static unsigned long crash_shutdown_buf[JMP_BUF_LEN]; +static int crash_shutdown_cpu = -1; static int handle_fault(struct pt_regs *regs) { - longjmp(crash_shutdown_buf, 1); + if (crash_shutdown_cpu == smp_processor_id()) + longjmp(crash_shutdown_buf, 1); return 0; } @@ -388,6 +417,7 @@ void default_machine_crash_shutdown(stru */ old_handler = __debugger_fault_handler; __debugger_fault_handler = handle_fault; + crash_shutdown_cpu = smp_processor_id(); for (i = 0; crash_shutdown_handles[i]; i++) { if (setjmp(crash_shutdown_buf) == 0) { /* @@ -401,6 +431,7 @@ void default_machine_crash_shutdown(stru asm volatile("sync; isync"); } } + crash_shutdown_cpu = -1; __debugger_fault_handler = old_handler; /* @@ -412,6 +443,7 @@ void default_machine_crash_shutdown(stru crash_kexec_prepare_cpus(crashing_cpu); cpu_set(crashing_cpu, cpus_in_crash); crash_kexec_stop_spus(); + crash_kexec_wait_realmode(crashing_cpu); if (ppc_md.kexec_cpu_down) ppc_md.kexec_cpu_down(1, 0); } diff -uprN linux-2.6.32/arch/powerpc/kernel/crash_dump.c linux-2.6.32.ovz/arch/powerpc/kernel/crash_dump.c --- linux-2.6.32/arch/powerpc/kernel/crash_dump.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/crash_dump.c 2015-03-10 13:22:16.000000000 -0700 @@ -19,6 +19,7 @@ #include #include #include +#include #ifdef DEBUG #include @@ -141,3 +142,35 @@ ssize_t copy_oldmem_page(unsigned long p return csize; } + +#ifdef CONFIG_PPC_RTAS +/* + * The crashkernel region will almost always overlap the RTAS region, so + * we have to be careful when shrinking the crashkernel region. + */ +void crash_free_reserved_phys_range(unsigned long begin, unsigned long end) +{ + unsigned long addr; + const u32 *basep, *sizep; + unsigned int rtas_start = 0, rtas_end = 0; + + basep = of_get_property(rtas.dev, "linux,rtas-base", NULL); + sizep = of_get_property(rtas.dev, "rtas-size", NULL); + + if (basep && sizep) { + rtas_start = *basep; + rtas_end = *basep + *sizep; + } + + for (addr = begin; addr < end; addr += PAGE_SIZE) { + /* Does this page overlap with the RTAS region? */ + if (addr <= rtas_end && ((addr + PAGE_SIZE) > rtas_start)) + continue; + + ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); + init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); + free_page((unsigned long)__va(addr)); + totalram_pages++; + } +} +#endif diff -uprN linux-2.6.32/arch/powerpc/kernel/dma.c linux-2.6.32.ovz/arch/powerpc/kernel/dma.c --- linux-2.6.32/arch/powerpc/kernel/dma.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/dma.c 2015-03-10 13:22:05.000000000 -0700 @@ -11,6 +11,7 @@ #include #include #include +#include /* * Generic direct DMA implementation @@ -153,6 +154,23 @@ EXPORT_SYMBOL(dma_direct_ops); #define PREALLOC_DMA_DEBUG_ENTRIES (1 << 16) +int dma_set_mask(struct device *dev, u64 dma_mask) +{ + struct dma_map_ops *dma_ops = get_dma_ops(dev); + + if (ppc_md.dma_set_mask) + return ppc_md.dma_set_mask(dev, dma_mask); + if (unlikely(dma_ops == NULL)) + return -EIO; + if (dma_ops->set_dma_mask != NULL) + return dma_ops->set_dma_mask(dev, dma_mask); + if (!dev->dma_mask || !dma_supported(dev, dma_mask)) + return -EIO; + *dev->dma_mask = dma_mask; + return 0; +} +EXPORT_SYMBOL(dma_set_mask); + static int __init dma_init(void) { dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); diff -uprN linux-2.6.32/arch/powerpc/kernel/e500-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/e500-pmu.c --- linux-2.6.32/arch/powerpc/kernel/e500-pmu.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/e500-pmu.c 2015-03-10 13:22:06.000000000 -0700 @@ -0,0 +1,129 @@ +/* + * Performance counter support for e500 family processors. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * Copyright 2010 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include + +/* + * Map of generic hardware event types to hardware events + * Zero if unsupported + */ +static int e500_generic_events[] = { + [PERF_COUNT_HW_CPU_CYCLES] = 1, + [PERF_COUNT_HW_INSTRUCTIONS] = 2, + [PERF_COUNT_HW_CACHE_MISSES] = 41, /* Data L1 cache reloads */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 12, + [PERF_COUNT_HW_BRANCH_MISSES] = 15, +}; + +#define C(x) PERF_COUNT_HW_CACHE_##x + +/* + * Table of generalized cache-related events. + * 0 means not supported, -1 means nonsensical, other values + * are event codes. + */ +static int e500_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { + /* + * D-cache misses are not split into read/write/prefetch; + * use raw event 41. + */ + [C(L1D)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 27, 0 }, + [C(OP_WRITE)] = { 28, 0 }, + [C(OP_PREFETCH)] = { 29, 0 }, + }, + [C(L1I)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 2, 60 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { 0, 0 }, + }, + /* + * Assuming LL means L2, it's not a good match for this model. + * It allocates only on L1 castout or explicit prefetch, and + * does not have separate read/write events (but it does have + * separate instruction/data events). + */ + [C(LL)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 0, 0 }, + [C(OP_WRITE)] = { 0, 0 }, + [C(OP_PREFETCH)] = { 0, 0 }, + }, + /* + * There are data/instruction MMU misses, but that's a miss on + * the chip's internal level-one TLB which is probably not + * what the user wants. Instead, unified level-two TLB misses + * are reported here. + */ + [C(DTLB)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 26, 66 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, + [C(BPU)] = { /* RESULT_ACCESS RESULT_MISS */ + [C(OP_READ)] = { 12, 15 }, + [C(OP_WRITE)] = { -1, -1 }, + [C(OP_PREFETCH)] = { -1, -1 }, + }, +}; + +static int num_events = 128; + +/* Upper half of event id is PMLCb, for threshold events */ +static u64 e500_xlate_event(u64 event_id) +{ + u32 event_low = (u32)event_id; + u64 ret; + + if (event_low >= num_events) + return 0; + + ret = FSL_EMB_EVENT_VALID; + + if (event_low >= 76 && event_low <= 81) { + ret |= FSL_EMB_EVENT_RESTRICTED; + ret |= event_id & + (FSL_EMB_EVENT_THRESHMUL | FSL_EMB_EVENT_THRESH); + } else if (event_id & + (FSL_EMB_EVENT_THRESHMUL | FSL_EMB_EVENT_THRESH)) { + /* Threshold requested on non-threshold event */ + return 0; + } + + return ret; +} + +static struct fsl_emb_pmu e500_pmu = { + .name = "e500 family", + .n_counter = 4, + .n_restricted = 2, + .xlate_event = e500_xlate_event, + .n_generic = ARRAY_SIZE(e500_generic_events), + .generic_events = e500_generic_events, + .cache_events = &e500_cache_events, +}; + +static int init_e500_pmu(void) +{ + if (!cur_cpu_spec->oprofile_cpu_type) + return -ENODEV; + + if (!strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc/e500mc")) + num_events = 256; + else if (strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc/e500")) + return -ENODEV; + + return register_fsl_emb_pmu(&e500_pmu); +} + +early_initcall(init_e500_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/entry_64.S linux-2.6.32.ovz/arch/powerpc/kernel/entry_64.S --- linux-2.6.32/arch/powerpc/kernel/entry_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/entry_64.S 2015-03-10 13:21:47.000000000 -0700 @@ -97,6 +97,24 @@ system_call_common: addi r9,r1,STACK_FRAME_OVERHEAD ld r11,exception_marker@toc(r2) std r11,-16(r9) /* "regshere" marker */ +#if defined(CONFIG_VIRT_CPU_ACCOUNTING) && defined(CONFIG_PPC_SPLPAR) +BEGIN_FW_FTR_SECTION + beq 33f + /* if from user, see if there are any DTL entries to process */ + ld r10,PACALPPACAPTR(r13) /* get ptr to VPA */ + ld r11,PACA_DTL_RIDX(r13) /* get log read index */ + ld r10,LPPACA_DTLIDX(r10) /* get log write index */ + cmpd cr1,r11,r10 + beq+ cr1,33f + bl .accumulate_stolen_time + REST_GPR(0,r1) + REST_4GPRS(3,r1) + REST_2GPRS(7,r1) + addi r9,r1,STACK_FRAME_OVERHEAD +33: +END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) +#endif /* CONFIG_VIRT_CPU_ACCOUNTING && CONFIG_PPC_SPLPAR */ + #ifdef CONFIG_TRACE_IRQFLAGS bl .trace_hardirqs_on REST_GPR(0,r1) @@ -556,15 +574,6 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ 2: TRACE_AND_RESTORE_IRQ(r5); -#ifdef CONFIG_PERF_EVENTS - /* check paca->perf_event_pending if we're enabling ints */ - lbz r3,PACAPERFPEND(r13) - and. r3,r3,r5 - beq 27f - bl .perf_event_do_pending -27: -#endif /* CONFIG_PERF_EVENTS */ - /* extract EE bit and use it to restore paca->hard_enabled */ ld r3,_MSR(r1) rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */ diff -uprN linux-2.6.32/arch/powerpc/kernel/fadump.c linux-2.6.32.ovz/arch/powerpc/kernel/fadump.c --- linux-2.6.32/arch/powerpc/kernel/fadump.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/fadump.c 2015-03-10 13:23:48.000000000 -0700 @@ -0,0 +1,1321 @@ +/* + * Firmware Assisted dump: A robust mechanism to get reliable kernel crash + * dump with assistance from firmware. This approach does not use kexec, + * instead firmware assists in booting the kdump kernel while preserving + * memory contents. The most of the code implementation has been adapted + * from phyp assisted dump implementation written by Linas Vepstas and + * Manish Ahuja + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright 2011 IBM Corporation + * Author: Mahesh Salgaonkar + */ + +#undef DEBUG +#define pr_fmt(fmt) "fadump: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * The RTAS property "ibm,configure-kernel-dump-sizes" returns dump + * sizes for the firmware provided dump sections (cpu state data + * and hpte region). + * For each dump section type supported, a 32bit cell which defines + * the ID of a supported section followed by two 32 bit cells which + * gives teh size of the section in bytes. + */ +struct dump_section { + u32 dump_section; + unsigned long section_size; +} __packed; + +static struct fw_dump fw_dump; +static struct fadump_mem_struct fdm; +static const struct fadump_mem_struct *fdm_active; + +static DEFINE_MUTEX(fadump_mutex); +struct fad_crash_memory_ranges crash_memory_ranges[INIT_CRASHMEM_RANGES]; +int crash_mem_ranges; + +/* Scan the Firmware Assisted dump configuration details. */ +int __init early_init_dt_scan_fw_dump(unsigned long node, + const char *uname, int depth, void *data) +{ + const struct dump_section *sections; + int i, num_sections; + unsigned long size; + const int *token; + + if (depth != 1 || strcmp(uname, "rtas") != 0) + return 0; + + /* + * Check if Firmware Assisted dump is supported. if yes, check + * if dump has been initiated on last reboot. + */ + token = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL); + if (!token) + return 0; + + fw_dump.fadump_supported = 1; + fw_dump.ibm_configure_kernel_dump = *token; + + /* + * The 'ibm,kernel-dump' rtas node is present only if there is + * dump data waiting for us. + */ + fdm_active = of_get_flat_dt_prop(node, "ibm,kernel-dump", NULL); + if (fdm_active) + fw_dump.dump_active = 1; + + /* Get the sizes required to store dump data for the firmware provided + * dump sections. + */ + sections = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes", + &size); + + if (!sections) + return 0; + + num_sections = size / sizeof(struct dump_section); + + for (i = 0; i < num_sections; i++) { + switch (sections[i].dump_section) { + case FADUMP_CPU_STATE_DATA: + fw_dump.cpu_state_data_size = sections[i].section_size; + break; + case FADUMP_HPTE_REGION: + fw_dump.hpte_region_size = sections[i].section_size; + break; + } + } + return 1; +} + +int is_fadump_active(void) +{ + return fw_dump.dump_active; +} + +/* Print firmware assisted dump configurations for debugging purpose. */ +static void fadump_show_config(void) +{ + pr_debug("Support for firmware-assisted dump (fadump): %s\n", + (fw_dump.fadump_supported ? "present" : "no support")); + + if (!fw_dump.fadump_supported) + return; + + pr_debug("Fadump enabled : %s\n", + (fw_dump.fadump_enabled ? "yes" : "no")); + pr_debug("Dump Active : %s\n", + (fw_dump.dump_active ? "yes" : "no")); + pr_debug("Dump section sizes:\n"); + pr_debug(" CPU state data size: %lx\n", fw_dump.cpu_state_data_size); + pr_debug(" HPTE region size : %lx\n", fw_dump.hpte_region_size); + pr_debug("Boot memory size : %lx\n", fw_dump.boot_memory_size); +} + +static unsigned long init_fadump_mem_struct(struct fadump_mem_struct *fdm, + unsigned long addr) +{ + if (!fdm) + return 0; + + memset(fdm, 0, sizeof(struct fadump_mem_struct)); + addr = addr & PAGE_MASK; + + fdm->header.dump_format_version = 0x00000001; + fdm->header.dump_num_sections = 3; + fdm->header.dump_status_flag = 0; + fdm->header.offset_first_dump_section = + (u32)offsetof(struct fadump_mem_struct, cpu_state_data); + + /* + * Fields for disk dump option. + * We are not using disk dump option, hence set these fields to 0. + */ + fdm->header.dd_block_size = 0; + fdm->header.dd_block_offset = 0; + fdm->header.dd_num_blocks = 0; + fdm->header.dd_offset_disk_path = 0; + + /* set 0 to disable an automatic dump-reboot. */ + fdm->header.max_time_auto = 0; + + /* Kernel dump sections */ + /* cpu state data section. */ + fdm->cpu_state_data.request_flag = FADUMP_REQUEST_FLAG; + fdm->cpu_state_data.source_data_type = FADUMP_CPU_STATE_DATA; + fdm->cpu_state_data.source_address = 0; + fdm->cpu_state_data.source_len = fw_dump.cpu_state_data_size; + fdm->cpu_state_data.destination_address = addr; + addr += fw_dump.cpu_state_data_size; + + /* hpte region section */ + fdm->hpte_region.request_flag = FADUMP_REQUEST_FLAG; + fdm->hpte_region.source_data_type = FADUMP_HPTE_REGION; + fdm->hpte_region.source_address = 0; + fdm->hpte_region.source_len = fw_dump.hpte_region_size; + fdm->hpte_region.destination_address = addr; + addr += fw_dump.hpte_region_size; + + /* RMA region section */ + fdm->rmr_region.request_flag = FADUMP_REQUEST_FLAG; + fdm->rmr_region.source_data_type = FADUMP_REAL_MODE_REGION; + fdm->rmr_region.source_address = RMA_START; + fdm->rmr_region.source_len = fw_dump.boot_memory_size; + fdm->rmr_region.destination_address = addr; + addr += fw_dump.boot_memory_size; + + return addr; +} + +/** + * fadump_calculate_reserve_size(): reserve variable boot area 5% of System RAM + * + * Function to find the largest memory size we need to reserve during early + * boot process. This will be the size of the memory that is required for a + * kernel to boot successfully. + * + * This function has been taken from phyp-assisted dump feature implementation. + * + * returns larger of 256MB or 5% rounded down to multiples of 256MB. + * + * TODO: Come up with better approach to find out more accurate memory size + * that is required for a kernel to boot successfully. + * + */ +static inline unsigned long fadump_calculate_reserve_size(void) +{ + unsigned long size; + + /* + * Check if the size is specified through fadump_reserve_mem= cmdline + * option. If yes, then use that. + */ + if (fw_dump.reserve_bootvar) + return fw_dump.reserve_bootvar; + + /* divide by 20 to get 5% of value */ + size = lmb_end_of_DRAM() / 20; + + /* round it down in multiples of 256 */ + size = size & ~0x0FFFFFFFUL; + + /* Truncate to memory_limit. We don't want to over reserve the memory.*/ + if (memory_limit && size > memory_limit) + size = memory_limit; + + return (size > MIN_BOOT_MEM ? size : MIN_BOOT_MEM); +} + +/* + * Calculate the total memory size required to be reserved for + * firmware-assisted dump registration. + */ +static unsigned long get_fadump_area_size(void) +{ + unsigned long size = 0; + + size += fw_dump.cpu_state_data_size; + size += fw_dump.hpte_region_size; + size += fw_dump.boot_memory_size; + size += sizeof(struct fadump_crash_info_header); + size += sizeof(struct elfhdr); /* ELF core header.*/ + size += sizeof(struct elf_phdr); /* place holder for cpu notes */ + /* Program headers for crash memory regions. */ + size += sizeof(struct elf_phdr) * (lmb_num_regions(memory) + 2); + + size = PAGE_ALIGN(size); + return size; +} + +int __init fadump_reserve_mem(void) +{ + unsigned long base, size, memory_boundary; + + if (!fw_dump.fadump_enabled) + return 0; + + if (!fw_dump.fadump_supported) { + printk(KERN_INFO "Firmware-assisted dump is not supported on" + " this hardware\n"); + fw_dump.fadump_enabled = 0; + return 0; + } + /* + * Initialize boot memory size + * If dump is active then we have already calculated the size during + * first kernel. + */ + if (fdm_active) + fw_dump.boot_memory_size = fdm_active->rmr_region.source_len; + else + fw_dump.boot_memory_size = fadump_calculate_reserve_size(); + + /* + * Calculate the memory boundary. + * If memory_limit is less than actual memory boundary then reserve + * the memory for fadump beyond the memory_limit and adjust the + * memory_limit accordingly, so that the running kernel can run with + * specified memory_limit. + */ + if (memory_limit && memory_limit < lmb_end_of_DRAM()) { + size = get_fadump_area_size(); + if ((memory_limit + size) < lmb_end_of_DRAM()) + memory_limit += size; + else + memory_limit = lmb_end_of_DRAM(); + printk(KERN_INFO "Adjusted memory_limit for firmware-assisted" + " dump, now %#016llx\n", memory_limit); + } + if (memory_limit) + memory_boundary = memory_limit; + else + memory_boundary = lmb_end_of_DRAM(); + + if (fw_dump.dump_active) { + printk(KERN_INFO "Firmware-assisted dump is active.\n"); + /* + * If last boot has crashed then reserve all the memory + * above boot_memory_size so that we don't touch it until + * dump is written to disk by userspace tool. This memory + * will be released for general use once the dump is saved. + */ + base = fw_dump.boot_memory_size; + size = memory_boundary - base; + lmb_reserve(base, size); + printk(KERN_INFO "Reserved %ldMB of memory at %ldMB " + "for saving crash dump\n", + (unsigned long)(size >> 20), + (unsigned long)(base >> 20)); + + fw_dump.fadumphdr_addr = + fdm_active->rmr_region.destination_address + + fdm_active->rmr_region.source_len; + pr_debug("fadumphdr_addr = %p\n", + (void *) fw_dump.fadumphdr_addr); + } else { + /* Reserve the memory at the top of memory. */ + size = get_fadump_area_size(); + base = memory_boundary - size; + lmb_reserve(base, size); + printk(KERN_INFO "Reserved %ldMB of memory at %ldMB " + "for firmware-assisted dump\n", + (unsigned long)(size >> 20), + (unsigned long)(base >> 20)); + } + fw_dump.reserve_dump_area_start = base; + fw_dump.reserve_dump_area_size = size; + return 1; +} + +/* Look for fadump= cmdline option. */ +static int __init early_fadump_param(char *p) +{ + if (!p) + return 1; + + if (strncmp(p, "on", 2) == 0) + fw_dump.fadump_enabled = 1; + else if (strncmp(p, "off", 3) == 0) + fw_dump.fadump_enabled = 0; + + return 0; +} +early_param("fadump", early_fadump_param); + +/* Look for fadump_reserve_mem= cmdline option */ +static int __init early_fadump_reserve_mem(char *p) +{ + if (p) + fw_dump.reserve_bootvar = memparse(p, &p); + return 0; +} +early_param("fadump_reserve_mem", early_fadump_reserve_mem); + +static void register_fw_dump(struct fadump_mem_struct *fdm) +{ + int rc; + unsigned int wait_time; + + pr_debug("Registering for firmware-assisted kernel dump...\n"); + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, + FADUMP_REGISTER, fdm, + sizeof(struct fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + + } while (wait_time); + + switch (rc) { + case -1: + printk(KERN_ERR "Failed to register firmware-assisted kernel" + " dump. Hardware Error(%d).\n", rc); + break; + case -3: + printk(KERN_ERR "Failed to register firmware-assisted kernel" + " dump. Parameter Error(%d).\n", rc); + break; + case -9: + printk(KERN_ERR "firmware-assisted kernel dump is already " + " registered."); + fw_dump.dump_registered = 1; + break; + case 0: + printk(KERN_INFO "firmware-assisted kernel dump registration" + " is successful\n"); + fw_dump.dump_registered = 1; + break; + } +} + +void crash_fadump(struct pt_regs *regs, const char *str) +{ + struct fadump_crash_info_header *fdh = NULL; + + if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr) + return; + + fdh = __va(fw_dump.fadumphdr_addr); + crashing_cpu = smp_processor_id(); + fdh->crashing_cpu = crashing_cpu; + crash_save_vmcoreinfo(); + + if (regs) + fdh->regs = *regs; + else + ppc_save_regs(&fdh->regs); + + fdh->cpu_online_mask = *cpu_online_mask; + + /* Call ibm,os-term rtas call to trigger firmware assisted dump */ + rtas_os_term((char *)str); +} + +#define GPR_MASK 0xffffff0000000000 +static inline int fadump_gpr_index(u64 id) +{ + int i = -1; + char str[3]; + + if ((id & GPR_MASK) == REG_ID("GPR")) { + /* get the digits at the end */ + id &= ~GPR_MASK; + id >>= 24; + str[2] = '\0'; + str[1] = id & 0xff; + str[0] = (id >> 8) & 0xff; + sscanf(str, "%d", &i); + if (i > 31) + i = -1; + } + return i; +} + +static inline void fadump_set_regval(struct pt_regs *regs, u64 reg_id, + u64 reg_val) +{ + int i; + + i = fadump_gpr_index(reg_id); + if (i >= 0) + regs->gpr[i] = (unsigned long)reg_val; + else if (reg_id == REG_ID("NIA")) + regs->nip = (unsigned long)reg_val; + else if (reg_id == REG_ID("MSR")) + regs->msr = (unsigned long)reg_val; + else if (reg_id == REG_ID("CTR")) + regs->ctr = (unsigned long)reg_val; + else if (reg_id == REG_ID("LR")) + regs->link = (unsigned long)reg_val; + else if (reg_id == REG_ID("XER")) + regs->xer = (unsigned long)reg_val; + else if (reg_id == REG_ID("CR")) + regs->ccr = (unsigned long)reg_val; + else if (reg_id == REG_ID("DAR")) + regs->dar = (unsigned long)reg_val; + else if (reg_id == REG_ID("DSISR")) + regs->dsisr = (unsigned long)reg_val; +} + +static struct fadump_reg_entry* +fadump_read_registers(struct fadump_reg_entry *reg_entry, struct pt_regs *regs) +{ + memset(regs, 0, sizeof(struct pt_regs)); + + while (reg_entry->reg_id != REG_ID("CPUEND")) { + fadump_set_regval(regs, reg_entry->reg_id, + reg_entry->reg_value); + reg_entry++; + } + reg_entry++; + return reg_entry; +} + +static u32 *fadump_append_elf_note(u32 *buf, char *name, unsigned type, + void *data, size_t data_len) +{ + struct elf_note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = data_len; + note.n_type = type; + memcpy(buf, ¬e, sizeof(note)); + buf += (sizeof(note) + 3)/4; + memcpy(buf, name, note.n_namesz); + buf += (note.n_namesz + 3)/4; + memcpy(buf, data, note.n_descsz); + buf += (note.n_descsz + 3)/4; + + return buf; +} + +static void fadump_final_note(u32 *buf) +{ + struct elf_note note; + + note.n_namesz = 0; + note.n_descsz = 0; + note.n_type = 0; + memcpy(buf, ¬e, sizeof(note)); +} + +static u32 *fadump_regs_to_elf_notes(u32 *buf, struct pt_regs *regs) +{ + struct elf_prstatus prstatus; + + memset(&prstatus, 0, sizeof(prstatus)); + /* + * FIXME: How do i get PID? Do I really need it? + * prstatus.pr_pid = ???? + */ + elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); + buf = fadump_append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + return buf; +} + +static void fadump_update_elfcore_header(char *bufp) +{ + struct elfhdr *elf; + struct elf_phdr *phdr; + + elf = (struct elfhdr *)bufp; + bufp += sizeof(struct elfhdr); + + /* First note is a place holder for cpu notes info. */ + phdr = (struct elf_phdr *)bufp; + + if (phdr->p_type == PT_NOTE) { + phdr->p_paddr = fw_dump.cpu_notes_buf; + phdr->p_offset = phdr->p_paddr; + phdr->p_filesz = fw_dump.cpu_notes_buf_size; + phdr->p_memsz = fw_dump.cpu_notes_buf_size; + } + return; +} + +static void *fadump_cpu_notes_buf_alloc(unsigned long size) +{ + void *vaddr; + struct page *page; + unsigned long order, count, i; + + order = get_order(size); + vaddr = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, order); + if (!vaddr) + return NULL; + + count = 1 << order; + page = virt_to_page(vaddr); + for (i = 0; i < count; i++) + SetPageReserved(page + i); + return vaddr; +} + +static void fadump_cpu_notes_buf_free(unsigned long vaddr, unsigned long size) +{ + struct page *page; + unsigned long order, count, i; + + order = get_order(size); + count = 1 << order; + page = virt_to_page(vaddr); + for (i = 0; i < count; i++) + ClearPageReserved(page + i); + __free_pages(page, order); +} + +/* + * Read CPU state dump data and convert it into ELF notes. + * The CPU dump starts with magic number "REGSAVE". NumCpusOffset should be + * used to access the data to allow for additional fields to be added without + * affecting compatibility. Each list of registers for a CPU starts with + * "CPUSTRT" and ends with "CPUEND". Each register entry is of 16 bytes, + * 8 Byte ASCII identifier and 8 Byte register value. The register entry + * with identifier "CPUSTRT" and "CPUEND" contains 4 byte cpu id as part + * of register value. For more details refer to PAPR document. + * + * Only for the crashing cpu we ignore the CPU dump data and get exact + * state from fadump crash info structure populated by first kernel at the + * time of crash. + */ +static int __init fadump_build_cpu_notes(const struct fadump_mem_struct *fdm) +{ + struct fadump_reg_save_area_header *reg_header; + struct fadump_reg_entry *reg_entry; + struct fadump_crash_info_header *fdh = NULL; + void *vaddr; + unsigned long addr; + u32 num_cpus, *note_buf; + struct pt_regs regs; + int i, rc = 0, cpu = 0; + + if (!fdm->cpu_state_data.bytes_dumped) + return -EINVAL; + + addr = fdm->cpu_state_data.destination_address; + vaddr = __va(addr); + + reg_header = vaddr; + if (reg_header->magic_number != REGSAVE_AREA_MAGIC) { + printk(KERN_ERR "Unable to read register save area.\n"); + return -ENOENT; + } + pr_debug("--------CPU State Data------------\n"); + pr_debug("Magic Number: %llx\n", reg_header->magic_number); + pr_debug("NumCpuOffset: %x\n", reg_header->num_cpu_offset); + + vaddr += reg_header->num_cpu_offset; + num_cpus = *((u32 *)(vaddr)); + pr_debug("NumCpus : %u\n", num_cpus); + vaddr += sizeof(u32); + reg_entry = (struct fadump_reg_entry *)vaddr; + + /* Allocate buffer to hold cpu crash notes. */ + fw_dump.cpu_notes_buf_size = num_cpus * sizeof(note_buf_t); + fw_dump.cpu_notes_buf_size = PAGE_ALIGN(fw_dump.cpu_notes_buf_size); + note_buf = fadump_cpu_notes_buf_alloc(fw_dump.cpu_notes_buf_size); + if (!note_buf) { + printk(KERN_ERR "Failed to allocate 0x%lx bytes for " + "cpu notes buffer\n", fw_dump.cpu_notes_buf_size); + return -ENOMEM; + } + fw_dump.cpu_notes_buf = __pa(note_buf); + + pr_debug("Allocated buffer for cpu notes of size %ld at %p\n", + (num_cpus * sizeof(note_buf_t)), note_buf); + + if (fw_dump.fadumphdr_addr) + fdh = __va(fw_dump.fadumphdr_addr); + + for (i = 0; i < num_cpus; i++) { + if (reg_entry->reg_id != REG_ID("CPUSTRT")) { + printk(KERN_ERR "Unable to read CPU state data\n"); + rc = -ENOENT; + goto error_out; + } + /* Lower 4 bytes of reg_value contains logical cpu id */ + cpu = reg_entry->reg_value & FADUMP_CPU_ID_MASK; + if (!cpumask_test_cpu(cpu, &fdh->cpu_online_mask)) { + SKIP_TO_NEXT_CPU(reg_entry); + continue; + } + pr_debug("Reading register data for cpu %d...\n", cpu); + if (fdh && fdh->crashing_cpu == cpu) { + regs = fdh->regs; + note_buf = fadump_regs_to_elf_notes(note_buf, ®s); + SKIP_TO_NEXT_CPU(reg_entry); + } else { + reg_entry++; + reg_entry = fadump_read_registers(reg_entry, ®s); + note_buf = fadump_regs_to_elf_notes(note_buf, ®s); + } + } + fadump_final_note(note_buf); + + pr_debug("Updating elfcore header (%llx) with cpu notes\n", + fdh->elfcorehdr_addr); + fadump_update_elfcore_header((char *)__va(fdh->elfcorehdr_addr)); + return 0; + +error_out: + fadump_cpu_notes_buf_free((unsigned long)__va(fw_dump.cpu_notes_buf), + fw_dump.cpu_notes_buf_size); + fw_dump.cpu_notes_buf = 0; + fw_dump.cpu_notes_buf_size = 0; + return rc; + +} + +/* + * Validate and process the dump data stored by firmware before exporting + * it through '/proc/vmcore'. + */ +static int __init process_fadump(const struct fadump_mem_struct *fdm_active) +{ + struct fadump_crash_info_header *fdh; + int rc = 0; + + if (!fdm_active || !fw_dump.fadumphdr_addr) + return -EINVAL; + + /* Check if the dump data is valid. */ + if ((fdm_active->header.dump_status_flag == FADUMP_ERROR_FLAG) || + (fdm_active->cpu_state_data.error_flags != 0) || + (fdm_active->rmr_region.error_flags != 0)) { + printk(KERN_ERR "Dump taken by platform is not valid\n"); + return -EINVAL; + } + if ((fdm_active->rmr_region.bytes_dumped != + fdm_active->rmr_region.source_len) || + !fdm_active->cpu_state_data.bytes_dumped) { + printk(KERN_ERR "Dump taken by platform is incomplete\n"); + return -EINVAL; + } + + /* Validate the fadump crash info header */ + fdh = __va(fw_dump.fadumphdr_addr); + if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { + printk(KERN_ERR "Crash info header is not valid.\n"); + return -EINVAL; + } + + rc = fadump_build_cpu_notes(fdm_active); + if (rc) + return rc; + + /* + * We are done validating dump info and elfcore header is now ready + * to be exported. set elfcorehdr_addr so that vmcore module will + * export the elfcore header through '/proc/vmcore'. + */ + elfcorehdr_addr = fdh->elfcorehdr_addr; + + return 0; +} + +static inline void fadump_add_crash_memory(unsigned long long base, + unsigned long long end) +{ + if (base == end) + return; + + pr_debug("crash_memory_range[%d] [%#016llx-%#016llx], %#llx bytes\n", + crash_mem_ranges, base, end - 1, (end - base)); + crash_memory_ranges[crash_mem_ranges].base = base; + crash_memory_ranges[crash_mem_ranges].size = end - base; + crash_mem_ranges++; +} + +static void fadump_exclude_reserved_area(unsigned long long start, + unsigned long long end) +{ + unsigned long long ra_start, ra_end; + + ra_start = fw_dump.reserve_dump_area_start; + ra_end = ra_start + fw_dump.reserve_dump_area_size; + + if ((ra_start < end) && (ra_end > start)) { + if ((start < ra_start) && (end > ra_end)) { + fadump_add_crash_memory(start, ra_start); + fadump_add_crash_memory(ra_end, end); + } else if (start < ra_start) { + fadump_add_crash_memory(start, ra_start); + } else if (ra_end < end) { + fadump_add_crash_memory(ra_end, end); + } + } else + fadump_add_crash_memory(start, end); +} + +static int fadump_init_elfcore_header(char *bufp) +{ + struct elfhdr *elf; + + elf = (struct elfhdr *) bufp; + bufp += sizeof(struct elfhdr); + memcpy(elf->e_ident, ELFMAG, SELFMAG); + elf->e_ident[EI_CLASS] = ELF_CLASS; + elf->e_ident[EI_DATA] = ELF_DATA; + elf->e_ident[EI_VERSION] = EV_CURRENT; + elf->e_ident[EI_OSABI] = ELF_OSABI; + memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); + elf->e_type = ET_CORE; + elf->e_machine = ELF_ARCH; + elf->e_version = EV_CURRENT; + elf->e_entry = 0; + elf->e_phoff = sizeof(struct elfhdr); + elf->e_shoff = 0; + elf->e_flags = ELF_CORE_EFLAGS; + elf->e_ehsize = sizeof(struct elfhdr); + elf->e_phentsize = sizeof(struct elf_phdr); + elf->e_phnum = 0; + elf->e_shentsize = 0; + elf->e_shnum = 0; + elf->e_shstrndx = 0; + + return 0; +} + +/* + * Traverse through lmb structure and setup crash memory ranges. These + * ranges will be used create PT_LOAD program headers in elfcore header. + */ +static void fadump_setup_crash_memory_ranges(void) +{ + struct lmb_property *reg; + unsigned long long start, end; + unsigned long iter; + + pr_debug("Setup crash memory ranges.\n"); + crash_mem_ranges = 0; + /* + * add the first memory chunk (RMA_START through boot_memory_size) as + * a separate memory chunk. The reason is, at the time crash firmware + * will move the content of this memory chunk to different location + * specified during fadump registration. We need to create a separate + * program header for this chunk with the correct offset. + */ + fadump_add_crash_memory(RMA_START, fw_dump.boot_memory_size); + + for_each_lmb(iter, memory, reg) { + start = (unsigned long long)reg->base; + end = start + (unsigned long long)reg->size; + if (start == RMA_START && end >= fw_dump.boot_memory_size) + start = fw_dump.boot_memory_size; + + /* add this range excluding the reserved dump area. */ + fadump_exclude_reserved_area(start, end); + } +} + +/* + * If the given physical address falls within the boot memory region then + * return the relocated address that points to the dump region reserved + * for saving initial boot memory contents. + */ +static inline unsigned long fadump_relocate(unsigned long paddr) +{ + if (paddr > RMA_START && paddr < fw_dump.boot_memory_size) + return fdm.rmr_region.destination_address + paddr; + else + return paddr; +} + +static int fadump_create_elfcore_headers(char *bufp) +{ + struct elfhdr *elf; + struct elf_phdr *phdr; + int i; + + fadump_init_elfcore_header(bufp); + elf = (struct elfhdr *)bufp; + bufp += sizeof(struct elfhdr); + + /* + * setup ELF PT_NOTE, place holder for cpu notes info. The notes info + * will be populated during second kernel boot after crash. Hence + * this PT_NOTE will always be the first elf note. + * + * NOTE: Any new ELF note addition should be placed after this note. + */ + phdr = (struct elf_phdr *)bufp; + bufp += sizeof(struct elf_phdr); + phdr->p_type = PT_NOTE; + phdr->p_flags = 0; + phdr->p_vaddr = 0; + phdr->p_align = 0; + + phdr->p_offset = 0; + phdr->p_paddr = 0; + phdr->p_filesz = 0; + phdr->p_memsz = 0; + + (elf->e_phnum)++; + + /* setup ELF PT_NOTE for vmcoreinfo */ + phdr = (struct elf_phdr *)bufp; + bufp += sizeof(struct elf_phdr); + phdr->p_type = PT_NOTE; + phdr->p_flags = 0; + phdr->p_vaddr = 0; + phdr->p_align = 0; + + phdr->p_paddr = fadump_relocate(paddr_vmcoreinfo_note()); + phdr->p_offset = phdr->p_paddr; + phdr->p_memsz = vmcoreinfo_max_size; + phdr->p_filesz = vmcoreinfo_max_size; + + /* Increment number of program headers. */ + (elf->e_phnum)++; + + /* setup PT_LOAD sections. */ + + for (i = 0; i < crash_mem_ranges; i++) { + unsigned long long mbase, msize; + mbase = crash_memory_ranges[i].base; + msize = crash_memory_ranges[i].size; + + if (!msize) + continue; + + phdr = (struct elf_phdr *)bufp; + bufp += sizeof(struct elf_phdr); + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = mbase; + + if (mbase == RMA_START) { + /* + * The entire RMA region will be moved by firmware + * to the specified destination_address. Hence set + * the correct offset. + */ + phdr->p_offset = fdm.rmr_region.destination_address; + } + + phdr->p_paddr = mbase; + phdr->p_vaddr = (unsigned long)__va(mbase); + phdr->p_filesz = msize; + phdr->p_memsz = msize; + phdr->p_align = 0; + + /* Increment number of program headers. */ + (elf->e_phnum)++; + } + return 0; +} + +static unsigned long init_fadump_header(unsigned long addr) +{ + struct fadump_crash_info_header *fdh; + + if (!addr) + return 0; + + fw_dump.fadumphdr_addr = addr; + fdh = __va(addr); + addr += sizeof(struct fadump_crash_info_header); + + memset(fdh, 0, sizeof(struct fadump_crash_info_header)); + fdh->magic_number = FADUMP_CRASH_INFO_MAGIC; + fdh->elfcorehdr_addr = addr; + /* We will set the crashing cpu id in crash_fadump() during crash. */ + fdh->crashing_cpu = CPU_UNKNOWN; + + return addr; +} + +static void register_fadump(void) +{ + unsigned long addr; + void *vaddr; + + /* + * If no memory is reserved then we can not register for firmware- + * assisted dump. + */ + if (!fw_dump.reserve_dump_area_size) + return; + + fadump_setup_crash_memory_ranges(); + + addr = fdm.rmr_region.destination_address + fdm.rmr_region.source_len; + /* Initialize fadump crash info header. */ + addr = init_fadump_header(addr); + vaddr = __va(addr); + + pr_debug("Creating ELF core headers at %#016lx\n", addr); + fadump_create_elfcore_headers(vaddr); + + /* register the future kernel dump with firmware. */ + register_fw_dump(&fdm); +} + +static int fadump_unregister_dump(struct fadump_mem_struct *fdm) +{ + int rc = 0; + unsigned int wait_time; + + pr_debug("Un-register firmware-assisted dump\n"); + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, + FADUMP_UNREGISTER, fdm, + sizeof(struct fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + } while (wait_time); + + if (rc) { + printk(KERN_ERR "Failed to un-register firmware-assisted dump." + " unexpected error(%d).\n", rc); + return rc; + } + fw_dump.dump_registered = 0; + return 0; +} + +static int fadump_invalidate_dump(struct fadump_mem_struct *fdm) +{ + int rc = 0; + unsigned int wait_time; + + pr_debug("Invalidating firmware-assisted dump registration\n"); + + /* TODO: Add upper time limit for the delay */ + do { + rc = rtas_call(fw_dump.ibm_configure_kernel_dump, 3, 1, NULL, + FADUMP_INVALIDATE, fdm, + sizeof(struct fadump_mem_struct)); + + wait_time = rtas_busy_delay_time(rc); + if (wait_time) + mdelay(wait_time); + } while (wait_time); + + if (rc) { + printk(KERN_ERR "Failed to invalidate firmware-assisted dump " + "rgistration. unexpected error(%d).\n", rc); + return rc; + } + fw_dump.dump_active = 0; + fdm_active = NULL; + return 0; +} + +void fadump_cleanup(void) +{ + /* Invalidate the registration only if dump is active. */ + if (fw_dump.dump_active) { + init_fadump_mem_struct(&fdm, + fdm_active->cpu_state_data.destination_address); + fadump_invalidate_dump(&fdm); + } +} + +/* + * Release the memory that was reserved in early boot to preserve the memory + * contents. The released memory will be available for general use. + */ +static void fadump_release_memory(unsigned long begin, unsigned long end) +{ + unsigned long addr; + unsigned long ra_start, ra_end; + + ra_start = fw_dump.reserve_dump_area_start; + ra_end = ra_start + fw_dump.reserve_dump_area_size; + + for (addr = begin; addr < end; addr += PAGE_SIZE) { + /* + * exclude the dump reserve area. Will reuse it for next + * fadump registration. + */ + if (addr <= ra_end && ((addr + PAGE_SIZE) > ra_start)) + continue; + + ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); + init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); + free_page((unsigned long)__va(addr)); + totalram_pages++; + } +} + +static void fadump_invalidate_release_mem(void) +{ + unsigned long reserved_area_start, reserved_area_end; + unsigned long destination_address; + + mutex_lock(&fadump_mutex); + if (!fw_dump.dump_active) { + mutex_unlock(&fadump_mutex); + return; + } + + destination_address = fdm_active->cpu_state_data.destination_address; + fadump_cleanup(); + mutex_unlock(&fadump_mutex); + + /* + * Save the current reserved memory bounds we will require them + * later for releasing the memory for general use. + */ + reserved_area_start = fw_dump.reserve_dump_area_start; + reserved_area_end = reserved_area_start + + fw_dump.reserve_dump_area_size; + /* + * Setup reserve_dump_area_start and its size so that we can + * reuse this reserved memory for Re-registration. + */ + fw_dump.reserve_dump_area_start = destination_address; + fw_dump.reserve_dump_area_size = get_fadump_area_size(); + + fadump_release_memory(reserved_area_start, reserved_area_end); + if (fw_dump.cpu_notes_buf) { + fadump_cpu_notes_buf_free( + (unsigned long)__va(fw_dump.cpu_notes_buf), + fw_dump.cpu_notes_buf_size); + fw_dump.cpu_notes_buf = 0; + fw_dump.cpu_notes_buf_size = 0; + } + /* Initialize the kernel dump memory structure for FAD registration. */ + init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); +} + +static ssize_t fadump_release_memory_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!fw_dump.dump_active) + return -EPERM; + + if (buf[0] == '1') { + /* + * Take away the '/proc/vmcore'. We are releasing the dump + * memory, hence it will not be valid anymore. + */ + vmcore_cleanup(); + fadump_invalidate_release_mem(); + + } else + return -EINVAL; + return count; +} + +static ssize_t fadump_enabled_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", fw_dump.fadump_enabled); +} + +static ssize_t fadump_register_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", fw_dump.dump_registered); +} + +static ssize_t fadump_register_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int ret = 0; + + if (!fw_dump.fadump_enabled || fdm_active) + return -EPERM; + + mutex_lock(&fadump_mutex); + + switch (buf[0]) { + case '0': + if (fw_dump.dump_registered == 0) { + ret = -EINVAL; + goto unlock_out; + } + /* Un-register Firmware-assisted dump */ + fadump_unregister_dump(&fdm); + break; + case '1': + if (fw_dump.dump_registered == 1) { + ret = -EINVAL; + goto unlock_out; + } + /* Register Firmware-assisted dump */ + register_fadump(); + break; + default: + ret = -EINVAL; + break; + } + +unlock_out: + mutex_unlock(&fadump_mutex); + return ret < 0 ? ret : count; +} + +static int fadump_region_show(struct seq_file *m, void *private) +{ + const struct fadump_mem_struct *fdm_ptr; + + if (!fw_dump.fadump_enabled) + return 0; + + mutex_lock(&fadump_mutex); + if (fdm_active) + fdm_ptr = fdm_active; + else { + mutex_unlock(&fadump_mutex); + fdm_ptr = &fdm; + } + + seq_printf(m, + "CPU : [%#016llx-%#016llx] %#llx bytes, " + "Dumped: %#llx\n", + fdm_ptr->cpu_state_data.destination_address, + fdm_ptr->cpu_state_data.destination_address + + fdm_ptr->cpu_state_data.source_len - 1, + fdm_ptr->cpu_state_data.source_len, + fdm_ptr->cpu_state_data.bytes_dumped); + seq_printf(m, + "HPTE: [%#016llx-%#016llx] %#llx bytes, " + "Dumped: %#llx\n", + fdm_ptr->hpte_region.destination_address, + fdm_ptr->hpte_region.destination_address + + fdm_ptr->hpte_region.source_len - 1, + fdm_ptr->hpte_region.source_len, + fdm_ptr->hpte_region.bytes_dumped); + seq_printf(m, + "DUMP: [%#016llx-%#016llx] %#llx bytes, " + "Dumped: %#llx\n", + fdm_ptr->rmr_region.destination_address, + fdm_ptr->rmr_region.destination_address + + fdm_ptr->rmr_region.source_len - 1, + fdm_ptr->rmr_region.source_len, + fdm_ptr->rmr_region.bytes_dumped); + + if (!fdm_active || + (fw_dump.reserve_dump_area_start == + fdm_ptr->cpu_state_data.destination_address)) + goto out; + + /* Dump is active. Show reserved memory region. */ + seq_printf(m, + " : [%#016llx-%#016llx] %#llx bytes, " + "Dumped: %#llx\n", + (unsigned long long)fw_dump.reserve_dump_area_start, + fdm_ptr->cpu_state_data.destination_address - 1, + fdm_ptr->cpu_state_data.destination_address - + fw_dump.reserve_dump_area_start, + fdm_ptr->cpu_state_data.destination_address - + fw_dump.reserve_dump_area_start); +out: + if (fdm_active) + mutex_unlock(&fadump_mutex); + return 0; +} + +static struct kobj_attribute fadump_release_attr = __ATTR(fadump_release_mem, + 0200, NULL, + fadump_release_memory_store); +static struct kobj_attribute fadump_attr = __ATTR(fadump_enabled, + 0444, fadump_enabled_show, + NULL); +static struct kobj_attribute fadump_register_attr = __ATTR(fadump_registered, + 0644, fadump_register_show, + fadump_register_store); + +static int fadump_region_open(struct inode *inode, struct file *file) +{ + return single_open(file, fadump_region_show, inode->i_private); +} + +static const struct file_operations fadump_region_fops = { + .open = fadump_region_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void fadump_init_files(void) +{ + struct dentry *debugfs_file; + int rc = 0; + + rc = sysfs_create_file(kernel_kobj, &fadump_attr.attr); + if (rc) + printk(KERN_ERR "fadump: unable to create sysfs file" + " fadump_enabled (%d)\n", rc); + + rc = sysfs_create_file(kernel_kobj, &fadump_register_attr.attr); + if (rc) + printk(KERN_ERR "fadump: unable to create sysfs file" + " fadump_registered (%d)\n", rc); + + debugfs_file = debugfs_create_file("fadump_region", 0444, + powerpc_debugfs_root, NULL, + &fadump_region_fops); + if (!debugfs_file) + printk(KERN_ERR "fadump: unable to create debugfs file" + " fadump_region\n"); + + if (fw_dump.dump_active) { + rc = sysfs_create_file(kernel_kobj, &fadump_release_attr.attr); + if (rc) + printk(KERN_ERR "fadump: unable to create sysfs file" + " fadump_release_mem (%d)\n", rc); + } + return; +} + +/* + * Prepare for firmware-assisted dump. + */ +int __init setup_fadump(void) +{ + if (!fw_dump.fadump_enabled) + return 0; + + if (!fw_dump.fadump_supported) { + printk(KERN_ERR "Firmware-assisted dump is not supported on" + " this hardware\n"); + return 0; + } + + fadump_show_config(); + /* + * If dump data is available then see if it is valid and prepare for + * saving it to the disk. + */ + if (fw_dump.dump_active) { + /* + * if dump process fails then invalidate the registration + * and release memory before proceeding for re-registration. + */ + if (process_fadump(fdm_active) < 0) + fadump_invalidate_release_mem(); + } + /* Initialize the kernel dump memory structure for FAD registration. */ + else if (fw_dump.reserve_dump_area_size) + init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); + fadump_init_files(); + + return 1; +} +subsys_initcall(setup_fadump); diff -uprN linux-2.6.32/arch/powerpc/kernel/head_64.S linux-2.6.32.ovz/arch/powerpc/kernel/head_64.S --- linux-2.6.32/arch/powerpc/kernel/head_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/head_64.S 2015-03-10 13:24:04.000000000 -0700 @@ -478,6 +478,7 @@ _GLOBAL(copy_and_flush) sync addi r5,r5,8 addi r6,r6,8 + isync blr .align 8 @@ -563,15 +564,21 @@ __secondary_start: /* Set thread priority to MEDIUM */ HMT_MEDIUM - /* Do early setup for that CPU (stab, slb, hash table pointer) */ - bl .early_setup_secondary - /* Initialize the kernel stack. Just a repeat for iSeries. */ LOAD_REG_ADDR(r3, current_set) sldi r28,r24,3 /* get current_set[cpu#] */ - ldx r1,r3,r28 - addi r1,r1,THREAD_SIZE-STACK_FRAME_OVERHEAD - std r1,PACAKSAVE(r13) + ldx r14,r3,r28 + addi r14,r14,THREAD_SIZE-STACK_FRAME_OVERHEAD + std r14,PACAKSAVE(r13) + + /* Do early setup for that CPU (stab, slb, hash table pointer) */ + bl .early_setup_secondary + + /* + * setup the new stack pointer, but *don't* use this until + * translation is on. + */ + mr r1, r14 /* Clear backchain so we get nice backtraces */ li r7,0 @@ -608,6 +615,17 @@ _GLOBAL(start_secondary_prolog) std r3,0(r1) /* Zero the stack frame pointer */ bl .start_secondary b . +/* + * Reset stack pointer and call start_secondary + * to continue with online operation when woken up + * from cede in cpu offline. + */ +_GLOBAL(start_secondary_resume) + ld r1,PACAKSAVE(r13) /* Reload kernel stack pointer */ + li r3,0 + std r3,0(r1) /* Zero the stack frame pointer */ + bl .start_secondary + b . #endif /* diff -uprN linux-2.6.32/arch/powerpc/kernel/iommu.c linux-2.6.32.ovz/arch/powerpc/kernel/iommu.c --- linux-2.6.32/arch/powerpc/kernel/iommu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/iommu.c 2015-03-10 13:24:04.000000000 -0700 @@ -33,12 +33,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #define DBG(...) @@ -74,6 +76,26 @@ static int __init setup_iommu(char *str) __setup("protect4gb=", setup_protect4gb); __setup("iommu=", setup_iommu); +static DEFINE_PER_CPU(unsigned int, iommu_pool_hash); + +/* + * We precalculate the hash to avoid doing it on every allocation. + * + * The hash is important to spread CPUs across all the pools. For example, + * on a POWER7 with 4 way SMT we want interrupts on the primary threads and + * with 4 pools all primary threads would map to the same pool. + */ +static int __init setup_iommu_pool_hash(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS); + + return 0; +} +subsys_initcall(setup_iommu_pool_hash); + static unsigned long iommu_range_alloc(struct device *dev, struct iommu_table *tbl, unsigned long npages, @@ -87,6 +109,9 @@ static unsigned long iommu_range_alloc(s int pass = 0; unsigned long align_mask; unsigned long boundary_size; + unsigned long flags; + unsigned int pool_nr; + struct iommu_pool *pool; align_mask = 0xffffffffffffffffl >> (64 - align_order); @@ -99,36 +124,49 @@ static unsigned long iommu_range_alloc(s return DMA_ERROR_CODE; } - if (handle && *handle) - start = *handle; + /* + * We don't need to disable preemption here because any CPU can + * safely use any IOMMU pool. + */ + pool_nr = __raw_get_cpu_var(iommu_pool_hash) & (tbl->nr_pools - 1); + + if (largealloc) + pool = &(tbl->large_pool); else - start = largealloc ? tbl->it_largehint : tbl->it_hint; + pool = &(tbl->pools[pool_nr]); - /* Use only half of the table for small allocs (15 pages or less) */ - limit = largealloc ? tbl->it_size : tbl->it_halfpoint; + spin_lock_irqsave(&(pool->lock), flags); - if (largealloc && start < tbl->it_halfpoint) - start = tbl->it_halfpoint; +again: + if ((pass == 0) && handle && *handle && + (*handle >= pool->start) && (*handle < pool->end)) + start = *handle; + else + start = pool->hint; + + limit = pool->end; /* The case below can happen if we have a small segment appended * to a large, or when the previous alloc was at the very end of * the available space. If so, go back to the initial start. */ if (start >= limit) - start = largealloc ? tbl->it_largehint : tbl->it_hint; - - again: + start = pool->start; if (limit + tbl->it_offset > mask) { limit = mask - tbl->it_offset + 1; /* If we're constrained on address range, first try * at the masked hint to avoid O(n) search complexity, - * but on second pass, start at 0. + * but on second pass, start at 0 in pool 0. */ - if ((start & mask) >= limit || pass > 0) - start = 0; - else + if ((start & mask) >= limit || pass > 0) { + spin_unlock(&(pool->lock)); + pool = &(tbl->pools[0]); + spin_lock(&(pool->lock)); + start = pool->start; + } else { start &= mask; + } } if (dev) @@ -142,16 +180,25 @@ static unsigned long iommu_range_alloc(s tbl->it_offset, boundary_size >> IOMMU_PAGE_SHIFT, align_mask); if (n == -1) { - if (likely(pass < 2)) { - /* First failure, just rescan the half of the table. - * Second failure, rescan the other half of the table. - */ - start = (largealloc ^ pass) ? tbl->it_halfpoint : 0; - limit = pass ? tbl->it_size : limit; + if (likely(pass == 0)) { + /* First try the pool from the start */ + pool->hint = pool->start; + pass++; + goto again; + + } else if (pass <= tbl->nr_pools) { + /* Now try scanning all the other pools */ + spin_unlock(&(pool->lock)); + pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1); + pool = &tbl->pools[pool_nr]; + spin_lock(&(pool->lock)); + pool->hint = pool->start; pass++; goto again; + } else { - /* Third failure, give up */ + /* Give up */ + spin_unlock_irqrestore(&(pool->lock), flags); return DMA_ERROR_CODE; } } @@ -161,10 +208,10 @@ static unsigned long iommu_range_alloc(s /* Bump the hint to a new block for small allocs. */ if (largealloc) { /* Don't bump to new block to avoid fragmentation */ - tbl->it_largehint = end; + pool->hint = end; } else { /* Overflow will be taken care of at the next allocation */ - tbl->it_hint = (end + tbl->it_blocksize - 1) & + pool->hint = (end + tbl->it_blocksize - 1) & ~(tbl->it_blocksize - 1); } @@ -172,6 +219,8 @@ static unsigned long iommu_range_alloc(s if (handle) *handle = end; + spin_unlock_irqrestore(&(pool->lock), flags); + return n; } @@ -181,18 +230,14 @@ static dma_addr_t iommu_alloc(struct dev unsigned long mask, unsigned int align_order, struct dma_attrs *attrs) { - unsigned long entry, flags; + unsigned long entry; dma_addr_t ret = DMA_ERROR_CODE; int build_fail; - spin_lock_irqsave(&(tbl->it_lock), flags); - entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order); - if (unlikely(entry == DMA_ERROR_CODE)) { - spin_unlock_irqrestore(&(tbl->it_lock), flags); + if (unlikely(entry == DMA_ERROR_CODE)) return DMA_ERROR_CODE; - } entry += tbl->it_offset; /* Offset into real TCE table */ ret = entry << IOMMU_PAGE_SHIFT; /* Set the return dma address */ @@ -209,8 +254,6 @@ static dma_addr_t iommu_alloc(struct dev */ if (unlikely(build_fail)) { __iommu_free(tbl, ret, npages); - - spin_unlock_irqrestore(&(tbl->it_lock), flags); return DMA_ERROR_CODE; } @@ -218,16 +261,14 @@ static dma_addr_t iommu_alloc(struct dev if (ppc_md.tce_flush) ppc_md.tce_flush(tbl); - spin_unlock_irqrestore(&(tbl->it_lock), flags); - /* Make sure updates are seen by hardware */ mb(); return ret; } -static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, - unsigned int npages) +static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) { unsigned long entry, free_entry; @@ -247,20 +288,57 @@ static void __iommu_free(struct iommu_ta printk(KERN_INFO "\tindex = 0x%llx\n", (u64)tbl->it_index); WARN_ON(1); } - return; + + return false; } - ppc_md.tce_free(tbl, entry, npages); - iommu_area_free(tbl->it_map, free_entry, npages); + return true; } -static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, - unsigned int npages) +static struct iommu_pool *get_pool(struct iommu_table *tbl, + unsigned long entry) { + struct iommu_pool *p; + unsigned long largepool_start = tbl->large_pool.start; + + /* The large pool is the last pool at the top of the table */ + if (entry >= largepool_start) { + p = &tbl->large_pool; + } else { + unsigned int pool_nr = entry / tbl->poolsize; + + BUG_ON(pool_nr > tbl->nr_pools); + p = &tbl->pools[pool_nr]; + } + + return p; +} + +static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ + unsigned long entry, free_entry; unsigned long flags; + struct iommu_pool *pool; - spin_lock_irqsave(&(tbl->it_lock), flags); + entry = dma_addr >> IOMMU_PAGE_SHIFT; + free_entry = entry - tbl->it_offset; + + pool = get_pool(tbl, free_entry); + + if (!iommu_free_check(tbl, dma_addr, npages)) + return; + + ppc_md.tce_free(tbl, entry, npages); + + spin_lock_irqsave(&(pool->lock), flags); + bitmap_clear(tbl->it_map, free_entry, npages); + spin_unlock_irqrestore(&(pool->lock), flags); +} +static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, + unsigned int npages) +{ __iommu_free(tbl, dma_addr, npages); /* Make sure TLB cache is flushed if the HW needs it. We do @@ -269,8 +347,6 @@ static void iommu_free(struct iommu_tabl */ if (ppc_md.tce_flush) ppc_md.tce_flush(tbl); - - spin_unlock_irqrestore(&(tbl->it_lock), flags); } int iommu_map_sg(struct device *dev, struct iommu_table *tbl, @@ -279,7 +355,6 @@ int iommu_map_sg(struct device *dev, str struct dma_attrs *attrs) { dma_addr_t dma_next = 0, dma_addr; - unsigned long flags; struct scatterlist *s, *outs, *segstart; int outcount, incount, i, build_fail = 0; unsigned int align; @@ -301,8 +376,6 @@ int iommu_map_sg(struct device *dev, str DBG("sg mapping %d elements:\n", nelems); - spin_lock_irqsave(&(tbl->it_lock), flags); - max_seg_size = dma_get_max_seg_size(dev); for_each_sg(sglist, s, nelems, i) { unsigned long vaddr, npages, entry, slen; @@ -384,8 +457,6 @@ int iommu_map_sg(struct device *dev, str if (ppc_md.tce_flush) ppc_md.tce_flush(tbl); - spin_unlock_irqrestore(&(tbl->it_lock), flags); - DBG("mapped %d elements:\n", outcount); /* For the sake of iommu_unmap_sg, we clear out the length in the @@ -417,7 +488,6 @@ int iommu_map_sg(struct device *dev, str if (s == outs) break; } - spin_unlock_irqrestore(&(tbl->it_lock), flags); return 0; } @@ -427,15 +497,12 @@ void iommu_unmap_sg(struct iommu_table * struct dma_attrs *attrs) { struct scatterlist *sg; - unsigned long flags; BUG_ON(direction == DMA_NONE); if (!tbl) return; - spin_lock_irqsave(&(tbl->it_lock), flags); - sg = sglist; while (nelems--) { unsigned int npages; @@ -455,13 +522,16 @@ void iommu_unmap_sg(struct iommu_table * */ if (ppc_md.tce_flush) ppc_md.tce_flush(tbl); - - spin_unlock_irqrestore(&(tbl->it_lock), flags); } static void iommu_table_clear(struct iommu_table *tbl) { - if (!is_kdump_kernel()) { + /* + * In case of firmware assisted dump system goes through clean + * reboot process at the time of system crash. Hence it's safe to + * clear the TCE entries if firmware assisted dump is active. + */ + if (!is_kdump_kernel() || is_fadump_active()) { /* Clear the table in case firmware left allocations in it */ ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size); return; @@ -504,22 +574,48 @@ struct iommu_table *iommu_init_table(str unsigned long sz; static int welcomed = 0; struct page *page; - - /* Set aside 1/4 of the table for large allocations. */ - tbl->it_halfpoint = tbl->it_size * 3 / 4; + unsigned int i; + struct iommu_pool *p; /* number of bytes needed for the bitmap */ sz = (tbl->it_size + 7) >> 3; - page = alloc_pages_node(nid, GFP_ATOMIC, get_order(sz)); + page = alloc_pages_node(nid, GFP_KERNEL, get_order(sz)); if (!page) panic("iommu_init_table: Can't allocate %ld bytes\n", sz); tbl->it_map = page_address(page); memset(tbl->it_map, 0, sz); - tbl->it_hint = 0; - tbl->it_largehint = tbl->it_halfpoint; - spin_lock_init(&tbl->it_lock); + /* + * Reserve page 0 so it will not be used for any mappings. + * This avoids buggy drivers that consider page 0 to be invalid + * to crash the machine or even lose data. + */ + if (tbl->it_offset == 0) + set_bit(0, tbl->it_map); + + /* We only split the IOMMU table if we have 1GB or more of space */ + if ((tbl->it_size << IOMMU_PAGE_SHIFT) >= (1UL * 1024 * 1024 * 1024)) + tbl->nr_pools = IOMMU_NR_POOLS; + else + tbl->nr_pools = 1; + + /* We reserve the top 1/4 of the table for large allocations */ + tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools; + + for (i = 0; i < tbl->nr_pools; i++) { + p = &tbl->pools[i]; + spin_lock_init(&(p->lock)); + p->start = tbl->poolsize * i; + p->hint = p->start; + p->end = p->start + tbl->poolsize; + } + + p = &tbl->large_pool; + spin_lock_init(&(p->lock)); + p->start = tbl->poolsize * i; + p->hint = p->start; + p->end = tbl->it_size; iommu_table_clear(tbl); diff -uprN linux-2.6.32/arch/powerpc/kernel/irq.c linux-2.6.32.ovz/arch/powerpc/kernel/irq.c --- linux-2.6.32/arch/powerpc/kernel/irq.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/irq.c 2015-03-10 13:21:27.000000000 -0700 @@ -53,7 +53,6 @@ #include #include #include -#include #include #include @@ -138,11 +137,6 @@ notrace void raw_local_irq_restore(unsig } #endif /* CONFIG_PPC_STD_MMU_64 */ - if (test_perf_event_pending()) { - clear_perf_event_pending(); - perf_event_do_pending(); - } - /* * if (get_paca()->hard_enabled) return; * But again we need to take care that gcc gets hard_enabled directly diff -uprN linux-2.6.32/arch/powerpc/kernel/lparcfg.c linux-2.6.32.ovz/arch/powerpc/kernel/lparcfg.c --- linux-2.6.32/arch/powerpc/kernel/lparcfg.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/lparcfg.c 2015-03-10 13:22:22.000000000 -0700 @@ -37,7 +37,7 @@ #include #include -#define MODULE_VERS "1.8" +#define MODULE_VERS "1.9" #define MODULE_NAME "lparcfg" /* #define LPARCFG_DEBUG */ @@ -131,34 +131,6 @@ static int iseries_lparcfg_data(struct s /* * Methods used to fetch LPAR data when running on a pSeries platform. */ -/** - * h_get_mpp - * H_GET_MPP hcall returns info in 7 parms - */ -int h_get_mpp(struct hvcall_mpp_data *mpp_data) -{ - int rc; - unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; - - rc = plpar_hcall9(H_GET_MPP, retbuf); - - mpp_data->entitled_mem = retbuf[0]; - mpp_data->mapped_mem = retbuf[1]; - - mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff; - mpp_data->pool_num = retbuf[2] & 0xffff; - - mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff; - mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff; - mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff; - - mpp_data->pool_size = retbuf[4]; - mpp_data->loan_request = retbuf[5]; - mpp_data->backing_mem = retbuf[6]; - - return rc; -} -EXPORT_SYMBOL(h_get_mpp); struct hvcall_ppp_data { u64 entitlement; @@ -344,6 +316,30 @@ static void parse_mpp_data(struct seq_fi seq_printf(m, "backing_memory=%ld bytes\n", mpp_data.backing_mem); } +/** + * parse_mpp_x_data + * Parse out data returned from h_get_mpp_x + */ +static void parse_mpp_x_data(struct seq_file *m) +{ + struct hvcall_mpp_x_data mpp_x_data; + + if (!firmware_has_feature(FW_FEATURE_XCMO)) + return; + if (h_get_mpp_x(&mpp_x_data)) + return; + + seq_printf(m, "coalesced_bytes=%ld\n", mpp_x_data.coalesced_bytes); + + if (mpp_x_data.pool_coalesced_bytes) + seq_printf(m, "pool_coalesced_bytes=%ld\n", + mpp_x_data.pool_coalesced_bytes); + if (mpp_x_data.pool_purr_cycles) + seq_printf(m, "coalesce_pool_purr=%ld\n", mpp_x_data.pool_purr_cycles); + if (mpp_x_data.pool_spurr_cycles) + seq_printf(m, "coalesce_pool_spurr=%ld\n", mpp_x_data.pool_spurr_cycles); +} + #define SPLPAR_CHARACTERISTICS_TOKEN 20 #define SPLPAR_MAXLENGTH 1026*(sizeof(char)) @@ -486,6 +482,14 @@ static void splpar_dispatch_data(struct seq_printf(m, "dispatch_dispersions=%lu\n", dispatch_dispersions); } +static void parse_em_data(struct seq_file *m) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + if (plpar_hcall(H_GET_EM_PARMS, retbuf) == H_SUCCESS) + seq_printf(m, "power_mode_data=%016lx\n", retbuf[0]); +} + static int pseries_lparcfg_data(struct seq_file *m, void *v) { int partition_potential_processors; @@ -511,6 +515,7 @@ static int pseries_lparcfg_data(struct s parse_system_parameter_string(m); parse_ppp_data(m); parse_mpp_data(m); + parse_mpp_x_data(m); pseries_cmo_data(m); splpar_dispatch_data(m); @@ -540,6 +545,8 @@ static int pseries_lparcfg_data(struct s seq_printf(m, "slb_size=%d\n", mmu_slb_size); + parse_em_data(m); + return 0; } diff -uprN linux-2.6.32/arch/powerpc/kernel/machine_kexec_64.c linux-2.6.32.ovz/arch/powerpc/kernel/machine_kexec_64.c --- linux-2.6.32/arch/powerpc/kernel/machine_kexec_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/machine_kexec_64.c 2015-03-10 13:22:16.000000000 -0700 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -155,33 +156,41 @@ void kexec_copy_flush(struct kimage *ima #ifdef CONFIG_SMP -/* FIXME: we should schedule this function to be called on all cpus based - * on calling the interrupts, but we would like to call it off irq level - * so that the interrupt controller is clean. - */ +static u8 kexec_all_irq_disabled; +u8 kexec_state[NR_CPUS] = {0}; + static void kexec_smp_down(void *arg) { + int my_cpu = get_cpu(); + + local_irq_disable(); + mb(); /* make sure our irqs are disabled before we say they are */ + kexec_state[my_cpu] = KEXEC_STATE_IRQS_OFF; + while(kexec_all_irq_disabled == 0) + cpu_relax(); + mb(); /* make sure all irqs are disabled before this */ + /* + * Now every CPU has IRQs off, we can clear out any pending + * IPIs and be sure that no more will come in after this. + */ if (ppc_md.kexec_cpu_down) ppc_md.kexec_cpu_down(0, 1); - local_irq_disable(); kexec_smp_wait(); /* NOTREACHED */ } -static void kexec_prepare_cpus(void) +static void kexec_prepare_cpus_wait(int wait_state) { int my_cpu, i, notified=-1; - smp_call_function(kexec_smp_down, NULL, /* wait */0); my_cpu = get_cpu(); - - /* check the others cpus are now down (via paca hw cpu id == -1) */ + /* Make sure each CPU has at least made it to the state we need */ for (i=0; i < NR_CPUS; i++) { if (i == my_cpu) continue; - while (paca[i].hw_cpu_id != -1) { + while (kexec_state[i] < wait_state) { barrier(); if (!cpu_possible(i)) { printk("kexec: cpu %d hw_cpu_id %d is not" @@ -201,20 +210,61 @@ static void kexec_prepare_cpus(void) } if (i != notified) { printk( "kexec: waiting for cpu %d (physical" - " %d) to go down\n", - i, paca[i].hw_cpu_id); + " %d) to enter %i state\n", + i, paca[i].hw_cpu_id, wait_state); notified = i; } } } + mb(); +} + +/* + * We need to make sure each present CPU is online. The next kernel will scan + * the device tree and assume primary threads are online and query secondry + * threads via RTAS to online them if required. If we don't online primary + * threads, they will be stuck. However, we also online secondary threads as we + * may be using 'cede offline'. In this case RTAS doesn't see the secondary + * threads as offline -- and again, these CPUs will be stuck. + * + * So, we online all CPUs that should be running, including secondary threads. + */ +static void wake_offline_cpus(void) +{ + int cpu = 0; + + for_each_present_cpu(cpu) { + if (!cpu_online(cpu)) { + printk(KERN_INFO "kexec: Waking offline cpu %d.\n", + cpu); + cpu_up(cpu); + } + } +} + +static void kexec_prepare_cpus(void) +{ + int my_cpu = get_cpu(); + + wake_offline_cpus(); + smp_call_function(kexec_smp_down, NULL, /* wait */0); + local_irq_disable(); + mb(); /* make sure IRQs are disabled before we say they are */ + kexec_state[my_cpu] = KEXEC_STATE_IRQS_OFF; + + kexec_prepare_cpus_wait(KEXEC_STATE_IRQS_OFF); + /* we are sure every CPU has IRQs off at this point */ + kexec_all_irq_disabled = 1; /* after we tell the others to go down */ if (ppc_md.kexec_cpu_down) ppc_md.kexec_cpu_down(0, 0); - put_cpu(); + /* Before removing MMU mapings, + * make sure all CPUs have entered real mode */ + kexec_prepare_cpus_wait(KEXEC_STATE_REAL_MODE); - local_irq_disable(); + put_cpu(); } #else /* ! SMP */ diff -uprN linux-2.6.32/arch/powerpc/kernel/machine_kexec.c linux-2.6.32.ovz/arch/powerpc/kernel/machine_kexec.c --- linux-2.6.32/arch/powerpc/kernel/machine_kexec.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/machine_kexec.c 2015-06-23 04:16:16.000000000 -0700 @@ -45,6 +45,18 @@ void machine_kexec_cleanup(struct kimage ppc_md.machine_kexec_cleanup(image); } +void arch_crash_save_vmcoreinfo(void) +{ + +#ifdef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(node_data); + VMCOREINFO_LENGTH(node_data, MAX_NUMNODES); +#endif +#ifndef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(contig_page_data); +#endif +} + /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. @@ -61,6 +73,34 @@ void machine_kexec(struct kimage *image) for(;;); } +#ifdef CONFIG_KEXEC_AUTO_RESERVE +unsigned long long __init arch_default_crash_base(void) +{ +#ifndef CONFIG_RELOCATABLE + return KDUMP_KERNELBASE; +#else + return 0; +#endif +} + +unsigned long long __init arch_default_crash_size(unsigned long long total_size) +{ + if (total_size < KEXEC_AUTO_THRESHOLD) + return 0; + if (total_size < (1ULL<<32)) + return 1ULL<<27; + else { +#ifdef CONFIG_64BIT + if (total_size > (1ULL<<37)) /* 128G */ + return 1ULL<<32; /* 4G */ + return 1ULL< 0) { crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; @@ -98,7 +138,7 @@ void __init reserve_crashkernel(void) /* * unspecified address, choose a region of specified size * can overlap with initrd (ignoring corruption when retained) - * ppc64 requires kernel and some stacks to be in first segemnt + * ppc64 requires kernel and some stacks to be in first segment */ crashk_res.start = KDUMP_KERNELBASE; } @@ -109,24 +149,42 @@ void __init reserve_crashkernel(void) PAGE_SIZE); crashk_res.start = crash_base; } - #endif crash_size = PAGE_ALIGN(crash_size); crashk_res.end = crashk_res.start + crash_size - 1; /* The crash region must not overlap the current kernel */ if (overlaps_crashkernel(__pa(_stext), _end - _stext)) { +#ifdef CONFIG_RELOCATABLE + do { + /* Align kdump kernel to 16MB (size of large page) */ + crashk_res.start = ALIGN(crashk_res.start + + (16 * 1024 * 1024), 0x1000000); + if (crashk_res.start + (_end - _stext) > lmb.rmo_size) { + printk(KERN_WARNING + "Not enough memory for crash kernel\n"); + crashk_res.start = crashk_res.end = 0; + return; + } + } while (overlaps_crashkernel(__pa(_stext), _end - _stext)); + + crashk_res.end = crashk_res.start + crash_size - 1; + printk(KERN_INFO + "crash kernel memory overlaps with kernel memory\n" + "Moving it to %ldMB\n", (unsigned long)(crashk_res.start >> 20)); +#else printk(KERN_WARNING "Crash kernel can not overlap current kernel\n"); crashk_res.start = crashk_res.end = 0; return; +#endif } /* Crash kernel trumps memory limit */ if (memory_limit && memory_limit <= crashk_res.end) { memory_limit = crashk_res.end + 1; printk("Adjusted memory limit for crashkernel, now 0x%llx\n", - (unsigned long long)memory_limit); + memory_limit); } printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " @@ -165,6 +223,12 @@ static struct property crashk_size_prop .value = &crashk_size, }; +static struct property memory_limit_prop = { + .name = "linux,memory-limit", + .length = sizeof(unsigned long long), + .value = &memory_limit, +}; + static void __init export_crashk_values(struct device_node *node) { struct property *prop; @@ -184,6 +248,16 @@ static void __init export_crashk_values( crashk_size = crashk_res.end - crashk_res.start + 1; prom_add_property(node, &crashk_size_prop); } + + /* + * memory_limit is required by the kexec-tools to limit the + * crash regions to the actual memory used. + */ + prop = of_find_property(node, memory_limit_prop.name, NULL); + if (!prop) + prom_add_property(node, &memory_limit_prop); + else + prom_update_property(node, &memory_limit_prop, prop); } static int __init kexec_setup(void) diff -uprN linux-2.6.32/arch/powerpc/kernel/Makefile linux-2.6.32.ovz/arch/powerpc/kernel/Makefile --- linux-2.6.32/arch/powerpc/kernel/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/Makefile 2015-03-10 13:24:00.000000000 -0700 @@ -53,6 +53,7 @@ obj-$(CONFIG_IBMVIO) += vio.o obj-$(CONFIG_IBMEBUS) += ibmebus.o obj-$(CONFIG_GENERIC_TBSYNC) += smp-tbsync.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_FA_DUMP) += fadump.o obj-$(CONFIG_E500) += idle_e500.o obj-$(CONFIG_6xx) += idle_6xx.o l2cr_6xx.o cpu_setup_6xx.o obj-$(CONFIG_TAU) += tau_6xx.o @@ -99,7 +100,8 @@ obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_PPC_PERF_CTRS) += perf_event.o perf_callchain.o obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \ - power5+-pmu.o power6-pmu.o power7-pmu.o + power5+-pmu.o power6-pmu.o power7-pmu.o \ + power8-pmu.o obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o diff -uprN linux-2.6.32/arch/powerpc/kernel/misc_64.S linux-2.6.32.ovz/arch/powerpc/kernel/misc_64.S --- linux-2.6.32/arch/powerpc/kernel/misc_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/misc_64.S 2015-03-10 13:24:13.000000000 -0700 @@ -24,6 +24,7 @@ #include #include #include +#include .text @@ -471,6 +472,13 @@ _GLOBAL(kexec_wait) 1: mflr r5 addi r5,r5,kexec_flag-1b + /* set kexec_state for this CPU to real mode */ + /* r3 already contains cpuid here */ + LOAD_REG_ADDR(r4, kexec_state) + li r9,KEXEC_STATE_REAL_MODE + stbx r9, r4, r6 + SYNC + 99: HMT_LOW #ifdef CONFIG_KEXEC /* use no memory without kexec */ lwz r4,0(r5) @@ -494,14 +502,12 @@ kexec_flag: * note: this is a terminal routine, it does not save lr * * get phys id from paca - * set paca id to -1 to say we got here * switch to real mode * join other cpus in kexec_wait(phys_id) */ _GLOBAL(kexec_smp_wait) lhz r3,PACAHWCPUID(r13) - li r4,-1 - sth r4,PACAHWCPUID(r13) /* let others know we left */ + lhz r6,PACAPACAINDEX(r13) bl real_mode b .kexec_wait diff -uprN linux-2.6.32/arch/powerpc/kernel/mpc7450-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/mpc7450-pmu.c --- linux-2.6.32/arch/powerpc/kernel/mpc7450-pmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/mpc7450-pmu.c 2015-03-10 13:22:06.000000000 -0700 @@ -414,4 +414,4 @@ static int init_mpc7450_pmu(void) return register_power_pmu(&mpc7450_pmu); } -arch_initcall(init_mpc7450_pmu); +early_initcall(init_mpc7450_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/nvram_64.c linux-2.6.32.ovz/arch/powerpc/kernel/nvram_64.c --- linux-2.6.32/arch/powerpc/kernel/nvram_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/nvram_64.c 2015-03-10 13:22:27.000000000 -0700 @@ -31,17 +31,11 @@ #include #include #include +#include #undef DEBUG_NVRAM -static struct nvram_partition * nvram_part; -static long nvram_error_log_index = -1; -static long nvram_error_log_size = 0; - -struct err_log_info { - int error_type; - unsigned int seq_num; -}; +static LIST_HEAD(nvram_partitions); static loff_t dev_nvram_llseek(struct file *file, loff_t offset, int origin) { @@ -139,8 +133,8 @@ out: } -static int dev_nvram_ioctl(struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) +static long dev_nvram_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) { switch(cmd) { #ifdef CONFIG_PPC_PMAC @@ -169,11 +163,11 @@ static int dev_nvram_ioctl(struct inode } const struct file_operations nvram_fops = { - .owner = THIS_MODULE, - .llseek = dev_nvram_llseek, - .read = dev_nvram_read, - .write = dev_nvram_write, - .ioctl = dev_nvram_ioctl, + .owner = THIS_MODULE, + .llseek = dev_nvram_llseek, + .read = dev_nvram_read, + .write = dev_nvram_write, + .unlocked_ioctl = dev_nvram_ioctl, }; static struct miscdevice nvram_dev = { @@ -184,16 +178,14 @@ static struct miscdevice nvram_dev = { #ifdef DEBUG_NVRAM -static void nvram_print_partitions(char * label) +static void __init nvram_print_partitions(char * label) { - struct list_head * p; struct nvram_partition * tmp_part; printk(KERN_WARNING "--------%s---------\n", label); printk(KERN_WARNING "indx\t\tsig\tchks\tlen\tname\n"); - list_for_each(p, &nvram_part->partition) { - tmp_part = list_entry(p, struct nvram_partition, partition); - printk(KERN_WARNING "%4d \t%02x\t%02x\t%d\t%s\n", + list_for_each_entry(tmp_part, &nvram_partitions, partition) { + printk(KERN_WARNING "%4d \t%02x\t%02x\t%d\t%.12s\n", tmp_part->index, tmp_part->header.signature, tmp_part->header.checksum, tmp_part->header.length, tmp_part->header.name); @@ -202,7 +194,7 @@ static void nvram_print_partitions(char #endif -static int nvram_write_header(struct nvram_partition * part) +static int __init nvram_write_header(struct nvram_partition * part) { loff_t tmp_index; int rc; @@ -214,7 +206,7 @@ static int nvram_write_header(struct nvr } -static unsigned char nvram_checksum(struct nvram_header *p) +static unsigned char __init nvram_checksum(struct nvram_header *p) { unsigned int c_sum, c_sum2; unsigned short *sp = (unsigned short *)p->name; /* assume 6 shorts */ @@ -233,115 +225,155 @@ static unsigned char nvram_checksum(stru * Find an nvram partition, sig can be 0 for any * partition or name can be NULL for any name, else * tries to match both + * + * Note: nvram_find_partition2() is used internally, but nvram_find_partition() + * is exported, so we keep it to preserve the kABI. */ struct nvram_partition *nvram_find_partition(int sig, const char *name) { struct nvram_partition * part; - struct list_head * p; - - list_for_each(p, &nvram_part->partition) { - part = list_entry(p, struct nvram_partition, partition); + list_for_each_entry(part, &nvram_partitions, partition) { if (sig && part->header.signature != sig) continue; if (name && 0 != strncmp(name, part->header.name, 12)) continue; - return part; + return part; } return NULL; } EXPORT_SYMBOL(nvram_find_partition); -static int nvram_remove_os_partition(void) +/* + * Per the criteria passed via nvram_remove_partition(), should this + * partition be removed? 1=remove, 0=keep + */ +static int nvram_can_remove_partition(struct nvram_partition *part, + const char *name, int sig, const char *const exceptions[]) { - struct list_head *i; - struct list_head *j; - struct nvram_partition * part; - struct nvram_partition * cur_part; + if (part->header.signature != sig) + return 0; + if (name) { + if (strncmp(name, part->header.name, 12)) + return 0; + } else if (exceptions) { + const char *const *except; + for (except = exceptions; *except; except++) { + if (!strncmp(*except, part->header.name, 12)) + return 0; + } + } + return 1; +} + +/** + * nvram_remove_partition - Remove one or more partitions in nvram + * @name: name of the partition to remove, or NULL for a + * signature only match + * @sig: signature of the partition(s) to remove + * @exceptions: When removing all partitions with a matching signature, + * leave these alone. + */ + +int __init nvram_remove_partition(const char *name, int sig, + const char *const exceptions[]) +{ + struct nvram_partition *part, *prev, *tmp; int rc; - list_for_each(i, &nvram_part->partition) { - part = list_entry(i, struct nvram_partition, partition); - if (part->header.signature != NVRAM_SIG_OS) + list_for_each_entry(part, &nvram_partitions, partition) { + if (!nvram_can_remove_partition(part, name, sig, exceptions)) continue; - - /* Make os partition a free partition */ + + /* Make partition a free partition */ part->header.signature = NVRAM_SIG_FREE; - sprintf(part->header.name, "wwwwwwwwwwww"); + memcpy(part->header.name, "wwwwwwwwwwww", 12); part->header.checksum = nvram_checksum(&part->header); - - /* Merge contiguous free partitions backwards */ - list_for_each_prev(j, &part->partition) { - cur_part = list_entry(j, struct nvram_partition, partition); - if (cur_part == nvram_part || cur_part->header.signature != NVRAM_SIG_FREE) { - break; - } - - part->header.length += cur_part->header.length; - part->header.checksum = nvram_checksum(&part->header); - part->index = cur_part->index; - - list_del(&cur_part->partition); - kfree(cur_part); - j = &part->partition; /* fixup our loop */ - } - - /* Merge contiguous free partitions forwards */ - list_for_each(j, &part->partition) { - cur_part = list_entry(j, struct nvram_partition, partition); - if (cur_part == nvram_part || cur_part->header.signature != NVRAM_SIG_FREE) { - break; - } - - part->header.length += cur_part->header.length; - part->header.checksum = nvram_checksum(&part->header); - - list_del(&cur_part->partition); - kfree(cur_part); - j = &part->partition; /* fixup our loop */ - } - rc = nvram_write_header(part); if (rc <= 0) { - printk(KERN_ERR "nvram_remove_os_partition: nvram_write failed (%d)\n", rc); + printk(KERN_ERR "nvram_remove_partition: nvram_write failed (%d)\n", rc); return rc; } + } + /* Merge contiguous ones */ + prev = NULL; + list_for_each_entry_safe(part, tmp, &nvram_partitions, partition) { + if (part->header.signature != NVRAM_SIG_FREE) { + prev = NULL; + continue; + } + if (prev) { + prev->header.length += part->header.length; + prev->header.checksum = nvram_checksum(&part->header); + rc = nvram_write_header(part); + if (rc <= 0) { + printk(KERN_ERR "nvram_remove_partition: nvram_write failed (%d)\n", rc); + return rc; + } + list_del(&part->partition); + kfree(part); + } else + prev = part; } return 0; } -/* nvram_create_os_partition +/** + * nvram_create_partition - Create a partition in nvram + * @name: name of the partition to create + * @sig: signature of the partition to create + * @req_size: size of data to allocate in bytes + * @min_size: minimum acceptable size (0 means req_size) * - * Create a OS linux partition to buffer error logs. - * Will create a partition starting at the first free - * space found if space has enough room. + * Returns a negative error code or a positive nvram index + * of the beginning of the data area of the newly created + * partition. If you provided a min_size smaller than req_size + * you need to query for the actual size yourself after the + * call using nvram_partition_get_size(). */ -static int nvram_create_os_partition(void) +loff_t __init nvram_create_partition(const char *name, int sig, + int req_size, int min_size) { struct nvram_partition *part; struct nvram_partition *new_part; struct nvram_partition *free_part = NULL; - int seq_init[2] = { 0, 0 }; loff_t tmp_index; long size = 0; int rc; - + + /* Convert sizes from bytes to blocks */ + req_size = _ALIGN_UP(req_size, NVRAM_BLOCK_LEN) / NVRAM_BLOCK_LEN; + min_size = _ALIGN_UP(min_size, NVRAM_BLOCK_LEN) / NVRAM_BLOCK_LEN; + + /* If no minimum size specified, make it the same as the + * requested size + */ + if (min_size == 0) + min_size = req_size; + if (min_size > req_size) + return -EINVAL; + + /* Now add one block to each for the header */ + req_size += 1; + min_size += 1; + /* Find a free partition that will give us the maximum needed size If can't find one that will give us the minimum size needed */ - list_for_each_entry(part, &nvram_part->partition, partition) { + list_for_each_entry(part, &nvram_partitions, partition) { if (part->header.signature != NVRAM_SIG_FREE) continue; - if (part->header.length >= NVRAM_MAX_REQ) { - size = NVRAM_MAX_REQ; + if (part->header.length >= req_size) { + size = req_size; free_part = part; break; } - if (!size && part->header.length >= NVRAM_MIN_REQ) { - size = NVRAM_MIN_REQ; + if (part->header.length > size && + part->header.length >= min_size) { + size = part->header.length; free_part = part; } } @@ -351,136 +383,96 @@ static int nvram_create_os_partition(voi /* Create our OS partition */ new_part = kmalloc(sizeof(*new_part), GFP_KERNEL); if (!new_part) { - printk(KERN_ERR "nvram_create_os_partition: kmalloc failed\n"); + pr_err("nvram_create_os_partition: kmalloc failed\n"); return -ENOMEM; } new_part->index = free_part->index; - new_part->header.signature = NVRAM_SIG_OS; + new_part->header.signature = sig; new_part->header.length = size; - strcpy(new_part->header.name, "ppc64,linux"); + strncpy(new_part->header.name, name, 12); new_part->header.checksum = nvram_checksum(&new_part->header); rc = nvram_write_header(new_part); if (rc <= 0) { - printk(KERN_ERR "nvram_create_os_partition: nvram_write_header \ - failed (%d)\n", rc); - return rc; - } - - /* make sure and initialize to zero the sequence number and the error - type logged */ - tmp_index = new_part->index + NVRAM_HEADER_LEN; - rc = ppc_md.nvram_write((char *)&seq_init, sizeof(seq_init), &tmp_index); - if (rc <= 0) { - printk(KERN_ERR "nvram_create_os_partition: nvram_write " - "failed (%d)\n", rc); + pr_err("nvram_create_os_partition: nvram_write_header " + "failed (%d)\n", rc); return rc; } - - nvram_error_log_index = new_part->index + NVRAM_HEADER_LEN; - nvram_error_log_size = ((part->header.length - 1) * - NVRAM_BLOCK_LEN) - sizeof(struct err_log_info); - list_add_tail(&new_part->partition, &free_part->partition); - if (free_part->header.length <= size) { + /* Adjust or remove the partition we stole the space from */ + if (free_part->header.length > size) { + free_part->index += size * NVRAM_BLOCK_LEN; + free_part->header.length -= size; + free_part->header.checksum = nvram_checksum(&free_part->header); + rc = nvram_write_header(free_part); + if (rc <= 0) { + pr_err("nvram_create_os_partition: nvram_write_header " + "failed (%d)\n", rc); + return rc; + } + } else { list_del(&free_part->partition); kfree(free_part); - return 0; } - /* Adjust the partition we stole the space from */ - free_part->index += size * NVRAM_BLOCK_LEN; - free_part->header.length -= size; - free_part->header.checksum = nvram_checksum(&free_part->header); - - rc = nvram_write_header(free_part); - if (rc <= 0) { - printk(KERN_ERR "nvram_create_os_partition: nvram_write_header " - "failed (%d)\n", rc); - return rc; + /* Clear the new partition */ + for (tmp_index = new_part->index + NVRAM_HEADER_LEN; + tmp_index < ((size - 1) * NVRAM_BLOCK_LEN); + tmp_index += NVRAM_BLOCK_LEN) { + rc = ppc_md.nvram_write((char*) empty_zero_page, + NVRAM_BLOCK_LEN, &tmp_index); + if (rc <= 0) { + pr_err("nvram_create_partition: nvram_write failed (%d)\n", rc); + return rc; + } } - - return 0; + + return new_part->index + NVRAM_HEADER_LEN; } - -/* nvram_setup_partition - * - * This will setup the partition we need for buffering the - * error logs and cleanup partitions if needed. - * - * The general strategy is the following: - * 1.) If there is ppc64,linux partition large enough then use it. - * 2.) If there is not a ppc64,linux partition large enough, search - * for a free partition that is large enough. - * 3.) If there is not a free partition large enough remove - * _all_ OS partitions and consolidate the space. - * 4.) Will first try getting a chunk that will satisfy the maximum - * error log size (NVRAM_MAX_REQ). - * 5.) If the max chunk cannot be allocated then try finding a chunk - * that will satisfy the minum needed (NVRAM_MIN_REQ). +/** + * nvram_get_partition_size - Get the data size of an nvram partition + * @data_index: This is the offset of the start of the data of + * the partition. The same value that is returned by + * nvram_create_partition(). */ -static int nvram_setup_partition(void) +int nvram_get_partition_size(loff_t data_index) { - struct list_head * p; - struct nvram_partition * part; - int rc; - - /* For now, we don't do any of this on pmac, until I - * have figured out if it's worth killing some unused stuffs - * in our nvram, as Apple defined partitions use pretty much - * all of the space - */ - if (machine_is(powermac)) - return -ENOSPC; + struct nvram_partition *part; + + list_for_each_entry(part, &nvram_partitions, partition) { + if (part->index + NVRAM_HEADER_LEN == data_index) + return (part->header.length - 1) * NVRAM_BLOCK_LEN; + } + return -1; +} - /* see if we have an OS partition that meets our needs. - will try getting the max we need. If not we'll delete - partitions and try again. */ - list_for_each(p, &nvram_part->partition) { - part = list_entry(p, struct nvram_partition, partition); - if (part->header.signature != NVRAM_SIG_OS) - continue; - if (strcmp(part->header.name, "ppc64,linux")) - continue; +/** + * nvram_find_partition2 - Find an nvram partition by signature and name + * @name: Name of the partition or NULL for any name + * @sig: Signature to test against + * @out_size: if non-NULL, returns the size of the data part of the partition + */ +loff_t nvram_find_partition2(const char *name, int sig, int *out_size) +{ + struct nvram_partition *p; - if (part->header.length >= NVRAM_MIN_REQ) { - /* found our partition */ - nvram_error_log_index = part->index + NVRAM_HEADER_LEN; - nvram_error_log_size = ((part->header.length - 1) * - NVRAM_BLOCK_LEN) - sizeof(struct err_log_info); - return 0; + list_for_each_entry(p, &nvram_partitions, partition) { + if (p->header.signature == sig && + (!name || !strncmp(p->header.name, name, 12))) { + if (out_size) + *out_size = (p->header.length - 1) * + NVRAM_BLOCK_LEN; + return p->index + NVRAM_HEADER_LEN; } } - - /* try creating a partition with the free space we have */ - rc = nvram_create_os_partition(); - if (!rc) { - return 0; - } - - /* need to free up some space */ - rc = nvram_remove_os_partition(); - if (rc) { - return rc; - } - - /* create a partition in this new space */ - rc = nvram_create_os_partition(); - if (rc) { - printk(KERN_ERR "nvram_create_os_partition: Could not find a " - "NVRAM partition large enough\n"); - return rc; - } - return 0; } - -static int nvram_scan_partitions(void) +int __init nvram_scan_partitions(void) { loff_t cur_index = 0; struct nvram_header phead; @@ -490,7 +482,7 @@ static int nvram_scan_partitions(void) int total_size; int err; - if (ppc_md.nvram_size == NULL) + if (ppc_md.nvram_size == NULL || ppc_md.nvram_size() <= 0) return -ENODEV; total_size = ppc_md.nvram_size(); @@ -537,12 +529,16 @@ static int nvram_scan_partitions(void) memcpy(&tmp_part->header, &phead, NVRAM_HEADER_LEN); tmp_part->index = cur_index; - list_add_tail(&tmp_part->partition, &nvram_part->partition); + list_add_tail(&tmp_part->partition, &nvram_partitions); cur_index += phead.length * NVRAM_BLOCK_LEN; } err = 0; +#ifdef DEBUG_NVRAM + nvram_print_partitions("NVRAM Partitions"); +#endif + out: kfree(header); return err; @@ -550,9 +546,10 @@ static int nvram_scan_partitions(void) static int __init nvram_init(void) { - int error; int rc; + BUILD_BUG_ON(NVRAM_BLOCK_LEN != 16); + if (ppc_md.nvram_size == NULL || ppc_md.nvram_size() <= 0) return -ENODEV; @@ -562,29 +559,6 @@ static int __init nvram_init(void) return rc; } - /* initialize our anchor for the nvram partition list */ - nvram_part = kmalloc(sizeof(struct nvram_partition), GFP_KERNEL); - if (!nvram_part) { - printk(KERN_ERR "nvram_init: Failed kmalloc\n"); - return -ENOMEM; - } - INIT_LIST_HEAD(&nvram_part->partition); - - /* Get all the NVRAM partitions */ - error = nvram_scan_partitions(); - if (error) { - printk(KERN_ERR "nvram_init: Failed nvram_scan_partitions\n"); - return error; - } - - if(nvram_setup_partition()) - printk(KERN_WARNING "nvram_init: Could not find nvram partition" - " for nvram buffered error logging.\n"); - -#ifdef DEBUG_NVRAM - nvram_print_partitions("NVRAM Partitions"); -#endif - return rc; } @@ -593,132 +567,6 @@ void __exit nvram_cleanup(void) misc_deregister( &nvram_dev ); } - -#ifdef CONFIG_PPC_PSERIES - -/* nvram_write_error_log - * - * We need to buffer the error logs into nvram to ensure that we have - * the failure information to decode. If we have a severe error there - * is no way to guarantee that the OS or the machine is in a state to - * get back to user land and write the error to disk. For example if - * the SCSI device driver causes a Machine Check by writing to a bad - * IO address, there is no way of guaranteeing that the device driver - * is in any state that is would also be able to write the error data - * captured to disk, thus we buffer it in NVRAM for analysis on the - * next boot. - * - * In NVRAM the partition containing the error log buffer will looks like: - * Header (in bytes): - * +-----------+----------+--------+------------+------------------+ - * | signature | checksum | length | name | data | - * |0 |1 |2 3|4 15|16 length-1| - * +-----------+----------+--------+------------+------------------+ - * - * The 'data' section would look like (in bytes): - * +--------------+------------+-----------------------------------+ - * | event_logged | sequence # | error log | - * |0 3|4 7|8 nvram_error_log_size-1| - * +--------------+------------+-----------------------------------+ - * - * event_logged: 0 if event has not been logged to syslog, 1 if it has - * sequence #: The unique sequence # for each event. (until it wraps) - * error log: The error log from event_scan - */ -int nvram_write_error_log(char * buff, int length, - unsigned int err_type, unsigned int error_log_cnt) -{ - int rc; - loff_t tmp_index; - struct err_log_info info; - - if (nvram_error_log_index == -1) { - return -ESPIPE; - } - - if (length > nvram_error_log_size) { - length = nvram_error_log_size; - } - - info.error_type = err_type; - info.seq_num = error_log_cnt; - - tmp_index = nvram_error_log_index; - - rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index); - if (rc <= 0) { - printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc); - return rc; - } - - rc = ppc_md.nvram_write(buff, length, &tmp_index); - if (rc <= 0) { - printk(KERN_ERR "nvram_write_error_log: Failed nvram_write (%d)\n", rc); - return rc; - } - - return 0; -} - -/* nvram_read_error_log - * - * Reads nvram for error log for at most 'length' - */ -int nvram_read_error_log(char * buff, int length, - unsigned int * err_type, unsigned int * error_log_cnt) -{ - int rc; - loff_t tmp_index; - struct err_log_info info; - - if (nvram_error_log_index == -1) - return -1; - - if (length > nvram_error_log_size) - length = nvram_error_log_size; - - tmp_index = nvram_error_log_index; - - rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index); - if (rc <= 0) { - printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc); - return rc; - } - - rc = ppc_md.nvram_read(buff, length, &tmp_index); - if (rc <= 0) { - printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc); - return rc; - } - - *error_log_cnt = info.seq_num; - *err_type = info.error_type; - - return 0; -} - -/* This doesn't actually zero anything, but it sets the event_logged - * word to tell that this event is safely in syslog. - */ -int nvram_clear_error_log(void) -{ - loff_t tmp_index; - int clear_word = ERR_FLAG_ALREADY_LOGGED; - int rc; - - tmp_index = nvram_error_log_index; - - rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index); - if (rc <= 0) { - printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc); - return rc; - } - - return 0; -} - -#endif /* CONFIG_PPC_PSERIES */ - module_init(nvram_init); module_exit(nvram_cleanup); MODULE_LICENSE("GPL"); diff -uprN linux-2.6.32/arch/powerpc/kernel/pci_32.c linux-2.6.32.ovz/arch/powerpc/kernel/pci_32.c --- linux-2.6.32/arch/powerpc/kernel/pci_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/pci_32.c 2015-03-10 13:23:53.000000000 -0700 @@ -372,7 +372,7 @@ static int __init pcibios_init(void) printk(KERN_INFO "PCI: Probing PCI hardware\n"); - if (ppc_pci_flags & PPC_PCI_REASSIGN_ALL_BUS) + if (pci_has_flag(PCI_REASSIGN_ALL_BUS)) pci_assign_all_buses = 1; /* Scan all of the recorded PCI controllers. */ diff -uprN linux-2.6.32/arch/powerpc/kernel/pci_64.c linux-2.6.32.ovz/arch/powerpc/kernel/pci_64.c --- linux-2.6.32/arch/powerpc/kernel/pci_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/pci_64.c 2015-03-10 13:23:53.000000000 -0700 @@ -32,8 +32,6 @@ #include #include -unsigned long pci_probe_only = 1; - /* pci_io_base -- the base address from which io bars are offsets. * This is the lowest I/O base address (so bar values are always positive), * and it *must* be the start of ISA space if an ISA bus exists because @@ -54,13 +52,10 @@ static int __init pcibios_init(void) */ ppc_md.phys_mem_access_prot = pci_phys_mem_access_prot; - if (pci_probe_only) - ppc_pci_flags |= PPC_PCI_PROBE_ONLY; - /* On ppc64, we always enable PCI domains and we keep domain 0 * backward compatible in /proc for video cards */ - ppc_pci_flags |= PPC_PCI_ENABLE_PROC_DOMAINS | PPC_PCI_COMPAT_DOMAIN_0; + pci_add_flags(PCI_ENABLE_PROC_DOMAINS | PCI_COMPAT_DOMAIN_0); /* Scan all of the recorded PCI controllers. */ list_for_each_entry_safe(hose, tmp, &hose_list, list_node) { @@ -130,30 +125,13 @@ EXPORT_SYMBOL_GPL(pcibios_unmap_io_space #endif /* CONFIG_HOTPLUG */ -int __devinit pcibios_map_io_space(struct pci_bus *bus) +static int __devinit pcibios_map_phb_io_space(struct pci_controller *hose) { struct vm_struct *area; unsigned long phys_page; unsigned long size_page; unsigned long io_virt_offset; - struct pci_controller *hose; - - WARN_ON(bus == NULL); - /* If this not a PHB, nothing to do, page tables still exist and - * thus HPTEs will be faulted in when needed - */ - if (bus->self) { - pr_debug("IO mapping for PCI-PCI bridge %s\n", - pci_name(bus->self)); - pr_debug(" virt=0x%016llx...0x%016llx\n", - bus->resource[0]->start + _IO_BASE, - bus->resource[0]->end + _IO_BASE); - return 0; - } - - /* Get the host bridge */ - hose = pci_bus_to_host(bus); phys_page = _ALIGN_DOWN(hose->io_base_phys, PAGE_SIZE); size_page = _ALIGN_UP(hose->pci_io_size, PAGE_SIZE); @@ -198,11 +176,30 @@ int __devinit pcibios_map_io_space(struc return 0; } + +int __devinit pcibios_map_io_space(struct pci_bus *bus) +{ + WARN_ON(bus == NULL); + + /* If this not a PHB, nothing to do, page tables still exist and + * thus HPTEs will be faulted in when needed + */ + if (bus->self) { + pr_debug("IO mapping for PCI-PCI bridge %s\n", + pci_name(bus->self)); + pr_debug(" virt=0x%016llx...0x%016llx\n", + bus->resource[0]->start + _IO_BASE, + bus->resource[0]->end + _IO_BASE); + return 0; + } + + return pcibios_map_phb_io_space(pci_bus_to_host(bus)); +} EXPORT_SYMBOL_GPL(pcibios_map_io_space); void __devinit pcibios_setup_phb_io_space(struct pci_controller *hose) { - pcibios_map_io_space(hose->bus); + pcibios_map_phb_io_space(hose); } #define IOBASE_BRIDGE_NUMBER 0 diff -uprN linux-2.6.32/arch/powerpc/kernel/pci-common.c linux-2.6.32.ovz/arch/powerpc/kernel/pci-common.c --- linux-2.6.32/arch/powerpc/kernel/pci-common.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/pci-common.c 2015-03-10 13:24:14.000000000 -0700 @@ -46,9 +46,6 @@ static int global_phb_number; /* Global /* ISA Memory physical address */ resource_size_t isa_mem_base; -/* Default PCI flags is 0 on ppc32, modified at boot on ppc64 */ -unsigned int ppc_pci_flags = 0; - static struct dma_map_ops *pci_dma_ops = &dma_direct_ops; @@ -854,67 +851,13 @@ int pci_proc_domain(struct pci_bus *bus) { struct pci_controller *hose = pci_bus_to_host(bus); - if (!(ppc_pci_flags & PPC_PCI_ENABLE_PROC_DOMAINS)) + if (!pci_has_flag(PCI_ENABLE_PROC_DOMAINS)) return 0; - if (ppc_pci_flags & PPC_PCI_COMPAT_DOMAIN_0) + if (pci_has_flag(PCI_COMPAT_DOMAIN_0)) return hose->global_number != 0; return 1; } -void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region, - struct resource *res) -{ - resource_size_t offset = 0, mask = (resource_size_t)-1; - struct pci_controller *hose = pci_bus_to_host(dev->bus); - - if (!hose) - return; - if (res->flags & IORESOURCE_IO) { - offset = (unsigned long)hose->io_base_virt - _IO_BASE; - mask = 0xffffffffu; - } else if (res->flags & IORESOURCE_MEM) - offset = hose->pci_mem_offset; - - region->start = (res->start - offset) & mask; - region->end = (res->end - offset) & mask; -} -EXPORT_SYMBOL(pcibios_resource_to_bus); - -void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, - struct pci_bus_region *region) -{ - resource_size_t offset = 0, mask = (resource_size_t)-1; - struct pci_controller *hose = pci_bus_to_host(dev->bus); - - if (!hose) - return; - if (res->flags & IORESOURCE_IO) { - offset = (unsigned long)hose->io_base_virt - _IO_BASE; - mask = 0xffffffffu; - } else if (res->flags & IORESOURCE_MEM) - offset = hose->pci_mem_offset; - res->start = (region->start + offset) & mask; - res->end = (region->end + offset) & mask; -} -EXPORT_SYMBOL(pcibios_bus_to_resource); - -/* Fixup a bus resource into a linux resource */ -static void __devinit fixup_resource(struct resource *res, struct pci_dev *dev) -{ - struct pci_controller *hose = pci_bus_to_host(dev->bus); - resource_size_t offset = 0, mask = (resource_size_t)-1; - - if (res->flags & IORESOURCE_IO) { - offset = (unsigned long)hose->io_base_virt - _IO_BASE; - mask = 0xffffffffu; - } else if (res->flags & IORESOURCE_MEM) - offset = hose->pci_mem_offset; - - res->start = (res->start + offset) & mask; - res->end = (res->end + offset) & mask; -} - - /* This header fixup will do the resource fixup for all devices as they are * probed, but not for bridge ranges */ @@ -932,13 +875,13 @@ static void __devinit pcibios_fixup_reso struct resource *res = dev->resource + i; if (!res->flags) continue; - /* On platforms that have PPC_PCI_PROBE_ONLY set, we don't + /* On platforms that have PCI_PROBE_ONLY set, we don't * consider 0 as an unassigned BAR value. It's technically * a valid value, but linux doesn't like it... so when we can * re-assign things, we do so, but if we can't, we keep it * around and hope for the best... */ - if (res->start == 0 && !(ppc_pci_flags & PPC_PCI_PROBE_ONLY)) { + if (res->start == 0 && !pci_has_flag(PCI_PROBE_ONLY)) { pr_debug("PCI:%s Resource %d %016llx-%016llx [%x] is unassigned\n", pci_name(dev), i, (unsigned long long)res->start, @@ -950,18 +893,11 @@ static void __devinit pcibios_fixup_reso continue; } - pr_debug("PCI:%s Resource %d %016llx-%016llx [%x] fixup...\n", + pr_debug("PCI:%s Resource %d %016llx-%016llx [%x]\n", pci_name(dev), i, (unsigned long long)res->start,\ (unsigned long long)res->end, (unsigned int)res->flags); - - fixup_resource(res, dev); - - pr_debug("PCI:%s %016llx-%016llx\n", - pci_name(dev), - (unsigned long long)res->start, - (unsigned long long)res->end); } /* Call machine specific resource fixup */ @@ -985,7 +921,7 @@ static int __devinit pcibios_uninitializ int i; /* We don't do anything if PCI_PROBE_ONLY is set */ - if (ppc_pci_flags & PPC_PCI_PROBE_ONLY) + if (pci_has_flag(PCI_PROBE_ONLY)) return 0; /* Job is a bit different between memory and IO */ @@ -1047,35 +983,24 @@ static void __devinit pcibios_fixup_brid struct pci_dev *dev = bus->self; - for (i = 0; i < PCI_BUS_NUM_RESOURCES; ++i) { - if ((res = bus->resource[i]) == NULL) - continue; - if (!res->flags) + pci_bus_for_each_resource(bus, res, i) { + if (!res || !res->flags) continue; if (i >= 3 && bus->self->transparent) continue; - pr_debug("PCI:%s Bus rsrc %d %016llx-%016llx [%x] fixup...\n", + pr_debug("PCI:%s Bus rsrc %d %016llx-%016llx [%x]\n", pci_name(dev), i, (unsigned long long)res->start,\ (unsigned long long)res->end, (unsigned int)res->flags); - /* Perform fixup */ - fixup_resource(res, dev); - /* Try to detect uninitialized P2P bridge resources, * and clear them out so they get re-assigned later */ if (pcibios_uninitialized_bridge_resource(bus, res)) { res->flags = 0; pr_debug("PCI:%s (unassigned)\n", pci_name(dev)); - } else { - - pr_debug("PCI:%s %016llx-%016llx\n", - pci_name(dev), - (unsigned long long)res->start, - (unsigned long long)res->end); } } } @@ -1097,6 +1022,43 @@ void __devinit pcibios_setup_bus_self(st ppc_md.pci_dma_bus_setup(bus); } +static void pcibios_setup_device(struct pci_dev *dev) +{ + struct dev_archdata *sd = &dev->dev.archdata; + + /* Setup OF node pointer in archdata */ + sd->of_node = pci_device_to_OF_node(dev); + + /* Fixup NUMA node as it may not be setup yet by the generic + * code and is needed by the DMA init + */ + set_dev_node(&dev->dev, pcibus_to_node(dev->bus)); + + /* Hook up default DMA ops */ + set_dma_ops(&dev->dev, pci_dma_ops); + set_dma_offset(&dev->dev, PCI_DRAM_OFFSET); + + /* Additional platform DMA/iommu setup */ + if (ppc_md.pci_dma_dev_setup) + ppc_md.pci_dma_dev_setup(dev); + + /* Read default IRQs and fixup if necessary */ + pci_read_irq_line(dev); + if (ppc_md.pci_irq_fixup) + ppc_md.pci_irq_fixup(dev); +} + +int pcibios_add_device(struct pci_dev *dev) +{ + /* + * We can only call pcibios_setup_device() after bus setup is complete, + * since some of the platform specific DMA setup code depends on it. + */ + if (dev->bus->is_added) + pcibios_setup_device(dev); + return 0; +} + void __devinit pcibios_setup_bus_devices(struct pci_bus *bus) { struct pci_dev *dev; @@ -1105,28 +1067,13 @@ void __devinit pcibios_setup_bus_devices bus->number, bus->self ? pci_name(bus->self) : "PHB"); list_for_each_entry(dev, &bus->devices, bus_list) { - struct dev_archdata *sd = &dev->dev.archdata; - - /* Setup OF node pointer in archdata */ - sd->of_node = pci_device_to_OF_node(dev); - - /* Fixup NUMA node as it may not be setup yet by the generic - * code and is needed by the DMA init + /* Cardbus can call us to add new devices to a bus, so ignore + * those who are already fully discovered */ - set_dev_node(&dev->dev, pcibus_to_node(dev->bus)); + if (dev->is_added) + continue; - /* Hook up default DMA ops */ - sd->dma_ops = pci_dma_ops; - set_dma_offset(&dev->dev, PCI_DRAM_OFFSET); - - /* Additional platform DMA/iommu setup */ - if (ppc_md.pci_dma_dev_setup) - ppc_md.pci_dma_dev_setup(dev); - - /* Read default IRQs and fixup if necessary */ - pci_read_irq_line(dev); - if (ppc_md.pci_irq_fixup) - ppc_md.pci_irq_fixup(dev); + pcibios_setup_device(dev); } } @@ -1147,9 +1094,16 @@ void __devinit pcibios_fixup_bus(struct } EXPORT_SYMBOL(pcibios_fixup_bus); +void __devinit pci_fixup_cardbus(struct pci_bus *bus) +{ + /* Now fixup devices on that bus */ + pcibios_setup_bus_devices(bus); +} + + static int skip_isa_ioresource_align(struct pci_dev *dev) { - if ((ppc_pci_flags & PPC_PCI_CAN_SKIP_ISA_ALIGN) && + if (pci_has_flag(PCI_CAN_SKIP_ISA_ALIGN) && !(dev->bus->bridge_ctl & PCI_BRIDGE_CTL_ISA)) return 1; return 0; @@ -1265,9 +1219,8 @@ void pcibios_allocate_bus_resources(stru pr_debug("PCI: Allocating bus resources for %04x:%02x...\n", pci_domain_nr(bus), bus->number); - for (i = 0; i < PCI_BUS_NUM_RESOURCES; ++i) { - if ((res = bus->resource[i]) == NULL || !res->flags - || res->start > res->end || res->parent) + pci_bus_for_each_resource(bus, res, i) { + if (!res || !res->flags || res->start > res->end || res->parent) continue; if (bus->parent == NULL) pr = (res->flags & IORESOURCE_IO) ? @@ -1279,7 +1232,7 @@ void pcibios_allocate_bus_resources(stru * and as such ensure proper re-allocation * later. */ - if (ppc_pci_flags & PPC_PCI_REASSIGN_ALL_RSRC) + if (pci_has_flag(PCI_REASSIGN_ALL_RSRC)) goto clear_resource; pr = pci_find_parent_resource(bus->self, res); if (pr == res) { @@ -1463,7 +1416,7 @@ void __init pcibios_resource_survey(void list_for_each_entry(b, &pci_root_buses, node) pcibios_allocate_bus_resources(b); - if (!(ppc_pci_flags & PPC_PCI_REASSIGN_ALL_RSRC)) { + if (!pci_has_flag(PCI_REASSIGN_ALL_RSRC)) { pcibios_allocate_resources(0); pcibios_allocate_resources(1); } @@ -1472,7 +1425,7 @@ void __init pcibios_resource_survey(void * the low IO area and the VGA memory area if they intersect the * bus available resources to avoid allocating things on top of them */ - if (!(ppc_pci_flags & PPC_PCI_PROBE_ONLY)) { + if (!pci_has_flag(PCI_PROBE_ONLY)) { list_for_each_entry(b, &pci_root_buses, node) pcibios_reserve_legacy_regions(b); } @@ -1480,7 +1433,7 @@ void __init pcibios_resource_survey(void /* Now, if the platform didn't decide to blindly trust the firmware, * we proceed to assigning things that were left unassigned */ - if (!(ppc_pci_flags & PPC_PCI_PROBE_ONLY)) { + if (!pci_has_flag(PCI_PROBE_ONLY)) { pr_debug("PCI: Assigning unassigned resources...\n"); pci_assign_unassigned_resources(); } @@ -1561,14 +1514,13 @@ int pcibios_enable_device(struct pci_dev return pci_enable_resources(dev, mask); } -void __devinit pcibios_setup_phb_resources(struct pci_controller *hose) +static void __devinit pcibios_setup_phb_resources(struct pci_controller *hose, struct list_head *resources) { - struct pci_bus *bus = hose->bus; struct resource *res; int i; /* Hookup PHB IO resource */ - bus->resource[0] = res = &hose->io_resource; + res = &hose->io_resource; if (!res->flags) { printk(KERN_WARNING "PCI: I/O resource not set for host" @@ -1586,6 +1538,8 @@ void __devinit pcibios_setup_phb_resourc (unsigned long long)res->start, (unsigned long long)res->end, (unsigned long)res->flags); + pci_add_resource_offset(resources, res, + (resource_size_t) hose->io_base_virt - _IO_BASE); /* Hookup PHB Memory resources */ for (i = 0; i < 3; ++i) { @@ -1603,12 +1557,12 @@ void __devinit pcibios_setup_phb_resourc res->flags = IORESOURCE_MEM; #endif /* CONFIG_PPC32 */ } - bus->resource[i+1] = res; pr_debug("PCI: PHB MEM resource %d = %016llx-%016llx [%lx]\n", i, (unsigned long long)res->start, (unsigned long long)res->end, (unsigned long)res->flags); + pci_add_resource_offset(resources, res, hose->pci_mem_offset); } pr_debug("PCI: PHB MEM offset = %016llx\n", @@ -1700,6 +1654,7 @@ int early_find_capability(struct pci_con */ void __devinit pcibios_scan_phb(struct pci_controller *hose, void *sysdata) { + LIST_HEAD(resources); struct pci_bus *bus; struct device_node *node = hose->dn; int mode; @@ -1707,23 +1662,24 @@ void __devinit pcibios_scan_phb(struct p pr_debug("PCI: Scanning PHB %s\n", node ? node->full_name : ""); + /* Get some IO space for the new PHB */ + pcibios_setup_phb_io_space(hose); + + /* Wire up PHB bus resources */ + pcibios_setup_phb_resources(hose, &resources); + /* Create an empty bus for the toplevel */ - bus = pci_create_bus(hose->parent, hose->first_busno, hose->ops, - sysdata); + bus = pci_create_root_bus(hose->parent, hose->first_busno, + hose->ops, sysdata, &resources); if (bus == NULL) { pr_err("Failed to create bus for PCI domain %04x\n", hose->global_number); + pci_free_resource_list(&resources); return; } bus->secondary = hose->first_busno; hose->bus = bus; - /* Get some IO space for the new PHB */ - pcibios_setup_phb_io_space(hose); - - /* Wire up PHB bus resources */ - pcibios_setup_phb_resources(hose); - /* Get probe mode and perform scan */ mode = PCI_PROBE_NORMAL; if (node && ppc_md.pci_probe_mode) diff -uprN linux-2.6.32/arch/powerpc/kernel/pci_of_scan.c linux-2.6.32.ovz/arch/powerpc/kernel/pci_of_scan.c --- linux-2.6.32/arch/powerpc/kernel/pci_of_scan.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/pci_of_scan.c 2015-03-10 13:23:53.000000000 -0700 @@ -74,6 +74,7 @@ static void of_pci_parse_addrs(struct de { u64 base, size; unsigned int flags; + struct pci_bus_region region; struct resource *res; const u32 *addrs; u32 i; @@ -105,10 +106,11 @@ static void of_pci_parse_addrs(struct de printk(KERN_ERR "PCI: bad cfg reg num 0x%x\n", i); continue; } - res->start = base; - res->end = base + size - 1; res->flags = flags; res->name = pci_name(dev); + region.start = base; + region.end = base + size - 1; + pcibios_bus_to_resource(dev, res, ®ion); } } @@ -123,6 +125,7 @@ struct pci_dev *of_create_pci_dev(struct { struct pci_dev *dev; const char *type; + struct pci_slot *slot; dev = alloc_pci_dev(); if (!dev) @@ -140,6 +143,11 @@ struct pci_dev *of_create_pci_dev(struct dev->devfn = devfn; dev->multifunction = 0; /* maybe a lie? */ dev->needs_freset = 0; /* pcie fundamental reset required */ + set_pcie_port_type(dev); + + list_for_each_entry(slot, &dev->bus->slots, list) + if (PCI_SLOT(dev->devfn) == slot->number) + dev->slot = slot; dev->vendor = get_int_prop(node, "vendor-id", 0xffff); dev->device = get_int_prop(node, "device-id", 0xffff); @@ -160,10 +168,14 @@ struct pci_dev *of_create_pci_dev(struct dev->error_state = pci_channel_io_normal; dev->dma_mask = 0xffffffff; + /* Early fixups, before probing the BARs */ + pci_fixup_device(pci_fixup_early, dev); + if (!strcmp(type, "pci") || !strcmp(type, "pciex")) { /* a PCI-PCI bridge */ dev->hdr_type = PCI_HEADER_TYPE_BRIDGE; dev->rom_base_reg = PCI_ROM_ADDRESS1; + set_pcie_hotplug_bridge(dev); } else if (!strcmp(type, "cardbus")) { dev->hdr_type = PCI_HEADER_TYPE_CARDBUS; } else { @@ -198,6 +210,7 @@ void __devinit of_scan_pci_bridge(struct struct pci_bus *bus; const u32 *busrange, *ranges; int len, i, mode; + struct pci_bus_region region; struct resource *res; unsigned int flags; u64 size; @@ -260,9 +273,10 @@ void __devinit of_scan_pci_bridge(struct res = bus->resource[i]; ++i; } - res->start = of_read_number(&ranges[1], 2); - res->end = res->start + size - 1; res->flags = flags; + region.start = of_read_number(&ranges[1], 2); + region.end = region.start + size - 1; + pcibios_bus_to_resource(dev, res, ®ion); } sprintf(bus->name, "PCI Bus %04x:%02x", pci_domain_nr(bus), bus->number); @@ -300,6 +314,8 @@ static void __devinit __of_scan_bus(stru /* Scan direct children */ for_each_child_of_node(node, child) { pr_debug(" * %s\n", child->full_name); + if (!of_device_is_available(child)) + continue; reg = of_get_property(child, "reg", ®len); if (reg == NULL || reglen < 20) continue; diff -uprN linux-2.6.32/arch/powerpc/kernel/perf_callchain.c linux-2.6.32.ovz/arch/powerpc/kernel/perf_callchain.c --- linux-2.6.32/arch/powerpc/kernel/perf_callchain.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/perf_callchain.c 2015-03-10 13:23:24.000000000 -0700 @@ -23,18 +23,6 @@ #include "ppc32.h" #endif -/* - * Store another value in a callchain_entry. - */ -static inline void callchain_store(struct perf_callchain_entry *entry, u64 ip) -{ - unsigned int nr = entry->nr; - - if (nr < PERF_MAX_STACK_DEPTH) { - entry->ip[nr] = ip; - entry->nr = nr + 1; - } -} /* * Is sp valid as the address of the next kernel stack frame after prev_sp? @@ -58,8 +46,8 @@ static int valid_next_sp(unsigned long s return 0; } -static void perf_callchain_kernel(struct pt_regs *regs, - struct perf_callchain_entry *entry) +void +perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { unsigned long sp, next_sp; unsigned long next_ip; @@ -69,8 +57,7 @@ static void perf_callchain_kernel(struct lr = regs->link; sp = regs->gpr[1]; - callchain_store(entry, PERF_CONTEXT_KERNEL); - callchain_store(entry, regs->nip); + perf_callchain_store(entry, perf_instruction_pointer(regs)); if (!validate_sp(sp, current, STACK_FRAME_OVERHEAD)) return; @@ -89,7 +76,7 @@ static void perf_callchain_kernel(struct next_ip = regs->nip; lr = regs->link; level = 0; - callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); } else { if (level == 0) @@ -111,7 +98,7 @@ static void perf_callchain_kernel(struct ++level; } - callchain_store(entry, next_ip); + perf_callchain_store(entry, next_ip); if (!valid_next_sp(next_sp, sp)) return; sp = next_sp; @@ -177,8 +164,12 @@ static int read_user_stack_64(unsigned l ((unsigned long)ptr & 7)) return -EFAULT; - if (!__get_user_inatomic(*ret, ptr)) + pagefault_disable(); + if (!__get_user_inatomic(*ret, ptr)) { + pagefault_enable(); return 0; + } + pagefault_enable(); return read_user_stack_slow(ptr, ret, 8); } @@ -189,8 +180,12 @@ static int read_user_stack_32(unsigned i ((unsigned long)ptr & 3)) return -EFAULT; - if (!__get_user_inatomic(*ret, ptr)) + pagefault_disable(); + if (!__get_user_inatomic(*ret, ptr)) { + pagefault_enable(); return 0; + } + pagefault_enable(); return read_user_stack_slow(ptr, ret, 4); } @@ -243,8 +238,8 @@ static int sane_signal_64_frame(unsigned puc == (unsigned long) &sf->uc; } -static void perf_callchain_user_64(struct pt_regs *regs, - struct perf_callchain_entry *entry) +static void perf_callchain_user_64(struct perf_callchain_entry *entry, + struct pt_regs *regs) { unsigned long sp, next_sp; unsigned long next_ip; @@ -253,11 +248,8 @@ static void perf_callchain_user_64(struc struct signal_frame_64 __user *sigframe; unsigned long __user *fp, *uregs; - next_ip = regs->nip; lr = regs->link; sp = regs->gpr[1]; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, next_ip); for (;;) { fp = (unsigned long __user *) sp; @@ -286,14 +278,14 @@ static void perf_callchain_user_64(struc read_user_stack_64(&uregs[PT_R1], &sp)) return; level = 0; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, next_ip); + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_store(entry, next_ip); continue; } if (level == 0) next_ip = lr; - callchain_store(entry, next_ip); + perf_callchain_store(entry, next_ip); ++level; sp = next_sp; } @@ -318,15 +310,21 @@ static inline int current_is_64bit(void) */ static int read_user_stack_32(unsigned int __user *ptr, unsigned int *ret) { + int rc; + if ((unsigned long)ptr > TASK_SIZE - sizeof(unsigned int) || ((unsigned long)ptr & 3)) return -EFAULT; - return __get_user_inatomic(*ret, ptr); + pagefault_disable(); + rc = __get_user_inatomic(*ret, ptr); + pagefault_enable(); + + return rc; } -static inline void perf_callchain_user_64(struct pt_regs *regs, - struct perf_callchain_entry *entry) +static inline void perf_callchain_user_64(struct perf_callchain_entry *entry, + struct pt_regs *regs) { } @@ -445,8 +443,8 @@ static unsigned int __user *signal_frame return mctx->mc_gregs; } -static void perf_callchain_user_32(struct pt_regs *regs, - struct perf_callchain_entry *entry) +static void perf_callchain_user_32(struct perf_callchain_entry *entry, + struct pt_regs *regs) { unsigned int sp, next_sp; unsigned int next_ip; @@ -454,11 +452,8 @@ static void perf_callchain_user_32(struc long level = 0; unsigned int __user *fp, *uregs; - next_ip = regs->nip; lr = regs->link; sp = regs->gpr[1]; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, next_ip); while (entry->nr < PERF_MAX_STACK_DEPTH) { fp = (unsigned int __user *) (unsigned long) sp; @@ -480,48 +475,29 @@ static void perf_callchain_user_32(struc read_user_stack_32(&uregs[PT_R1], &sp)) return; level = 0; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, next_ip); + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_store(entry, next_ip); continue; } if (level == 0) next_ip = lr; - callchain_store(entry, next_ip); + perf_callchain_store(entry, next_ip); ++level; sp = next_sp; } } -/* - * Since we can't get PMU interrupts inside a PMU interrupt handler, - * we don't need separate irq and nmi entries here. - */ -static DEFINE_PER_CPU(struct perf_callchain_entry, callchain); - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { - struct perf_callchain_entry *entry = &__get_cpu_var(callchain); + perf_callchain_store(entry, perf_instruction_pointer(regs)); - entry->nr = 0; + if (!current->mm) + return; - if (current->pid == 0) /* idle task? */ - return entry; - - if (!user_mode(regs)) { - perf_callchain_kernel(regs, entry); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - if (current_is_64bit()) - perf_callchain_user_64(regs, entry); - else - perf_callchain_user_32(regs, entry); - } - - return entry; + if (current_is_64bit()) + perf_callchain_user_64(entry, regs); + else + perf_callchain_user_32(entry, regs); } diff -uprN linux-2.6.32/arch/powerpc/kernel/perf_event.c linux-2.6.32.ovz/arch/powerpc/kernel/perf_event.c --- linux-2.6.32/arch/powerpc/kernel/perf_event.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/perf_event.c 2015-03-10 13:24:07.000000000 -0700 @@ -35,6 +35,9 @@ struct cpu_hw_events { u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; + + unsigned int group_flag; + int n_txn_start; }; DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events); @@ -70,14 +73,27 @@ static inline u32 perf_get_misc_flags(st { return 0; } -static inline void perf_read_regs(struct pt_regs *regs) { } +static inline void perf_read_regs(struct pt_regs *regs) +{ + regs->result = 0; +} static inline int perf_intr_is_nmi(struct pt_regs *regs) { return 0; } +static inline int siar_valid(struct pt_regs *regs) +{ + return 1; +} + #endif /* CONFIG_PPC32 */ +static bool regs_use_siar(struct pt_regs *regs) +{ + return !!(regs->result & 1); +} + /* * Things that are specific to 64-bit implementations. */ @@ -87,11 +103,12 @@ static inline unsigned long perf_ip_adju { unsigned long mmcra = regs->dsisr; - if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) { + if ((ppmu->flags & PPMU_HAS_SSLOT) && (mmcra & MMCRA_SAMPLE_ENABLE)) { unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT; if (slot > 1) return 4 * (slot - 1); } + return 0; } @@ -100,48 +117,155 @@ static inline unsigned long perf_ip_adju * If we're not doing instruction sampling, give them the SDAR * (sampled data address). If we are doing instruction sampling, then * only give them the SDAR if it corresponds to the instruction - * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC - * bit in MMCRA. + * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC or + * the [POWER7P_]MMCRA_SDAR_VALID bit in MMCRA. */ static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { unsigned long mmcra = regs->dsisr; - unsigned long sdsync = (ppmu->flags & PPMU_ALT_SIPR) ? - POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC; + unsigned long sdsync; + + if (ppmu->flags & PPMU_SIAR_VALID) + sdsync = POWER7P_MMCRA_SDAR_VALID; + else if (ppmu->flags & PPMU_ALT_SIPR) + sdsync = POWER6_MMCRA_SDSYNC; + else + sdsync = MMCRA_SDSYNC; if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) *addrp = mfspr(SPRN_SDAR); } -static inline u32 perf_get_misc_flags(struct pt_regs *regs) +static bool regs_sihv(struct pt_regs *regs) { - unsigned long mmcra = regs->dsisr; unsigned long sihv = MMCRA_SIHV; - unsigned long sipr = MMCRA_SIPR; - if (TRAP(regs) != 0xf00) - return 0; /* not a PMU interrupt */ + if (ppmu->flags & PPMU_HAS_SIER) + return !!(regs->dar & SIER_SIHV); - if (ppmu->flags & PPMU_ALT_SIPR) { + if (ppmu->flags & PPMU_ALT_SIPR) sihv = POWER6_MMCRA_SIHV; + + return !!(regs->dsisr & sihv); +} + +static bool regs_sipr(struct pt_regs *regs) +{ + unsigned long sipr = MMCRA_SIPR; + + if (ppmu->flags & PPMU_HAS_SIER) + return !!(regs->dar & SIER_SIPR); + + if (ppmu->flags & PPMU_ALT_SIPR) sipr = POWER6_MMCRA_SIPR; + + return !!(regs->dsisr & sipr); +} + +static bool regs_no_sipr(struct pt_regs *regs) +{ + return !!(regs->result & 2); +} + +static inline u32 perf_flags_from_msr(struct pt_regs *regs) +{ + if (regs->msr & MSR_PR) + return PERF_RECORD_MISC_USER; + if ((regs->msr & MSR_HV) && freeze_events_kernel != MMCR0_FCHV) + return PERF_RECORD_MISC_HYPERVISOR; + return PERF_RECORD_MISC_KERNEL; +} + +static inline u32 perf_get_misc_flags(struct pt_regs *regs) +{ + bool use_siar = regs_use_siar(regs); + + if (!use_siar) + return perf_flags_from_msr(regs); + + /* + * If we don't have flags in MMCRA, rather than using + * the MSR, we intuit the flags from the address in + * SIAR which should give slightly more reliable + * results + */ + if (regs_no_sipr(regs)) { + unsigned long siar = mfspr(SPRN_SIAR); + if (siar >= PAGE_OFFSET) + return PERF_RECORD_MISC_KERNEL; + return PERF_RECORD_MISC_USER; } /* PR has priority over HV, so order below is important */ - if (mmcra & sipr) + if (regs_sipr(regs)) return PERF_RECORD_MISC_USER; - if ((mmcra & sihv) && (freeze_events_kernel != MMCR0_FCHV)) + + if (regs_sihv(regs) && (freeze_events_kernel != MMCR0_FCHV)) return PERF_RECORD_MISC_HYPERVISOR; + return PERF_RECORD_MISC_KERNEL; } /* * Overload regs->dsisr to store MMCRA so we only need to read it once * on each interrupt. + * Overload regs->dar to store SIER if we have it. + * Overload regs->result to specify whether we should use the MSR (result + * is zero) or the SIAR (result is non zero). */ static inline void perf_read_regs(struct pt_regs *regs) { - regs->dsisr = mfspr(SPRN_MMCRA); + unsigned long mmcra = mfspr(SPRN_MMCRA); + int marked = mmcra & MMCRA_SAMPLE_ENABLE; + int use_siar; + + regs->dsisr = mmcra; + regs->result = 0; + + if (ppmu->flags & PPMU_NO_SIPR) + regs->result |= 2; + + /* + * On power8 if we're in random sampling mode, the SIER is updated. + * If we're in continuous sampling mode, we don't have SIPR. + */ + if (ppmu->flags & PPMU_HAS_SIER) { + if (marked) + regs->dar = mfspr(SPRN_SIER); + else + regs->result |= 2; + } + + + /* + * If this isn't a PMU exception (eg a software event) the SIAR is + * not valid. Use pt_regs. + * + * If it is a marked event use the SIAR. + * + * If the PMU doesn't update the SIAR for non marked events use + * pt_regs. + * + * If the PMU has HV/PR flags then check to see if they + * place the exception in userspace. If so, use pt_regs. In + * continuous sampling mode the SIAR and the PMU exception are + * not synchronised, so they may be many instructions apart. + * This can result in confusing backtraces. We still want + * hypervisor samples as well as samples in the kernel with + * interrupts off hence the userspace check. + */ + if (TRAP(regs) != 0xf00) + use_siar = 0; + else if (marked) + use_siar = 1; + else if ((ppmu->flags & PPMU_NO_CONT_SAMPLING)) + use_siar = 0; + else if (!regs_no_sipr(regs) && regs_sipr(regs)) + use_siar = 0; + else + use_siar = 1; + + regs->result |= use_siar; } /* @@ -153,6 +277,24 @@ static inline int perf_intr_is_nmi(struc return !regs->softe; } +/* + * On processors like P7+ that have the SIAR-Valid bit, marked instructions + * must be sampled only if the SIAR-valid bit is set. + * + * For unmarked instructions and for processors that don't have the SIAR-Valid + * bit, assume that SIAR is valid. + */ +static inline int siar_valid(struct pt_regs *regs) +{ + unsigned long mmcra = regs->dsisr; + int marked = mmcra & MMCRA_SAMPLE_ENABLE; + + if ((ppmu->flags & PPMU_SIAR_VALID) && marked) + return mmcra & POWER7P_MMCRA_SIAR_VALID; + + return 1; +} + #endif /* CONFIG_PPC64 */ static void perf_event_interrupt(struct pt_regs *regs); @@ -395,10 +537,32 @@ static int check_excludes(struct perf_ev return 0; } +static u64 check_and_compute_delta(u64 prev, u64 val) +{ + u64 delta = (val - prev) & 0xfffffffful; + + /* + * POWER7 can roll back counter values, if the new value is smaller + * than the previous value it will cause the delta and the counter to + * have bogus values unless we rolled a counter over. If a coutner is + * rolled back, it will be smaller, but within 256, which is the maximum + * number of events to rollback at once. If we dectect a rollback + * return 0. This can lead to a small lack of precision in the + * counters. + */ + if (prev > val && (prev - val) < 256) + delta = 0; + + return delta; +} + static void power_pmu_read(struct perf_event *event) { s64 val, delta, prev; + if (event->hw.state & PERF_HES_STOPPED) + return; + if (!event->hw.idx) return; /* @@ -407,15 +571,16 @@ static void power_pmu_read(struct perf_e * Therefore we treat them like NMIs. */ do { - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); barrier(); val = read_pmc(event->hw.idx); - } while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev); + delta = check_and_compute_delta(prev, val); + if (!delta) + return; + } while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev); - /* The counters are only 32 bits wide */ - delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &event->hw.period_left); + local64_add(delta, &event->count); + local64_sub(delta, &event->hw.period_left); } /* @@ -441,10 +606,11 @@ static void freeze_limited_counters(stru if (!event->hw.idx) continue; val = (event->hw.idx == 5) ? pmc5 : pmc6; - prev = atomic64_read(&event->hw.prev_count); + prev = local64_read(&event->hw.prev_count); event->hw.idx = 0; - delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); + delta = check_and_compute_delta(prev, val); + if (delta) + local64_add(delta, &event->count); } } @@ -452,14 +618,16 @@ static void thaw_limited_counters(struct unsigned long pmc5, unsigned long pmc6) { struct perf_event *event; - u64 val; + u64 val, prev; int i; for (i = 0; i < cpuhw->n_limited; ++i) { event = cpuhw->limited_counter[i]; event->hw.idx = cpuhw->limited_hwidx[i]; val = (event->hw.idx == 5) ? pmc5 : pmc6; - atomic64_set(&event->hw.prev_count, val); + prev = local64_read(&event->hw.prev_count); + if (check_and_compute_delta(prev, val)) + local64_set(&event->hw.prev_count, val); perf_event_update_userpage(event); } } @@ -514,7 +682,7 @@ static void write_mmcr0(struct cpu_hw_ev * Disable all events to prevent PMU interrupts and to allow * events to be added or removed. */ -void hw_perf_disable(void) +static void power_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuhw; unsigned long flags; @@ -562,7 +730,7 @@ void hw_perf_disable(void) * If we were previously disabled and events were added, then * put the new config on the PMU. */ -void hw_perf_enable(void) +static void power_pmu_enable(struct pmu *pmu) { struct perf_event *event; struct cpu_hw_events *cpuhw; @@ -663,12 +831,14 @@ void hw_perf_enable(void) } val = 0; if (event->hw.sample_period) { - left = atomic64_read(&event->hw.period_left); + left = local64_read(&event->hw.period_left); if (left < 0x80000000L) val = 0x80000000L - left; } - atomic64_set(&event->hw.prev_count, val); + local64_set(&event->hw.prev_count, val); event->hw.idx = idx; + if (event->hw.state & PERF_HES_STOPPED) + val = 0; write_pmc(idx, val); perf_event_update_userpage(event); } @@ -718,73 +888,13 @@ static int collect_events(struct perf_ev return n; } -static void event_sched_in(struct perf_event *event, int cpu) -{ - event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = cpu; - event->tstamp_running += event->ctx->time - event->tstamp_stopped; - if (is_software_event(event)) - event->pmu->enable(event); -} - -/* - * Called to enable a whole group of events. - * Returns 1 if the group was enabled, or -EAGAIN if it could not be. - * Assumes the caller has disabled interrupts and has - * frozen the PMU with hw_perf_save_disable. - */ -int hw_perf_group_sched_in(struct perf_event *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx, int cpu) -{ - struct cpu_hw_events *cpuhw; - long i, n, n0; - struct perf_event *sub; - - if (!ppmu) - return 0; - cpuhw = &__get_cpu_var(cpu_hw_events); - n0 = cpuhw->n_events; - n = collect_events(group_leader, ppmu->n_counter - n0, - &cpuhw->event[n0], &cpuhw->events[n0], - &cpuhw->flags[n0]); - if (n < 0) - return -EAGAIN; - if (check_excludes(cpuhw->event, cpuhw->flags, n0, n)) - return -EAGAIN; - i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n + n0); - if (i < 0) - return -EAGAIN; - cpuhw->n_events = n0 + n; - cpuhw->n_added += n; - - /* - * OK, this group can go on; update event states etc., - * and enable any software events - */ - for (i = n0; i < n0 + n; ++i) - cpuhw->event[i]->hw.config = cpuhw->events[i]; - cpuctx->active_oncpu += n; - n = 1; - event_sched_in(group_leader, cpu); - list_for_each_entry(sub, &group_leader->sibling_list, group_entry) { - if (sub->state != PERF_EVENT_STATE_OFF) { - event_sched_in(sub, cpu); - ++n; - } - } - ctx->nr_active += n; - - return 1; -} - /* * Add a event to the PMU. * If all events are not already frozen, then we disable and * re-enable the PMU in order to get hw_perf_enable to do the * actual work of reconfiguring the PMU. */ -static int power_pmu_enable(struct perf_event *event) +static int power_pmu_add(struct perf_event *event, int ef_flags) { struct cpu_hw_events *cpuhw; unsigned long flags; @@ -792,7 +902,7 @@ static int power_pmu_enable(struct perf_ int ret = -EAGAIN; local_irq_save(flags); - perf_disable(); + perf_pmu_disable(event->pmu); /* * Add the event to the list (if there is room) @@ -805,18 +915,39 @@ static int power_pmu_enable(struct perf_ cpuhw->event[n0] = event; cpuhw->events[n0] = event->hw.config; cpuhw->flags[n0] = event->hw.event_base; + + /* + * This event may have been disabled/stopped in record_and_restart() + * because we exceeded the ->event_limit. If re-starting the event, + * clear the ->hw.state (STOPPED and UPTODATE flags), so the user + * notification is re-enabled. + */ + if (!(ef_flags & PERF_EF_START)) + event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + else + event->hw.state = 0; + + /* + * If group events scheduling transaction was started, + * skip the schedulability test here, it will be peformed + * at commit time(->commit_txn) as a whole + */ + if (cpuhw->group_flag & PERF_EVENT_TXN) + goto nocheck; + if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1)) goto out; if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1)) goto out; - event->hw.config = cpuhw->events[n0]; + +nocheck: ++cpuhw->n_events; ++cpuhw->n_added; ret = 0; out: - perf_enable(); + perf_pmu_enable(event->pmu); local_irq_restore(flags); return ret; } @@ -824,22 +955,25 @@ static int power_pmu_enable(struct perf_ /* * Remove a event from the PMU. */ -static void power_pmu_disable(struct perf_event *event) +static void power_pmu_del(struct perf_event *event, int ef_flags) { struct cpu_hw_events *cpuhw; long i; unsigned long flags; local_irq_save(flags); - perf_disable(); + perf_pmu_disable(event->pmu); power_pmu_read(event); cpuhw = &__get_cpu_var(cpu_hw_events); for (i = 0; i < cpuhw->n_events; ++i) { if (event == cpuhw->event[i]) { - while (++i < cpuhw->n_events) + while (++i < cpuhw->n_events) { cpuhw->event[i-1] = cpuhw->event[i]; + cpuhw->events[i-1] = cpuhw->events[i]; + cpuhw->flags[i-1] = cpuhw->flags[i]; + } --cpuhw->n_events; ppmu->disable_pmc(event->hw.idx - 1, cpuhw->mmcr); if (event->hw.idx) { @@ -865,43 +999,123 @@ static void power_pmu_disable(struct per cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); } - perf_enable(); + perf_pmu_enable(event->pmu); local_irq_restore(flags); } /* - * Re-enable interrupts on a event after they were throttled - * because they were coming too fast. + * POWER-PMU does not support disabling individual counters, hence + * program their cycle counter to their max value and ignore the interrupts. */ -static void power_pmu_unthrottle(struct perf_event *event) + +static void power_pmu_start(struct perf_event *event, int ef_flags) { - s64 val, left; unsigned long flags; + s64 left; + unsigned long val; if (!event->hw.idx || !event->hw.sample_period) return; + + if (!(event->hw.state & PERF_HES_STOPPED)) + return; + + if (ef_flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + local_irq_save(flags); - perf_disable(); - power_pmu_read(event); - left = event->hw.sample_period; - event->hw.last_period = left; + perf_pmu_disable(event->pmu); + + event->hw.state = 0; + left = local64_read(&event->hw.period_left); + val = 0; if (left < 0x80000000L) val = 0x80000000L - left; + write_pmc(event->hw.idx, val); - atomic64_set(&event->hw.prev_count, val); - atomic64_set(&event->hw.period_left, left); + perf_event_update_userpage(event); - perf_enable(); + perf_pmu_enable(event->pmu); local_irq_restore(flags); } -struct pmu power_pmu = { - .enable = power_pmu_enable, - .disable = power_pmu_disable, - .read = power_pmu_read, - .unthrottle = power_pmu_unthrottle, -}; +static void power_pmu_stop(struct perf_event *event, int ef_flags) +{ + unsigned long flags; + + if (!event->hw.idx || !event->hw.sample_period) + return; + + if (event->hw.state & PERF_HES_STOPPED) + return; + + local_irq_save(flags); + perf_pmu_disable(event->pmu); + + power_pmu_read(event); + event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; + write_pmc(event->hw.idx, 0); + + perf_event_update_userpage(event); + perf_pmu_enable(event->pmu); + local_irq_restore(flags); +} + +/* + * Start group events scheduling transaction + * Set the flag to make pmu::enable() not perform the + * schedulability test, it will be performed at commit time + */ +void power_pmu_start_txn(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + + perf_pmu_disable(pmu); + cpuhw->group_flag |= PERF_EVENT_TXN; + cpuhw->n_txn_start = cpuhw->n_events; +} + +/* + * Stop group events scheduling transaction + * Clear the flag and pmu::enable() will perform the + * schedulability test. + */ +void power_pmu_cancel_txn(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + + cpuhw->group_flag &= ~PERF_EVENT_TXN; + perf_pmu_enable(pmu); +} + +/* + * Commit group events scheduling transaction + * Perform the group schedulability test as a whole + * Return 0 if success + */ +int power_pmu_commit_txn(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw; + long i, n; + + if (!ppmu) + return -EAGAIN; + cpuhw = &__get_cpu_var(cpu_hw_events); + n = cpuhw->n_events; + if (check_excludes(cpuhw->event, cpuhw->flags, 0, n)) + return -EAGAIN; + i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n); + if (i < 0) + return -EAGAIN; + + for (i = cpuhw->n_txn_start; i < n; ++i) + cpuhw->event[i]->hw.config = cpuhw->events[i]; + + cpuhw->group_flag &= ~PERF_EVENT_TXN; + perf_pmu_enable(pmu); + return 0; +} /* * Return 1 if we might be able to put event on a limited PMC, @@ -1003,7 +1217,7 @@ static int hw_perf_cache_event(u64 confi return 0; } -const struct pmu *hw_perf_event_init(struct perf_event *event) +static int power_pmu_event_init(struct perf_event *event) { u64 ev; unsigned long flags; @@ -1015,25 +1229,31 @@ const struct pmu *hw_perf_event_init(str struct cpu_hw_events *cpuhw; if (!ppmu) - return ERR_PTR(-ENXIO); + return -ENOENT; + + /* does not support taken branch sampling */ + if (has_branch_stack(event)) + return -EOPNOTSUPP; + switch (event->attr.type) { case PERF_TYPE_HARDWARE: ev = event->attr.config; if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) - return ERR_PTR(-EOPNOTSUPP); + return -EOPNOTSUPP; ev = ppmu->generic_events[ev]; break; case PERF_TYPE_HW_CACHE: err = hw_perf_cache_event(event->attr.config, &ev); if (err) - return ERR_PTR(err); + return err; break; case PERF_TYPE_RAW: ev = event->attr.config; break; default: - return ERR_PTR(-EINVAL); + return -ENOENT; } + event->hw.config_base = ev; event->hw.idx = 0; @@ -1052,7 +1272,7 @@ const struct pmu *hw_perf_event_init(str * XXX we should check if the task is an idle task. */ flags = 0; - if (event->ctx->task) + if (event->attach_state & PERF_ATTACH_TASK) flags |= PPMU_ONLY_COUNT_RUN; /* @@ -1070,7 +1290,7 @@ const struct pmu *hw_perf_event_init(str */ ev = normal_pmc_alternative(ev, flags); if (!ev) - return ERR_PTR(-EINVAL); + return -EINVAL; } } @@ -1084,24 +1304,24 @@ const struct pmu *hw_perf_event_init(str n = collect_events(event->group_leader, ppmu->n_counter - 1, ctrs, events, cflags); if (n < 0) - return ERR_PTR(-EINVAL); + return -EINVAL; } events[n] = ev; ctrs[n] = event; cflags[n] = flags; if (check_excludes(ctrs, cflags, n, 1)) - return ERR_PTR(-EINVAL); + return -EINVAL; cpuhw = &get_cpu_var(cpu_hw_events); err = power_check_constraints(cpuhw, events, cflags, n + 1); put_cpu_var(cpu_hw_events); if (err) - return ERR_PTR(-EINVAL); + return -EINVAL; event->hw.config = events[n]; event->hw.event_base = cflags[n]; event->hw.last_period = event->hw.sample_period; - atomic64_set(&event->hw.period_left, event->hw.last_period); + local64_set(&event->hw.period_left, event->hw.last_period); /* * See if we need to reserve the PMU. @@ -1121,74 +1341,91 @@ const struct pmu *hw_perf_event_init(str } event->destroy = hw_perf_event_destroy; - if (err) - return ERR_PTR(err); - return &power_pmu; + return err; } +static int power_pmu_event_idx(struct perf_event *event) +{ + return event->hw.idx; +} + +struct pmu power_pmu = { + .pmu_enable = power_pmu_enable, + .pmu_disable = power_pmu_disable, + .event_init = power_pmu_event_init, + .add = power_pmu_add, + .del = power_pmu_del, + .start = power_pmu_start, + .stop = power_pmu_stop, + .read = power_pmu_read, + .start_txn = power_pmu_start_txn, + .cancel_txn = power_pmu_cancel_txn, + .commit_txn = power_pmu_commit_txn, + .event_idx = power_pmu_event_idx, +}; + + /* * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled * here so there is no possibility of being interrupted. */ static void record_and_restart(struct perf_event *event, unsigned long val, - struct pt_regs *regs, int nmi) + struct pt_regs *regs) { u64 period = event->hw.sample_period; s64 prev, delta, left; int record = 0; + if (event->hw.state & PERF_HES_STOPPED) { + write_pmc(event->hw.idx, 0); + return; + } + /* we don't have to worry about interrupts here */ - prev = atomic64_read(&event->hw.prev_count); - delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &event->count); + prev = local64_read(&event->hw.prev_count); + delta = check_and_compute_delta(prev, val); + local64_add(delta, &event->count); /* * See if the total period for this event has expired, * and update for the next period. */ val = 0; - left = atomic64_read(&event->hw.period_left) - delta; + left = local64_read(&event->hw.period_left) - delta; + if (delta == 0) + left++; if (period) { if (left <= 0) { left += period; if (left <= 0) left = period; - record = 1; + record = siar_valid(regs); + event->hw.last_period = event->hw.sample_period; } if (left < 0x80000000LL) val = 0x80000000LL - left; } + write_pmc(event->hw.idx, val); + local64_set(&event->hw.prev_count, val); + local64_set(&event->hw.period_left, left); + perf_event_update_userpage(event); + /* * Finally record data if requested. */ if (record) { - struct perf_sample_data data = { - .addr = 0, - .period = event->hw.last_period, - }; + struct perf_sample_data data; + + perf_sample_data_init(&data, ~0ULL, event->hw.last_period); if (event->attr.sample_type & PERF_SAMPLE_ADDR) perf_get_data_addr(regs, &data.addr); - if (perf_event_overflow(event, nmi, &data, regs)) { - /* - * Interrupts are coming too fast - throttle them - * by setting the event to 0, so it will be - * at least 2^30 cycles until the next interrupt - * (assuming each event counts at most 2 counts - * per cycle). - */ - val = 0; - left = ~0ULL >> 1; - } + if (perf_event_overflow(event, &data, regs)) + power_pmu_stop(event, 0); } - - write_pmc(event->hw.idx, val); - atomic64_set(&event->hw.prev_count, val); - atomic64_set(&event->hw.period_left, left); - perf_event_update_userpage(event); } /* @@ -1211,13 +1448,41 @@ unsigned long perf_misc_flags(struct pt_ */ unsigned long perf_instruction_pointer(struct pt_regs *regs) { - unsigned long ip; + bool use_siar = regs_use_siar(regs); - if (TRAP(regs) != 0xf00) - return regs->nip; /* not a PMU interrupt */ + if (use_siar && siar_valid(regs)) + return mfspr(SPRN_SIAR) + perf_ip_adjust(regs); + else if (use_siar) + return 0; // no valid instruction pointer + else + return regs->nip; +} + +static bool pmc_overflow_power7(unsigned long val) +{ + /* + * Events on POWER7 can roll back if a speculative event doesn't + * eventually complete. Unfortunately in some rare cases they will + * raise a performance monitor exception. We need to catch this to + * ensure we reset the PMC. In all cases the PMC will be 256 or less + * cycles from overflow. + * + * We only do this if the first pass fails to find any overflowing + * PMCs because a user might set a period of less than 256 and we + * don't want to mistakenly reset them. + */ + if ((0x80000000 - val) <= 256) + return true; + + return false; +} + +static bool pmc_overflow(unsigned long val) +{ + if ((int)val < 0) + return true; - ip = mfspr(SPRN_SIAR) + perf_ip_adjust(regs); - return ip; + return false; } /* @@ -1225,11 +1490,11 @@ unsigned long perf_instruction_pointer(s */ static void perf_event_interrupt(struct pt_regs *regs) { - int i; + int i, j; struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); struct perf_event *event; - unsigned long val; - int found = 0; + unsigned long val[8]; + int found, active; int nmi; if (cpuhw->n_limited) @@ -1244,33 +1509,54 @@ static void perf_event_interrupt(struct else irq_enter(); - for (i = 0; i < cpuhw->n_events; ++i) { - event = cpuhw->event[i]; - if (!event->hw.idx || is_limited_pmc(event->hw.idx)) + /* Read all the PMCs since we'll need them a bunch of times */ + for (i = 0; i < ppmu->n_counter; ++i) + val[i] = read_pmc(i + 1); + + /* Try to find what caused the IRQ */ + found = 0; + for (i = 0; i < ppmu->n_counter; ++i) { + if (!pmc_overflow(val[i])) continue; - val = read_pmc(event->hw.idx); - if ((int)val < 0) { - /* event has overflowed */ - found = 1; - record_and_restart(event, val, regs, nmi); + if (is_limited_pmc(i + 1)) + continue; /* these won't generate IRQs */ + /* + * We've found one that's overflowed. For active + * counters we need to log this. For inactive + * counters, we need to reset it anyway + */ + found = 1; + active = 0; + for (j = 0; j < cpuhw->n_events; ++j) { + event = cpuhw->event[j]; + if (event->hw.idx == (i + 1)) { + active = 1; + record_and_restart(event, val[i], regs); + break; + } } + if (!active) + /* reset non active counters that have overflowed */ + write_pmc(i + 1, 0); } - /* - * In case we didn't find and reset the event that caused - * the interrupt, scan all events and reset any that are - * negative, to avoid getting continual interrupts. - * Any that we processed in the previous loop will not be negative. - */ - if (!found) { - for (i = 0; i < ppmu->n_counter; ++i) { - if (is_limited_pmc(i + 1)) + if (!found && __is_processor(PV_POWER7)) { + /* check active counters for special buggy p7 overflow */ + for (i = 0; i < cpuhw->n_events; ++i) { + event = cpuhw->event[i]; + if (!event->hw.idx || is_limited_pmc(event->hw.idx)) continue; - val = read_pmc(i + 1); - if ((int)val < 0) - write_pmc(i + 1, 0); + if (pmc_overflow_power7(val[event->hw.idx - 1])) { + /* event has overflowed in a buggy way*/ + found = 1; + record_and_restart(event, + val[event->hw.idx - 1], + regs); + } } } + if ((!found) && printk_ratelimit()) + printk(KERN_WARNING "Can't find PMC that caused IRQ\n"); /* * Reset MMCR0 to its normal value. This will set PMXE and @@ -1287,7 +1573,7 @@ static void perf_event_interrupt(struct irq_exit(); } -void hw_perf_event_setup(int cpu) +static void power_pmu_setup(int cpu) { struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); @@ -1297,6 +1583,23 @@ void hw_perf_event_setup(int cpu) cpuhw->mmcr[0] = MMCR0_FC; } +static int __cpuinit +power_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + power_pmu_setup(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + int register_power_pmu(struct power_pmu *pmu) { if (ppmu) @@ -1314,5 +1617,8 @@ int register_power_pmu(struct power_pmu freeze_events_kernel = MMCR0_FCHV; #endif /* CONFIG_PPC64 */ + perf_pmu_register(&power_pmu, "cpu", PERF_TYPE_RAW); + perf_cpu_notifier(power_pmu_notifier); + return 0; } diff -uprN linux-2.6.32/arch/powerpc/kernel/perf_event_fsl_emb.c linux-2.6.32.ovz/arch/powerpc/kernel/perf_event_fsl_emb.c --- linux-2.6.32/arch/powerpc/kernel/perf_event_fsl_emb.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/perf_event_fsl_emb.c 2015-03-10 13:23:53.000000000 -0700 @@ -0,0 +1,687 @@ +/* + * Performance event support - Freescale Embedded Performance Monitor + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * Copyright 2010 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct cpu_hw_events { + int n_events; + int disabled; + u8 pmcs_enabled; + struct perf_event *event[MAX_HWEVENTS]; +}; +static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events); + +static struct fsl_emb_pmu *ppmu; + +/* Number of perf_events counting hardware events */ +static atomic_t num_events; +/* Used to avoid races in calling reserve/release_pmc_hardware */ +static DEFINE_MUTEX(pmc_reserve_mutex); + +/* + * If interrupts were soft-disabled when a PMU interrupt occurs, treat + * it as an NMI. + */ +static inline int perf_intr_is_nmi(struct pt_regs *regs) +{ +#ifdef __powerpc64__ + return !regs->softe; +#else + return 0; +#endif +} + +static void perf_event_interrupt(struct pt_regs *regs); + +/* + * Read one performance monitor counter (PMC). + */ +static unsigned long read_pmc(int idx) +{ + unsigned long val; + + switch (idx) { + case 0: + val = mfpmr(PMRN_PMC0); + break; + case 1: + val = mfpmr(PMRN_PMC1); + break; + case 2: + val = mfpmr(PMRN_PMC2); + break; + case 3: + val = mfpmr(PMRN_PMC3); + break; + default: + printk(KERN_ERR "oops trying to read PMC%d\n", idx); + val = 0; + } + return val; +} + +/* + * Write one PMC. + */ +static void write_pmc(int idx, unsigned long val) +{ + switch (idx) { + case 0: + mtpmr(PMRN_PMC0, val); + break; + case 1: + mtpmr(PMRN_PMC1, val); + break; + case 2: + mtpmr(PMRN_PMC2, val); + break; + case 3: + mtpmr(PMRN_PMC3, val); + break; + default: + printk(KERN_ERR "oops trying to write PMC%d\n", idx); + } + + isync(); +} + +/* + * Write one local control A register + */ +static void write_pmlca(int idx, unsigned long val) +{ + switch (idx) { + case 0: + mtpmr(PMRN_PMLCA0, val); + break; + case 1: + mtpmr(PMRN_PMLCA1, val); + break; + case 2: + mtpmr(PMRN_PMLCA2, val); + break; + case 3: + mtpmr(PMRN_PMLCA3, val); + break; + default: + printk(KERN_ERR "oops trying to write PMLCA%d\n", idx); + } + + isync(); +} + +/* + * Write one local control B register + */ +static void write_pmlcb(int idx, unsigned long val) +{ + switch (idx) { + case 0: + mtpmr(PMRN_PMLCB0, val); + break; + case 1: + mtpmr(PMRN_PMLCB1, val); + break; + case 2: + mtpmr(PMRN_PMLCB2, val); + break; + case 3: + mtpmr(PMRN_PMLCB3, val); + break; + default: + printk(KERN_ERR "oops trying to write PMLCB%d\n", idx); + } + + isync(); +} + +static void fsl_emb_pmu_read(struct perf_event *event) +{ + s64 val, delta, prev; + + if (event->hw.state & PERF_HES_STOPPED) + return; + + /* + * Performance monitor interrupts come even when interrupts + * are soft-disabled, as long as interrupts are hard-enabled. + * Therefore we treat them like NMIs. + */ + do { + prev = local64_read(&event->hw.prev_count); + barrier(); + val = read_pmc(event->hw.idx); + } while (local64_cmpxchg(&event->hw.prev_count, prev, val) != prev); + + /* The counters are only 32 bits wide */ + delta = (val - prev) & 0xfffffffful; + local64_add(delta, &event->count); + local64_sub(delta, &event->hw.period_left); +} + +/* + * Disable all events to prevent PMU interrupts and to allow + * events to be added or removed. + */ +static void fsl_emb_pmu_disable(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw; + unsigned long flags; + + local_irq_save(flags); + cpuhw = &__get_cpu_var(cpu_hw_events); + + if (!cpuhw->disabled) { + cpuhw->disabled = 1; + + /* + * Check if we ever enabled the PMU on this cpu. + */ + if (!cpuhw->pmcs_enabled) { + ppc_enable_pmcs(); + cpuhw->pmcs_enabled = 1; + } + + if (atomic_read(&num_events)) { + /* + * Set the 'freeze all counters' bit, and disable + * interrupts. The barrier is to make sure the + * mtpmr has been executed and the PMU has frozen + * the events before we return. + */ + + mtpmr(PMRN_PMGC0, PMGC0_FAC); + isync(); + } + } + local_irq_restore(flags); +} + +/* + * Re-enable all events if disable == 0. + * If we were previously disabled and events were added, then + * put the new config on the PMU. + */ +static void fsl_emb_pmu_enable(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw; + unsigned long flags; + + local_irq_save(flags); + cpuhw = &__get_cpu_var(cpu_hw_events); + if (!cpuhw->disabled) + goto out; + + cpuhw->disabled = 0; + ppc_set_pmu_inuse(cpuhw->n_events != 0); + + if (cpuhw->n_events > 0) { + mtpmr(PMRN_PMGC0, PMGC0_PMIE | PMGC0_FCECE); + isync(); + } + + out: + local_irq_restore(flags); +} + +static int collect_events(struct perf_event *group, int max_count, + struct perf_event *ctrs[]) +{ + int n = 0; + struct perf_event *event; + + if (!is_software_event(group)) { + if (n >= max_count) + return -1; + ctrs[n] = group; + n++; + } + list_for_each_entry(event, &group->sibling_list, group_entry) { + if (!is_software_event(event) && + event->state != PERF_EVENT_STATE_OFF) { + if (n >= max_count) + return -1; + ctrs[n] = event; + n++; + } + } + return n; +} + +/* context locked on entry */ +static int fsl_emb_pmu_add(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuhw; + int ret = -EAGAIN; + int num_counters = ppmu->n_counter; + u64 val; + int i; + + perf_pmu_disable(event->pmu); + cpuhw = &get_cpu_var(cpu_hw_events); + + if (event->hw.config & FSL_EMB_EVENT_RESTRICTED) + num_counters = ppmu->n_restricted; + + /* + * Allocate counters from top-down, so that restricted-capable + * counters are kept free as long as possible. + */ + for (i = num_counters - 1; i >= 0; i--) { + if (cpuhw->event[i]) + continue; + + break; + } + + if (i < 0) + goto out; + + event->hw.idx = i; + cpuhw->event[i] = event; + ++cpuhw->n_events; + + val = 0; + if (event->hw.sample_period) { + s64 left = local64_read(&event->hw.period_left); + if (left < 0x80000000L) + val = 0x80000000L - left; + } + local64_set(&event->hw.prev_count, val); + + if (!(flags & PERF_EF_START)) { + event->hw.state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + val = 0; + } + + write_pmc(i, val); + perf_event_update_userpage(event); + + write_pmlcb(i, event->hw.config >> 32); + write_pmlca(i, event->hw.config_base); + + ret = 0; + out: + put_cpu_var(cpu_hw_events); + perf_pmu_enable(event->pmu); + return ret; +} + +/* context locked on entry */ +static void fsl_emb_pmu_del(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuhw; + int i = event->hw.idx; + + perf_pmu_disable(event->pmu); + if (i < 0) + goto out; + + fsl_emb_pmu_read(event); + + cpuhw = &get_cpu_var(cpu_hw_events); + + WARN_ON(event != cpuhw->event[event->hw.idx]); + + write_pmlca(i, 0); + write_pmlcb(i, 0); + write_pmc(i, 0); + + cpuhw->event[i] = NULL; + event->hw.idx = -1; + + /* + * TODO: if at least one restricted event exists, and we + * just freed up a non-restricted-capable counter, and + * there is a restricted-capable counter occupied by + * a non-restricted event, migrate that event to the + * vacated counter. + */ + + cpuhw->n_events--; + + out: + perf_pmu_enable(event->pmu); + put_cpu_var(cpu_hw_events); +} + +static void fsl_emb_pmu_start(struct perf_event *event, int ef_flags) +{ + unsigned long flags; + s64 left; + + if (event->hw.idx < 0 || !event->hw.sample_period) + return; + + if (!(event->hw.state & PERF_HES_STOPPED)) + return; + + if (ef_flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + + local_irq_save(flags); + perf_pmu_disable(event->pmu); + + event->hw.state = 0; + left = local64_read(&event->hw.period_left); + write_pmc(event->hw.idx, left); + + perf_event_update_userpage(event); + perf_pmu_enable(event->pmu); + local_irq_restore(flags); +} + +static void fsl_emb_pmu_stop(struct perf_event *event, int ef_flags) +{ + unsigned long flags; + + if (event->hw.idx < 0 || !event->hw.sample_period) + return; + + if (event->hw.state & PERF_HES_STOPPED) + return; + + local_irq_save(flags); + perf_pmu_disable(event->pmu); + + fsl_emb_pmu_read(event); + event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; + write_pmc(event->hw.idx, 0); + + perf_event_update_userpage(event); + perf_pmu_enable(event->pmu); + local_irq_restore(flags); +} + +/* + * Release the PMU if this is the last perf_event. + */ +static void hw_perf_event_destroy(struct perf_event *event) +{ + if (!atomic_add_unless(&num_events, -1, 1)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_dec_return(&num_events) == 0) + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + +/* + * Translate a generic cache event_id config to a raw event_id code. + */ +static int hw_perf_cache_event(u64 config, u64 *eventp) +{ + unsigned long type, op, result; + int ev; + + if (!ppmu->cache_events) + return -EINVAL; + + /* unpack config */ + type = config & 0xff; + op = (config >> 8) & 0xff; + result = (config >> 16) & 0xff; + + if (type >= PERF_COUNT_HW_CACHE_MAX || + op >= PERF_COUNT_HW_CACHE_OP_MAX || + result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + ev = (*ppmu->cache_events)[type][op][result]; + if (ev == 0) + return -EOPNOTSUPP; + if (ev == -1) + return -EINVAL; + *eventp = ev; + return 0; +} + +static int fsl_emb_pmu_event_init(struct perf_event *event) +{ + u64 ev; + struct perf_event *events[MAX_HWEVENTS]; + int n; + int err; + int num_restricted; + int i; + + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + ev = event->attr.config; + if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) + return -EOPNOTSUPP; + ev = ppmu->generic_events[ev]; + break; + + case PERF_TYPE_HW_CACHE: + err = hw_perf_cache_event(event->attr.config, &ev); + if (err) + return err; + break; + + case PERF_TYPE_RAW: + ev = event->attr.config; + break; + + default: + return -ENOENT; + } + + event->hw.config = ppmu->xlate_event(ev); + if (!(event->hw.config & FSL_EMB_EVENT_VALID)) + return -EINVAL; + + /* + * If this is in a group, check if it can go on with all the + * other hardware events in the group. We assume the event + * hasn't been linked into its leader's sibling list at this point. + */ + n = 0; + if (event->group_leader != event) { + n = collect_events(event->group_leader, + ppmu->n_counter - 1, events); + if (n < 0) + return -EINVAL; + } + + if (event->hw.config & FSL_EMB_EVENT_RESTRICTED) { + num_restricted = 0; + for (i = 0; i < n; i++) { + if (events[i]->hw.config & FSL_EMB_EVENT_RESTRICTED) + num_restricted++; + } + + if (num_restricted >= ppmu->n_restricted) + return -EINVAL; + } + + event->hw.idx = -1; + + event->hw.config_base = PMLCA_CE | PMLCA_FCM1 | + (u32)((ev << 16) & PMLCA_EVENT_MASK); + + if (event->attr.exclude_user) + event->hw.config_base |= PMLCA_FCU; + if (event->attr.exclude_kernel) + event->hw.config_base |= PMLCA_FCS; + if (event->attr.exclude_idle) + return -ENOTSUPP; + + event->hw.last_period = event->hw.sample_period; + local64_set(&event->hw.period_left, event->hw.last_period); + + /* + * See if we need to reserve the PMU. + * If no events are currently in use, then we have to take a + * mutex to ensure that we don't race with another task doing + * reserve_pmc_hardware or release_pmc_hardware. + */ + err = 0; + if (!atomic_inc_not_zero(&num_events)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&num_events) == 0 && + reserve_pmc_hardware(perf_event_interrupt)) + err = -EBUSY; + else + atomic_inc(&num_events); + mutex_unlock(&pmc_reserve_mutex); + + mtpmr(PMRN_PMGC0, PMGC0_FAC); + isync(); + } + event->destroy = hw_perf_event_destroy; + + return err; +} + +static struct pmu fsl_emb_pmu = { + .pmu_enable = fsl_emb_pmu_enable, + .pmu_disable = fsl_emb_pmu_disable, + .event_init = fsl_emb_pmu_event_init, + .add = fsl_emb_pmu_add, + .del = fsl_emb_pmu_del, + .start = fsl_emb_pmu_start, + .stop = fsl_emb_pmu_stop, + .read = fsl_emb_pmu_read, +}; + +/* + * A counter has overflowed; update its count and record + * things if requested. Note that interrupts are hard-disabled + * here so there is no possibility of being interrupted. + */ +static void record_and_restart(struct perf_event *event, unsigned long val, + struct pt_regs *regs) +{ + u64 period = event->hw.sample_period; + s64 prev, delta, left; + int record = 0; + + if (event->hw.state & PERF_HES_STOPPED) { + write_pmc(event->hw.idx, 0); + return; + } + + /* we don't have to worry about interrupts here */ + prev = local64_read(&event->hw.prev_count); + delta = (val - prev) & 0xfffffffful; + local64_add(delta, &event->count); + + /* + * See if the total period for this event has expired, + * and update for the next period. + */ + val = 0; + left = local64_read(&event->hw.period_left) - delta; + if (period) { + if (left <= 0) { + left += period; + if (left <= 0) + left = period; + record = 1; + } + if (left < 0x80000000LL) + val = 0x80000000LL - left; + } + + write_pmc(event->hw.idx, val); + local64_set(&event->hw.prev_count, val); + local64_set(&event->hw.period_left, left); + perf_event_update_userpage(event); + + /* + * Finally record data if requested. + */ + if (record) { + struct perf_sample_data data; + + perf_sample_data_init(&data, 0); + data.period = event->hw.last_period; + + if (perf_event_overflow(event, &data, regs)) + fsl_emb_pmu_stop(event, 0); + } +} + +static void perf_event_interrupt(struct pt_regs *regs) +{ + int i; + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + struct perf_event *event; + unsigned long val; + int found = 0; + int nmi; + + nmi = perf_intr_is_nmi(regs); + if (nmi) + nmi_enter(); + else + irq_enter(); + + for (i = 0; i < ppmu->n_counter; ++i) { + event = cpuhw->event[i]; + + val = read_pmc(i); + if ((int)val < 0) { + if (event) { + /* event has overflowed */ + found = 1; + record_and_restart(event, val, regs); + } else { + /* + * Disabled counter is negative, + * reset it just in case. + */ + write_pmc(i, 0); + } + } + } + + /* PMM will keep counters frozen until we return from the interrupt. */ + mtmsr(mfmsr() | MSR_PMM); + mtpmr(PMRN_PMGC0, PMGC0_PMIE | PMGC0_FCECE); + isync(); + + if (nmi) + nmi_exit(); + else + irq_exit(); +} + +void hw_perf_event_setup(int cpu) +{ + struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); + + memset(cpuhw, 0, sizeof(*cpuhw)); +} + +int register_fsl_emb_pmu(struct fsl_emb_pmu *pmu) +{ + if (ppmu) + return -EBUSY; /* something's already registered */ + + ppmu = pmu; + pr_info("%s performance monitor hardware support registered\n", + pmu->name); + + perf_pmu_register(&fsl_emb_pmu, "cpu", PERF_TYPE_RAW); + + return 0; +} diff -uprN linux-2.6.32/arch/powerpc/kernel/power4-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/power4-pmu.c --- linux-2.6.32/arch/powerpc/kernel/power4-pmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/power4-pmu.c 2015-03-10 13:23:24.000000000 -0700 @@ -602,6 +602,7 @@ static struct power_pmu power4_pmu = { .n_generic = ARRAY_SIZE(p4_generic_events), .generic_events = p4_generic_events, .cache_events = &power4_cache_events, + .flags = PPMU_NO_SIPR | PPMU_NO_CONT_SAMPLING, }; static int init_power4_pmu(void) @@ -613,4 +614,4 @@ static int init_power4_pmu(void) return register_power_pmu(&power4_pmu); } -arch_initcall(init_power4_pmu); +early_initcall(init_power4_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/power5-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/power5-pmu.c --- linux-2.6.32/arch/powerpc/kernel/power5-pmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/power5-pmu.c 2015-03-10 13:24:00.000000000 -0700 @@ -73,10 +73,6 @@ #define MMCR1_PMCSEL_MSK 0x7f /* - * Bits in MMCRA - */ - -/* * Layout of constraint bits: * 6666555555555544444444443333333333222222222211111111110000000000 * 3210987654321098765432109876543210987654321098765432109876543210 @@ -390,7 +386,7 @@ static int power5_compute_mmcr(u64 event unsigned int hwc[], unsigned long mmcr[]) { unsigned long mmcr1 = 0; - unsigned long mmcra = 0; + unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS; unsigned int pmc, unit, byte, psel; unsigned int ttm, grp; int i, isbus, bit, grsel; @@ -614,6 +610,7 @@ static struct power_pmu power5_pmu = { .n_generic = ARRAY_SIZE(power5_generic_events), .generic_events = power5_generic_events, .cache_events = &power5_cache_events, + .flags = PPMU_HAS_SSLOT, }; static int init_power5_pmu(void) @@ -625,4 +622,4 @@ static int init_power5_pmu(void) return register_power_pmu(&power5_pmu); } -arch_initcall(init_power5_pmu); +early_initcall(init_power5_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/power5+-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/power5+-pmu.c --- linux-2.6.32/arch/powerpc/kernel/power5+-pmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/power5+-pmu.c 2015-03-10 13:24:00.000000000 -0700 @@ -73,10 +73,6 @@ #define MMCR1_PMCSEL_MSK 0x7f /* - * Bits in MMCRA - */ - -/* * Layout of constraint bits: * 6666555555555544444444443333333333222222222211111111110000000000 * 3210987654321098765432109876543210987654321098765432109876543210 @@ -670,7 +666,7 @@ static struct power_pmu power5p_pmu = { .get_alternatives = power5p_get_alternatives, .disable_pmc = power5p_disable_pmc, .limited_pmc_event = power5p_limited_pmc_event, - .flags = PPMU_LIMITED_PMC5_6, + .flags = PPMU_LIMITED_PMC5_6 | PPMU_HAS_SSLOT, .n_generic = ARRAY_SIZE(power5p_generic_events), .generic_events = power5p_generic_events, .cache_events = &power5p_cache_events, @@ -686,4 +682,4 @@ static int init_power5p_pmu(void) return register_power_pmu(&power5p_pmu); } -arch_initcall(init_power5p_pmu); +early_initcall(init_power5p_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/power6-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/power6-pmu.c --- linux-2.6.32/arch/powerpc/kernel/power6-pmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/power6-pmu.c 2015-03-10 13:22:06.000000000 -0700 @@ -178,7 +178,7 @@ static int p6_compute_mmcr(u64 event[], unsigned int hwc[], unsigned long mmcr[]) { unsigned long mmcr1 = 0; - unsigned long mmcra = 0; + unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS; int i; unsigned int pmc, ev, b, u, s, psel; unsigned int ttmset = 0; @@ -544,4 +544,4 @@ static int init_power6_pmu(void) return register_power_pmu(&power6_pmu); } -arch_initcall(init_power6_pmu); +early_initcall(init_power6_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/power7-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/power7-pmu.c --- linux-2.6.32/arch/powerpc/kernel/power7-pmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/power7-pmu.c 2015-03-10 13:23:51.000000000 -0700 @@ -51,10 +51,6 @@ #define MMCR1_PMCSEL_MSK 0xff /* - * Bits in MMCRA - */ - -/* * Layout of constraint bits: * 6666555555555544444444443333333333222222222211111111110000000000 * 3210987654321098765432109876543210987654321098765432109876543210 @@ -230,7 +226,7 @@ static int power7_compute_mmcr(u64 event unsigned int hwc[], unsigned long mmcr[]) { unsigned long mmcr1 = 0; - unsigned long mmcra = 0; + unsigned long mmcra = MMCRA_SDAR_DCACHE_MISS | MMCRA_SDAR_ERAT_MISS; unsigned int pmc, unit, combine, l2sel, psel; unsigned int pmc_inuse = 0; int i; @@ -370,7 +366,10 @@ static int init_power7_pmu(void) strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power7")) return -ENODEV; + if (__is_processor(PVR_POWER7p)) + power7_pmu.flags |= PPMU_SIAR_VALID; + return register_power_pmu(&power7_pmu); } -arch_initcall(init_power7_pmu); +early_initcall(init_power7_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/power8-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/power8-pmu.c --- linux-2.6.32/arch/powerpc/kernel/power8-pmu.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/power8-pmu.c 2015-03-10 13:24:00.000000000 -0700 @@ -0,0 +1,497 @@ +/* + * Performance counter support for POWER8 processors. + * + * Copyright 2009 Paul Mackerras, IBM Corporation. + * Copyright 2013 Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + + +/* + * Some power8 event codes. + */ +#define PM_CYC 0x0001e +#define PM_GCT_NOSLOT_CYC 0x100f8 +#define PM_CMPLU_STALL 0x4000a +#define PM_INST_CMPL 0x00002 +#define PM_BRU_FIN 0x10068 +#define PM_BR_MPRED_CMPL 0x400f6 + + +/* + * Raw event encoding for POWER8: + * + * 60 56 52 48 44 40 36 32 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * [ thresh_cmp ] [ thresh_ctl ] + * | + * thresh start/stop OR FAB match -* + * + * 28 24 20 16 12 8 4 0 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * [ ] [ sample ] [cache] [ pmc ] [unit ] c m [ pmcxsel ] + * | | | | | + * | | | | *- mark + * | | *- L1/L2/L3 cache_sel | + * | | | + * | *- sampling mode for marked events *- combine + * | + * *- thresh_sel + * + * Below uses IBM bit numbering. + * + * MMCR1[x:y] = unit (PMCxUNIT) + * MMCR1[x] = combine (PMCxCOMB) + * + * if pmc == 3 and unit == 0 and pmcxsel[0:6] == 0b0101011 + * # PM_MRK_FAB_RSP_MATCH + * MMCR1[20:27] = thresh_ctl (FAB_CRESP_MATCH / FAB_TYPE_MATCH) + * else if pmc == 4 and unit == 0xf and pmcxsel[0:6] == 0b0101001 + * # PM_MRK_FAB_RSP_MATCH_CYC + * MMCR1[20:27] = thresh_ctl (FAB_CRESP_MATCH / FAB_TYPE_MATCH) + * else + * MMCRA[48:55] = thresh_ctl (THRESH START/END) + * + * if thresh_sel: + * MMCRA[45:47] = thresh_sel + * + * if thresh_cmp: + * MMCRA[22:24] = thresh_cmp[0:2] + * MMCRA[25:31] = thresh_cmp[3:9] + * + * if unit == 6 or unit == 7 + * MMCRC[53:55] = cache_sel[1:3] (L2EVENT_SEL) + * else if unit == 8 or unit == 9: + * if cache_sel[0] == 0: # L3 bank + * MMCRC[47:49] = cache_sel[1:3] (L3EVENT_SEL0) + * else if cache_sel[0] == 1: + * MMCRC[50:51] = cache_sel[2:3] (L3EVENT_SEL1) + * else if cache_sel[1]: # L1 event + * MMCR1[16] = cache_sel[2] + * MMCR1[17] = cache_sel[3] + * + * if mark: + * MMCRA[63] = 1 (SAMPLE_ENABLE) + * MMCRA[57:59] = sample[0:2] (RAND_SAMP_ELIG) + * MMCRA[61:62] = sample[3:4] (RAND_SAMP_MODE) + * + */ + +#define EVENT_THR_CMP_SHIFT 40 /* Threshold CMP value */ +#define EVENT_THR_CMP_MASK 0x3ff +#define EVENT_THR_CTL_SHIFT 32 /* Threshold control value (start/stop) */ +#define EVENT_THR_CTL_MASK 0xffull +#define EVENT_THR_SEL_SHIFT 29 /* Threshold select value */ +#define EVENT_THR_SEL_MASK 0x7 +#define EVENT_THRESH_SHIFT 29 /* All threshold bits */ +#define EVENT_THRESH_MASK 0x1fffffull +#define EVENT_SAMPLE_SHIFT 24 /* Sampling mode & eligibility */ +#define EVENT_SAMPLE_MASK 0x1f +#define EVENT_CACHE_SEL_SHIFT 20 /* L2/L3 cache select */ +#define EVENT_CACHE_SEL_MASK 0xf +#define EVENT_IS_L1 (4 << EVENT_CACHE_SEL_SHIFT) +#define EVENT_PMC_SHIFT 16 /* PMC number (1-based) */ +#define EVENT_PMC_MASK 0xf +#define EVENT_UNIT_SHIFT 12 /* Unit */ +#define EVENT_UNIT_MASK 0xf +#define EVENT_COMBINE_SHIFT 11 /* Combine bit */ +#define EVENT_COMBINE_MASK 0x1 +#define EVENT_MARKED_SHIFT 8 /* Marked bit */ +#define EVENT_MARKED_MASK 0x1 +#define EVENT_IS_MARKED (EVENT_MARKED_MASK << EVENT_MARKED_SHIFT) +#define EVENT_PSEL_MASK 0xff /* PMCxSEL value */ + +/* + * Layout of constraint bits: + * + * 60 56 52 48 44 40 36 32 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * [ fab_match ] [ thresh_cmp ] [ thresh_ctl ] [ ] + * | + * thresh_sel -* + * + * 28 24 20 16 12 8 4 0 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * [ ] [ sample ] [ ] [6] [5] [4] [3] [2] [1] + * | | + * L1 I/D qualifier -* | Count of events for each PMC. + * | p1, p2, p3, p4, p5, p6. + * nc - number of counters -* + * + * The PMC fields P1..P6, and NC, are adder fields. As we accumulate constraints + * we want the low bit of each field to be added to any existing value. + * + * Everything else is a value field. + */ + +#define CNST_FAB_MATCH_VAL(v) (((v) & EVENT_THR_CTL_MASK) << 56) +#define CNST_FAB_MATCH_MASK CNST_FAB_MATCH_VAL(EVENT_THR_CTL_MASK) + +/* We just throw all the threshold bits into the constraint */ +#define CNST_THRESH_VAL(v) (((v) & EVENT_THRESH_MASK) << 32) +#define CNST_THRESH_MASK CNST_THRESH_VAL(EVENT_THRESH_MASK) + +#define CNST_L1_QUAL_VAL(v) (((v) & 3) << 22) +#define CNST_L1_QUAL_MASK CNST_L1_QUAL_VAL(3) + +#define CNST_SAMPLE_VAL(v) (((v) & EVENT_SAMPLE_MASK) << 16) +#define CNST_SAMPLE_MASK CNST_SAMPLE_VAL(EVENT_SAMPLE_MASK) + +/* + * For NC we are counting up to 4 events. This requires three bits, and we need + * the fifth event to overflow and set the 4th bit. To achieve that we bias the + * fields by 3 in test_adder. + */ +#define CNST_NC_SHIFT 12 +#define CNST_NC_VAL (1 << CNST_NC_SHIFT) +#define CNST_NC_MASK (8 << CNST_NC_SHIFT) +#define POWER8_TEST_ADDER (3 << CNST_NC_SHIFT) + +/* + * For the per-PMC fields we have two bits. The low bit is added, so if two + * events ask for the same PMC the sum will overflow, setting the high bit, + * indicating an error. So our mask sets the high bit. + */ +#define CNST_PMC_SHIFT(pmc) ((pmc - 1) * 2) +#define CNST_PMC_VAL(pmc) (1 << CNST_PMC_SHIFT(pmc)) +#define CNST_PMC_MASK(pmc) (2 << CNST_PMC_SHIFT(pmc)) + +/* Our add_fields is defined as: */ +#define POWER8_ADD_FIELDS \ + CNST_PMC_VAL(1) | CNST_PMC_VAL(2) | CNST_PMC_VAL(3) | \ + CNST_PMC_VAL(4) | CNST_PMC_VAL(5) | CNST_PMC_VAL(6) | CNST_NC_VAL + + +/* Bits in MMCR1 for POWER8 */ +#define MMCR1_UNIT_SHIFT(pmc) (60 - (4 * ((pmc) - 1))) +#define MMCR1_COMBINE_SHIFT(pmc) (35 - ((pmc) - 1)) +#define MMCR1_PMCSEL_SHIFT(pmc) (24 - (((pmc) - 1)) * 8) +#define MMCR1_DC_QUAL_SHIFT 47 +#define MMCR1_IC_QUAL_SHIFT 46 + +/* Bits in MMCRA for POWER8 */ +#define MMCRA_SAMP_MODE_SHIFT 1 +#define MMCRA_SAMP_ELIG_SHIFT 4 +#define MMCRA_THR_CTL_SHIFT 8 +#define MMCRA_THR_SEL_SHIFT 16 +#define MMCRA_THR_CMP_SHIFT 32 +#define MMCRA_SDAR_MODE_TLB (1ull << 42) + + +static inline bool event_is_fab_match(u64 event) +{ + /* Only check pmc, unit and pmcxsel, ignore the edge bit (0) */ + event &= 0xff0fe; + + /* PM_MRK_FAB_RSP_MATCH & PM_MRK_FAB_RSP_MATCH_CYC */ + return (event == 0x30056 || event == 0x4f052); +} + +static int power8_get_constraint(u64 event, unsigned long *maskp, unsigned long *valp) +{ + unsigned int unit, pmc, cache; + unsigned long mask, value; + + mask = value = 0; + + pmc = (event >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK; + unit = (event >> EVENT_UNIT_SHIFT) & EVENT_UNIT_MASK; + cache = (event >> EVENT_CACHE_SEL_SHIFT) & EVENT_CACHE_SEL_MASK; + + if (pmc) { + if (pmc > 6) + return -1; + + mask |= CNST_PMC_MASK(pmc); + value |= CNST_PMC_VAL(pmc); + + if (pmc >= 5 && event != 0x500fa && event != 0x600f4) + return -1; + } + + if (pmc <= 4) { + /* + * Add to number of counters in use. Note this includes events with + * a PMC of 0 - they still need a PMC, it's just assigned later. + * Don't count events on PMC 5 & 6, there is only one valid event + * on each of those counters, and they are handled above. + */ + mask |= CNST_NC_MASK; + value |= CNST_NC_VAL; + } + + if (unit >= 6 && unit <= 9) { + /* + * L2/L3 events contain a cache selector field, which is + * supposed to be programmed into MMCRC. However MMCRC is only + * HV writable, and there is no API for guest kernels to modify + * it. The solution is for the hypervisor to initialise the + * field to zeroes, and for us to only ever allow events that + * have a cache selector of zero. + */ + if (cache) + return -1; + + } else if (event & EVENT_IS_L1) { + mask |= CNST_L1_QUAL_MASK; + value |= CNST_L1_QUAL_VAL(cache); + } + + if (event & EVENT_IS_MARKED) { + mask |= CNST_SAMPLE_MASK; + value |= CNST_SAMPLE_VAL(event >> EVENT_SAMPLE_SHIFT); + } + + /* + * Special case for PM_MRK_FAB_RSP_MATCH and PM_MRK_FAB_RSP_MATCH_CYC, + * the threshold control bits are used for the match value. + */ + if (event_is_fab_match(event)) { + mask |= CNST_FAB_MATCH_MASK; + value |= CNST_FAB_MATCH_VAL(event >> EVENT_THR_CTL_SHIFT); + } else { + /* + * Check the mantissa upper two bits are not zero, unless the + * exponent is also zero. See the THRESH_CMP_MANTISSA doc. + */ + unsigned int cmp, exp; + + cmp = (event >> EVENT_THR_CMP_SHIFT) & EVENT_THR_CMP_MASK; + exp = cmp >> 7; + + if (exp && (cmp & 0x60) == 0) + return -1; + + mask |= CNST_THRESH_MASK; + value |= CNST_THRESH_VAL(event >> EVENT_THRESH_SHIFT); + } + + *maskp = mask; + *valp = value; + + return 0; +} + +static int power8_compute_mmcr(u64 event[], int n_ev, + unsigned int hwc[], unsigned long mmcr[]) +{ + unsigned long mmcra, mmcr1, unit, combine, psel, cache, val; + unsigned int pmc, pmc_inuse; + int i; + + pmc_inuse = 0; + + /* First pass to count resource use */ + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK; + if (pmc) + pmc_inuse |= 1 << pmc; + } + + /* In continous sampling mode, update SDAR on TLB miss */ + mmcra = MMCRA_SDAR_MODE_TLB; + mmcr1 = 0; + + /* Second pass: assign PMCs, set all MMCR1 fields */ + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> EVENT_PMC_SHIFT) & EVENT_PMC_MASK; + unit = (event[i] >> EVENT_UNIT_SHIFT) & EVENT_UNIT_MASK; + combine = (event[i] >> EVENT_COMBINE_SHIFT) & EVENT_COMBINE_MASK; + psel = event[i] & EVENT_PSEL_MASK; + + if (!pmc) { + for (pmc = 1; pmc <= 4; ++pmc) { + if (!(pmc_inuse & (1 << pmc))) + break; + } + + pmc_inuse |= 1 << pmc; + } + + if (pmc <= 4) { + mmcr1 |= unit << MMCR1_UNIT_SHIFT(pmc); + mmcr1 |= combine << MMCR1_COMBINE_SHIFT(pmc); + mmcr1 |= psel << MMCR1_PMCSEL_SHIFT(pmc); + } + + if (event[i] & EVENT_IS_L1) { + cache = event[i] >> EVENT_CACHE_SEL_SHIFT; + mmcr1 |= (cache & 1) << MMCR1_IC_QUAL_SHIFT; + cache >>= 1; + mmcr1 |= (cache & 1) << MMCR1_DC_QUAL_SHIFT; + } + + if (event[i] & EVENT_IS_MARKED) { + mmcra |= MMCRA_SAMPLE_ENABLE; + + val = (event[i] >> EVENT_SAMPLE_SHIFT) & EVENT_SAMPLE_MASK; + if (val) { + mmcra |= (val & 3) << MMCRA_SAMP_MODE_SHIFT; + mmcra |= (val >> 2) << MMCRA_SAMP_ELIG_SHIFT; + } + } + + /* + * PM_MRK_FAB_RSP_MATCH and PM_MRK_FAB_RSP_MATCH_CYC, + * the threshold bits are used for the match value. + */ + if (event_is_fab_match(event[i])) { + mmcr1 |= (event[i] >> EVENT_THR_CTL_SHIFT) & + EVENT_THR_CTL_MASK; + } else { + val = (event[i] >> EVENT_THR_CTL_SHIFT) & EVENT_THR_CTL_MASK; + mmcra |= val << MMCRA_THR_CTL_SHIFT; + val = (event[i] >> EVENT_THR_SEL_SHIFT) & EVENT_THR_SEL_MASK; + mmcra |= val << MMCRA_THR_SEL_SHIFT; + val = (event[i] >> EVENT_THR_CMP_SHIFT) & EVENT_THR_CMP_MASK; + mmcra |= val << MMCRA_THR_CMP_SHIFT; + } + + hwc[i] = pmc - 1; + } + + /* Return MMCRx values */ + mmcr[0] = 0; + + /* pmc_inuse is 1-based */ + if (pmc_inuse & 2) + mmcr[0] = MMCR0_PMC1CE; + + if (pmc_inuse & 0x7c) + mmcr[0] |= MMCR0_PMCjCE; + + mmcr[1] = mmcr1; + mmcr[2] = mmcra; + + return 0; +} + +#define MAX_ALT 2 + +/* Table of alternatives, sorted by column 0 */ +static const unsigned int event_alternatives[][MAX_ALT] = { + { 0x10134, 0x301e2 }, /* PM_MRK_ST_CMPL */ + { 0x10138, 0x40138 }, /* PM_BR_MRK_2PATH */ + { 0x18082, 0x3e05e }, /* PM_L3_CO_MEPF */ + { 0x1d14e, 0x401e8 }, /* PM_MRK_DATA_FROM_L2MISS */ + { 0x1e054, 0x4000a }, /* PM_CMPLU_STALL */ + { 0x20036, 0x40036 }, /* PM_BR_2PATH */ + { 0x200f2, 0x300f2 }, /* PM_INST_DISP */ + { 0x200f4, 0x600f4 }, /* PM_RUN_CYC */ + { 0x2013c, 0x3012e }, /* PM_MRK_FILT_MATCH */ + { 0x3e054, 0x400f0 }, /* PM_LD_MISS_L1 */ + { 0x400fa, 0x500fa }, /* PM_RUN_INST_CMPL */ +}; + +/* + * Scan the alternatives table for a match and return the + * index into the alternatives table if found, else -1. + */ +static int find_alternative(u64 event) +{ + int i, j; + + for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { + if (event < event_alternatives[i][0]) + break; + + for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) + if (event == event_alternatives[i][j]) + return i; + } + + return -1; +} + +static int power8_get_alternatives(u64 event, unsigned int flags, u64 alt[]) +{ + int i, j, num_alt = 0; + u64 alt_event; + + alt[num_alt++] = event; + + i = find_alternative(event); + if (i >= 0) { + /* Filter out the original event, it's already in alt[0] */ + for (j = 0; j < MAX_ALT; ++j) { + alt_event = event_alternatives[i][j]; + if (alt_event && alt_event != event) + alt[num_alt++] = alt_event; + } + } + + if (flags & PPMU_ONLY_COUNT_RUN) { + /* + * We're only counting in RUN state, so PM_CYC is equivalent to + * PM_RUN_CYC and PM_INST_CMPL === PM_RUN_INST_CMPL. + */ + j = num_alt; + for (i = 0; i < num_alt; ++i) { + switch (alt[i]) { + case 0x1e: /* PM_CYC */ + alt[j++] = 0x600f4; /* PM_RUN_CYC */ + break; + case 0x600f4: /* PM_RUN_CYC */ + alt[j++] = 0x1e; + break; + case 0x2: /* PM_PPC_CMPL */ + alt[j++] = 0x500fa; /* PM_RUN_INST_CMPL */ + break; + case 0x500fa: /* PM_RUN_INST_CMPL */ + alt[j++] = 0x2; /* PM_PPC_CMPL */ + break; + } + } + num_alt = j; + } + + return num_alt; +} + +static void power8_disable_pmc(unsigned int pmc, unsigned long mmcr[]) +{ + if (pmc <= 3) + mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SHIFT(pmc + 1)); +} + +static int power8_generic_events[] = { + [PERF_COUNT_HW_CPU_CYCLES] = PM_CYC, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = PM_GCT_NOSLOT_CYC, + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = PM_CMPLU_STALL, + [PERF_COUNT_HW_INSTRUCTIONS] = PM_INST_CMPL, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = PM_BRU_FIN, + [PERF_COUNT_HW_BRANCH_MISSES] = PM_BR_MPRED_CMPL, +}; + +static struct power_pmu power8_pmu = { + .name = "POWER8", + .n_counter = 6, + .max_alternatives = MAX_ALT + 1, + .add_fields = POWER8_ADD_FIELDS, + .test_adder = POWER8_TEST_ADDER, + .compute_mmcr = power8_compute_mmcr, + .get_constraint = power8_get_constraint, + .get_alternatives = power8_get_alternatives, + .disable_pmc = power8_disable_pmc, + .flags = PPMU_HAS_SSLOT | PPMU_HAS_SIER, + .n_generic = ARRAY_SIZE(power8_generic_events), + .generic_events = power8_generic_events, +}; + +static int __init init_power8_pmu(void) +{ + if (!cur_cpu_spec->oprofile_cpu_type || + strcmp(cur_cpu_spec->oprofile_cpu_type, "ppc64/power8")) + return -ENODEV; + + return register_power_pmu(&power8_pmu); +} +early_initcall(init_power8_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/ppc970-pmu.c linux-2.6.32.ovz/arch/powerpc/kernel/ppc970-pmu.c --- linux-2.6.32/arch/powerpc/kernel/ppc970-pmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/ppc970-pmu.c 2015-03-10 13:23:24.000000000 -0700 @@ -84,10 +84,6 @@ static short mmcr1_adder_bits[8] = { }; /* - * Bits in MMCRA - */ - -/* * Layout of constraint bits: * 6666555555555544444444443333333333222222222211111111110000000000 * 3210987654321098765432109876543210987654321098765432109876543210 @@ -173,9 +169,11 @@ static int p970_marked_instr_event(u64 e switch (unit) { case PM_VPU: mask = 0x4c; /* byte 0 bits 2,3,6 */ + break; case PM_LSU0: /* byte 2 bits 0,2,3,4,6; all of byte 1 */ mask = 0x085dff00; + break; case PM_LSU1L: mask = 0x50 << 24; /* byte 3 bits 4,6 */ break; @@ -484,6 +482,7 @@ static struct power_pmu ppc970_pmu = { .n_generic = ARRAY_SIZE(ppc970_generic_events), .generic_events = ppc970_generic_events, .cache_events = &ppc970_cache_events, + .flags = PPMU_NO_SIPR | PPMU_NO_CONT_SAMPLING, }; static int init_ppc970_pmu(void) @@ -496,4 +495,4 @@ static int init_ppc970_pmu(void) return register_power_pmu(&ppc970_pmu); } -arch_initcall(init_ppc970_pmu); +early_initcall(init_ppc970_pmu); diff -uprN linux-2.6.32/arch/powerpc/kernel/ppc_ksyms.c linux-2.6.32.ovz/arch/powerpc/kernel/ppc_ksyms.c --- linux-2.6.32/arch/powerpc/kernel/ppc_ksyms.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/ppc_ksyms.c 2015-03-10 13:23:55.000000000 -0700 @@ -54,7 +54,6 @@ extern void single_step_exception(struct extern int sys_sigreturn(struct pt_regs *regs); EXPORT_SYMBOL(clear_pages); -EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(ISA_DMA_THRESHOLD); EXPORT_SYMBOL(DMA_MODE_READ); EXPORT_SYMBOL(DMA_MODE_WRITE); @@ -88,9 +87,7 @@ EXPORT_SYMBOL(__copy_tofrom_user); EXPORT_SYMBOL(__clear_user); EXPORT_SYMBOL(__strncpy_from_user); EXPORT_SYMBOL(__strnlen_user); -#ifdef CONFIG_PPC64 -EXPORT_SYMBOL(copy_4K_page); -#endif +EXPORT_SYMBOL(copy_page); #if defined(CONFIG_PCI) && defined(CONFIG_PPC32) EXPORT_SYMBOL(isa_io_base); diff -uprN linux-2.6.32/arch/powerpc/kernel/process.c linux-2.6.32.ovz/arch/powerpc/kernel/process.c --- linux-2.6.32/arch/powerpc/kernel/process.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/process.c 2015-03-10 13:24:12.000000000 -0700 @@ -402,7 +402,6 @@ struct task_struct *__switch_to(struct t account_system_vtime(current); account_process_vtime(current); - calculate_steal_time(); /* * We can't take a PMU exception inside _switch() since there is a @@ -554,18 +553,6 @@ void exit_thread(void) void flush_thread(void) { -#ifdef CONFIG_PPC64 - struct thread_info *t = current_thread_info(); - - if (test_ti_thread_flag(t, TIF_ABI_PENDING)) { - clear_ti_thread_flag(t, TIF_ABI_PENDING); - if (test_ti_thread_flag(t, TIF_32BIT)) - clear_ti_thread_flag(t, TIF_32BIT); - else - set_ti_thread_flag(t, TIF_32BIT); - } -#endif - discard_lazy_cpu_state(); if (current->thread.dabr) { @@ -922,7 +909,7 @@ int sys_execve(unsigned long a0, unsigne struct pt_regs *regs) { int error; - char *filename; + struct filename *filename; filename = getname((char __user *) a0); error = PTR_ERR(filename); @@ -931,7 +918,7 @@ int sys_execve(unsigned long a0, unsigne flush_fp_to_thread(current); flush_altivec_to_thread(current); flush_spe_to_thread(current); - error = do_execve(filename, (char __user * __user *) a1, + error = do_execve(filename->name, (char __user * __user *) a1, (char __user * __user *) a2, regs); putname(filename); out: @@ -1120,11 +1107,11 @@ void ppc64_runlatch_off(void) static struct kmem_cache *thread_info_cache; -struct thread_info *alloc_thread_info(struct task_struct *tsk) +struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { struct thread_info *ti; - ti = kmem_cache_alloc(thread_info_cache, GFP_KERNEL); + ti = kmem_cache_alloc_node(thread_info_cache, GFP_KERNEL, node); if (unlikely(ti == NULL)) return NULL; #ifdef CONFIG_DEBUG_STACK_USAGE @@ -1201,3 +1188,14 @@ unsigned long randomize_et_dyn(unsigned return ret; } + +#ifdef CONFIG_SMP +int arch_sd_sibling_asym_packing(void) +{ + if (cpu_has_feature(CPU_FTR_ASYM_SMT)) { + printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n"); + return SD_ASYM_PACKING; + } + return 0; +} +#endif diff -uprN linux-2.6.32/arch/powerpc/kernel/prom.c linux-2.6.32.ovz/arch/powerpc/kernel/prom.c --- linux-2.6.32/arch/powerpc/kernel/prom.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/prom.c 2015-03-10 13:23:48.000000000 -0700 @@ -51,8 +51,8 @@ #include #include #include -#include #include +#include #include #ifdef DEBUG @@ -426,7 +426,7 @@ static int __init early_parse_mem(char * return 1; memory_limit = PAGE_ALIGN(memparse(p, &p)); - DBG("memory limit = 0x%llx\n", (unsigned long long)memory_limit); + DBG("memory limit = 0x%llx\n", memory_limit); return 0; } @@ -1077,87 +1077,6 @@ static void __init early_reserve_mem(voi } } -#ifdef CONFIG_PHYP_DUMP -/** - * phyp_dump_calculate_reserve_size() - reserve variable boot area 5% or arg - * - * Function to find the largest size we need to reserve - * during early boot process. - * - * It either looks for boot param and returns that OR - * returns larger of 256 or 5% rounded down to multiples of 256MB. - * - */ -static inline unsigned long phyp_dump_calculate_reserve_size(void) -{ - unsigned long tmp; - - if (phyp_dump_info->reserve_bootvar) - return phyp_dump_info->reserve_bootvar; - - /* divide by 20 to get 5% of value */ - tmp = lmb_end_of_DRAM(); - do_div(tmp, 20); - - /* round it down in multiples of 256 */ - tmp = tmp & ~0x0FFFFFFFUL; - - return (tmp > PHYP_DUMP_RMR_END ? tmp : PHYP_DUMP_RMR_END); -} - -/** - * phyp_dump_reserve_mem() - reserve all not-yet-dumped mmemory - * - * This routine may reserve memory regions in the kernel only - * if the system is supported and a dump was taken in last - * boot instance or if the hardware is supported and the - * scratch area needs to be setup. In other instances it returns - * without reserving anything. The memory in case of dump being - * active is freed when the dump is collected (by userland tools). - */ -static void __init phyp_dump_reserve_mem(void) -{ - unsigned long base, size; - unsigned long variable_reserve_size; - - if (!phyp_dump_info->phyp_dump_configured) { - printk(KERN_ERR "Phyp-dump not supported on this hardware\n"); - return; - } - - if (!phyp_dump_info->phyp_dump_at_boot) { - printk(KERN_INFO "Phyp-dump disabled at boot time\n"); - return; - } - - variable_reserve_size = phyp_dump_calculate_reserve_size(); - - if (phyp_dump_info->phyp_dump_is_active) { - /* Reserve *everything* above RMR.Area freed by userland tools*/ - base = variable_reserve_size; - size = lmb_end_of_DRAM() - base; - - /* XXX crashed_ram_end is wrong, since it may be beyond - * the memory_limit, it will need to be adjusted. */ - lmb_reserve(base, size); - - phyp_dump_info->init_reserve_start = base; - phyp_dump_info->init_reserve_size = size; - } else { - size = phyp_dump_info->cpu_state_size + - phyp_dump_info->hpte_region_size + - variable_reserve_size; - base = lmb_end_of_DRAM() - size; - lmb_reserve(base, size); - phyp_dump_info->init_reserve_start = base; - phyp_dump_info->init_reserve_size = size; - } -} -#else -static inline void __init phyp_dump_reserve_mem(void) {} -#endif /* CONFIG_PHYP_DUMP && CONFIG_PPC_RTAS */ - - void __init early_init_devtree(void *params) { phys_addr_t limit; @@ -1172,9 +1091,9 @@ void __init early_init_devtree(void *par of_scan_flat_dt(early_init_dt_scan_rtas, NULL); #endif -#ifdef CONFIG_PHYP_DUMP - /* scan tree to see if dump occured during last boot */ - of_scan_flat_dt(early_init_dt_scan_phyp_dump, NULL); +#ifdef CONFIG_FA_DUMP + /* scan tree to see if dump is active during last boot */ + of_scan_flat_dt(early_init_dt_scan_fw_dump, NULL); #endif /* Retrieve various informations from the /chosen node of the @@ -1198,9 +1117,15 @@ void __init early_init_devtree(void *par if (PHYSICAL_START > MEMORY_START) lmb_reserve(MEMORY_START, 0x8000); reserve_kdump_trampoline(); - reserve_crashkernel(); +#ifdef CONFIG_FA_DUMP + /* + * If we fail to reserve memory for firmware-assisted dump then + * fallback to kexec based kdump. + */ + if (fadump_reserve_mem() == 0) +#endif + reserve_crashkernel(); early_reserve_mem(); - phyp_dump_reserve_mem(); limit = memory_limit; if (! limit) { diff -uprN linux-2.6.32/arch/powerpc/kernel/prom_init.c linux-2.6.32.ovz/arch/powerpc/kernel/prom_init.c --- linux-2.6.32/arch/powerpc/kernel/prom_init.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/prom_init.c 2015-03-10 13:23:55.000000000 -0700 @@ -635,6 +635,9 @@ static void __init early_cmdline_parse(v #define OV3_VMX 0x40 /* VMX/Altivec */ #define OV3_DFP 0x20 /* decimal FP */ +/* Option vector 4: IBM PAPR implementation */ +#define OV4_MIN_ENT_CAP 0x01 /* minimum VP entitled capacity */ + /* Option vector 5: PAPR/OF options supported */ #define OV5_LPAR 0x80 /* logical partitioning supported */ #define OV5_SPLPAR 0x40 /* shared-processor LPAR supported */ @@ -650,9 +653,17 @@ static void __init early_cmdline_parse(v #endif /* CONFIG_PCI_MSI */ #ifdef CONFIG_PPC_SMLPAR #define OV5_CMO 0x80 /* Cooperative Memory Overcommitment */ +#define OV5_XCMO 0x40 /* Page Coalescing */ #else #define OV5_CMO 0x00 +#define OV5_XCMO 0x00 #endif +#define OV5_TYPE1_AFFINITY 0x80 /* Type 1 NUMA affinity */ +#define OV5_PFO_HW_RNG 0x80 /* PFO Random Number Generator */ +#define OV5_PFO_HW_ENCR 0x20 /* PFO Encryption Accelerator */ + +/* Option Vector 6: IBM PAPR hints */ +#define OV6_LINUX 0x02 /* Linux is our OS */ /* * The architecture vector has an array of PVR mask/value pairs, @@ -665,7 +676,7 @@ static unsigned char ibm_architecture_ve W(0xffffffff), W(0x0f000003), /* all 2.06-compliant */ W(0xffffffff), W(0x0f000002), /* all 2.05-compliant */ W(0xfffffffe), W(0x0f000001), /* all 2.04-compliant and earlier */ - 5 - 1, /* 5 option vectors */ + 6 - 1, /* 6 option vectors */ /* option vector 1: processor architectures supported */ 3 - 2, /* length */ @@ -693,16 +704,39 @@ static unsigned char ibm_architecture_ve OV3_FP | OV3_VMX | OV3_DFP, /* option vector 4: IBM PAPR implementation */ - 2 - 2, /* length */ + 3 - 2, /* length */ 0, /* don't halt */ + OV4_MIN_ENT_CAP, /* minimum VP entitled capacity */ /* option vector 5: PAPR/OF options */ - 5 - 2, /* length */ + 18 - 2, /* length */ 0, /* don't ignore, don't halt */ OV5_LPAR | OV5_SPLPAR | OV5_LARGE_PAGES | OV5_DRCONF_MEMORY | OV5_DONATE_DEDICATE_CPU | OV5_MSI, 0, - OV5_CMO, + OV5_CMO | OV5_XCMO, + OV5_TYPE1_AFFINITY, + 0, + 0, + 0, + /* WARNING: The offset of the "number of cores" field below + * must match by the macro below. Update the definition if + * the structure layout changes. + */ +#define IBM_ARCH_VEC_NRCORES_OFFSET 101 + W(NR_CPUS), /* number of cores supported */ + 0, + 0, + 0, + 0, + OV5_PFO_HW_RNG | OV5_PFO_HW_ENCR, + + /* option vector 6: IBM PAPR hints */ + 4 - 2, /* length */ + 0, + 0, + OV6_LINUX, + }; /* Old method - ELF header with PT_NOTE sections */ @@ -792,13 +826,70 @@ static struct fake_elf { } }; +static int __init prom_count_smt_threads(void) +{ + phandle node; + char type[64]; + unsigned int plen; + + /* Pick up th first CPU node we can find */ + for (node = 0; prom_next_node(&node); ) { + type[0] = 0; + prom_getprop(node, "device_type", type, sizeof(type)); + + if (strcmp(type, RELOC("cpu"))) + continue; + /* + * There is an entry for each smt thread, each entry being + * 4 bytes long. All cpus should have the same number of + * smt threads, so return after finding the first. + */ + plen = prom_getproplen(node, "ibm,ppc-interrupt-server#s"); + if (plen == PROM_ERROR) + break; + plen >>= 2; + prom_debug("Found 0x%x smt threads per core\n", (unsigned long)plen); + + /* Sanity check */ + if (plen < 1 || plen > 64) { + prom_printf("Threads per core 0x%x out of bounds, assuming 1\n", + (unsigned long)plen); + return 1; + } + return plen; + } + prom_debug("No threads found, assuming 1 per core\n"); + + return 1; + +} + + static void __init prom_send_capabilities(void) { ihandle elfloader, root; prom_arg_t ret; + u32 *cores; root = call_prom("open", 1, 1, ADDR("/")); if (root != 0) { + /* We need to tell the FW about the number of cores we support. + * + * To do that, we count the number of threads on the first core + * (we assume this is the same for all cores) and use it to + * divide NR_CPUS. + */ + cores = (u32 *)PTRRELOC(&ibm_architecture_vec[IBM_ARCH_VEC_NRCORES_OFFSET]); + if (*cores != NR_CPUS) { + prom_printf("WARNING ! " + "ibm_architecture_vec structure inconsistent: 0x%x !\n", + *cores); + } else { + *cores = NR_CPUS / prom_count_smt_threads(); + prom_printf("Max number of cores passed to firmware: 0x%x\n", + (unsigned long)*cores); + } + /* try calling the ibm,client-architecture-support method */ prom_printf("Calling ibm,client-architecture-support..."); if (call_prom_ret("call-method", 3, 2, &ret, @@ -890,7 +981,7 @@ static unsigned long __init alloc_up(uns } if (addr == 0) return 0; - RELOC(alloc_bottom) = addr; + RELOC(alloc_bottom) = addr + size; prom_debug(" -> %x\n", addr); prom_debug(" alloc_bottom : %x\n", RELOC(alloc_bottom)); @@ -1704,7 +1795,7 @@ static void __init *make_room(unsigned l chunk = alloc_up(room, 0); if (chunk == 0) prom_panic("No memory for flatten_device_tree (claim failed)"); - *mem_end = RELOC(alloc_top); + *mem_end = chunk + room; } ret = (void *)*mem_start; @@ -1923,7 +2014,7 @@ static void __init flatten_device_tree(v mem_start = (unsigned long)alloc_up(room, PAGE_SIZE); if (mem_start == 0) prom_panic("Can't allocate initial device-tree chunk\n"); - mem_end = RELOC(alloc_top); + mem_end = mem_start + room; /* Get root of tree */ root = call_prom("peer", 1, 1, (phandle)0); diff -uprN linux-2.6.32/arch/powerpc/kernel/ptrace.c linux-2.6.32.ovz/arch/powerpc/kernel/ptrace.c --- linux-2.6.32/arch/powerpc/kernel/ptrace.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/ptrace.c 2015-03-10 13:22:23.000000000 -0700 @@ -124,12 +124,16 @@ static int gpr_get(struct task_struct *t unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - int ret; + int i, ret; if (target->thread.regs == NULL) return -EIO; - CHECK_FULL_REGS(target->thread.regs); + if (!FULL_REGS(target->thread.regs)) { + /* We have a partial register set. Fill 14-31 with bogus values */ + for (i = 14; i < 32; i++) + target->thread.regs->gpr[i] = NV_REG_POISON; + } ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, target->thread.regs, @@ -536,11 +540,16 @@ static int gpr32_get(struct task_struct compat_ulong_t *k = kbuf; compat_ulong_t __user *u = ubuf; compat_ulong_t reg; + int i; if (target->thread.regs == NULL) return -EIO; - CHECK_FULL_REGS(target->thread.regs); + if (!FULL_REGS(target->thread.regs)) { + /* We have a partial register set. Fill 14-31 with bogus values */ + for (i = 14; i < 32; i++) + target->thread.regs->gpr[i] = NV_REG_POISON; + } pos /= sizeof(reg); count /= sizeof(reg); @@ -1076,9 +1085,7 @@ void do_syscall_trace_leave(struct pt_re { int step; - if (unlikely(current->audit_context)) - audit_syscall_exit((regs->ccr&0x10000000)?AUDITSC_FAILURE:AUDITSC_SUCCESS, - regs->result); + audit_syscall_exit(regs); step = test_thread_flag(TIF_SINGLESTEP); if (step || test_thread_flag(TIF_SYSCALL_TRACE)) diff -uprN linux-2.6.32/arch/powerpc/kernel/rtas.c linux-2.6.32.ovz/arch/powerpc/kernel/rtas.c --- linux-2.6.32/arch/powerpc/kernel/rtas.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/rtas.c 2015-03-10 13:24:05.000000000 -0700 @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -40,20 +41,14 @@ #include #include #include +#include +#include struct rtas_t rtas = { .lock = __RAW_SPIN_LOCK_UNLOCKED }; EXPORT_SYMBOL(rtas); -struct rtas_suspend_me_data { - atomic_t working; /* number of cpus accessing this struct */ - atomic_t done; - int token; /* ibm,suspend-me */ - int error; - struct completion *complete; /* wait on this until working == 0 */ -}; - DEFINE_SPINLOCK(rtas_data_buf_lock); EXPORT_SYMBOL(rtas_data_buf_lock); @@ -500,7 +495,7 @@ unsigned int rtas_busy_delay(int status) might_sleep(); ms = rtas_busy_delay_time(status); - if (ms) + if (ms && need_resched()) msleep(ms); return ms; @@ -690,10 +685,14 @@ void rtas_os_term(char *str) { int status; - if (panic_timeout) - return; - - if (RTAS_UNKNOWN_SERVICE == rtas_token("ibm,os-term")) + /* + * Firmware with the ibm,extended-os-term property is guaranteed + * to always return from an ibm,os-term call. Earlier versions without + * this property may terminate the partition which we want to avoid + * since it interferes with panic_timeout. + */ + if (RTAS_UNKNOWN_SERVICE == rtas_token("ibm,os-term") || + RTAS_UNKNOWN_SERVICE == rtas_token("ibm,extended-os-term")) return; snprintf(rtas_os_term_buf, 2048, "OS panic: %s", str); @@ -704,20 +703,64 @@ void rtas_os_term(char *str) } while (rtas_busy_delay(status)); if (status != 0) - printk(KERN_EMERG "ibm,os-term call failed %d\n", - status); + printk(KERN_EMERG "ibm,os-term call failed %d\n", status); } static int ibm_suspend_me_token = RTAS_UNKNOWN_SERVICE; #ifdef CONFIG_PPC_PSERIES -static void rtas_percpu_suspend_me(void *info) +static int __rtas_suspend_last_cpu(struct rtas_suspend_me_data *data, int wake_when_done) +{ + u16 slb_size = mmu_slb_size; + int rc = H_MULTI_THREADS_ACTIVE; + int cpu; + + slb_set_size(SLB_MIN_SIZE); + printk(KERN_DEBUG "calling ibm,suspend-me on cpu %i\n", smp_processor_id()); + + while (rc == H_MULTI_THREADS_ACTIVE && !data->done) { + rc = rtas_call(data->token, 0, 1, NULL); + if (rc && rc != H_MULTI_THREADS_ACTIVE) + printk(KERN_DEBUG "ibm,suspend-me returned %d\n", rc); + } + + smp_rmb(); + if (rc || data->error) + slb_set_size(slb_size); + + if (data->error) + rc = data->error; + + data->error = rc; + + if (wake_when_done) { + smp_wmb(); + data->done = 1; + + /* Ensure data->done is seen on all CPUs that are about to wake up + as a result of the H_PROD below */ + mb(); + + for_each_online_cpu(cpu) + plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu)); + } + + if (atomic_dec_return(&data->working) == 0) + complete(data->complete); + + return rc; +} + +int rtas_suspend_last_cpu(struct rtas_suspend_me_data *data) +{ + atomic_inc(&data->working); + return __rtas_suspend_last_cpu(data, 0); +} + +static int __rtas_suspend_cpu(struct rtas_suspend_me_data *data, int wake_when_done) { long rc = H_SUCCESS; unsigned long msr_save; - u16 slb_size = mmu_slb_size; int cpu; - struct rtas_suspend_me_data *data = - (struct rtas_suspend_me_data *)info; atomic_inc(&data->working); @@ -725,7 +768,7 @@ static void rtas_percpu_suspend_me(void msr_save = mfmsr(); mtmsr(msr_save & ~(MSR_EE)); - while (rc == H_SUCCESS && !atomic_read(&data->done)) + while (rc == H_SUCCESS && !data->done) rc = plpar_hcall_norets(H_JOIN); mtmsr(msr_save); @@ -737,34 +780,132 @@ static void rtas_percpu_suspend_me(void /* All other cpus are in H_JOIN, this cpu does * the suspend. */ - slb_set_size(SLB_MIN_SIZE); - printk(KERN_DEBUG "calling ibm,suspend-me on cpu %i\n", - smp_processor_id()); - data->error = rtas_call(data->token, 0, 1, NULL); - - if (data->error) { - printk(KERN_DEBUG "ibm,suspend-me returned %d\n", - data->error); - slb_set_size(slb_size); - } - } else { - printk(KERN_ERR "H_JOIN on cpu %i failed with rc = %ld\n", - smp_processor_id(), rc); - data->error = rc; + return __rtas_suspend_last_cpu(data, wake_when_done); } - atomic_set(&data->done, 1); - - /* This cpu did the suspend or got an error; in either case, - * we need to prod all other other cpus out of join state. - * Extra prods are harmless. - */ - for_each_online_cpu(cpu) - plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu)); + printk(KERN_ERR "H_JOIN on cpu %i failed with rc = %ld\n", + smp_processor_id(), rc); + data->error = rc; + + if (wake_when_done) { + smp_wmb(); + data->done = 1; + /* Ensure data->done is seen on all CPUs that are about to wake up + as a result of the H_PROD below */ + mb(); + pSeries_coalesce_init(); + + /* This cpu did the suspend or got an error; in either case, + * we need to prod all other other cpus out of join state. + * Extra prods are harmless. + */ + for_each_online_cpu(cpu) + plpar_hcall_norets(H_PROD, get_hard_smp_processor_id(cpu)); + } out: if (atomic_dec_return(&data->working) == 0) complete(data->complete); + return rc; +} + +int rtas_suspend_cpu(struct rtas_suspend_me_data *data) +{ + return __rtas_suspend_cpu(data, 0); +} + +static void rtas_percpu_suspend_me(void *info) +{ + __rtas_suspend_cpu((struct rtas_suspend_me_data *)info, 1); +} + +enum rtas_cpu_state { + DOWN, + UP, +}; + +#ifndef CONFIG_SMP +static int rtas_cpu_state_change_mask(enum rtas_cpu_state state, + cpumask_var_t cpus) +{ + if (!cpumask_empty(cpus)) { + cpumask_clear(cpus); + return -EINVAL; + } else + return 0; +} +#else +/* On return cpumask will be altered to indicate CPUs changed. + * CPUs with states changed will be set in the mask, + * CPUs with status unchanged will be unset in the mask. */ +static int rtas_cpu_state_change_mask(enum rtas_cpu_state state, + cpumask_var_t cpus) +{ + int cpu; + int cpuret = 0; + int ret = 0; + + if (cpumask_empty(cpus)) + return 0; + + for_each_cpu(cpu, cpus) { + switch (state) { + case DOWN: + cpuret = cpu_down(cpu); + break; + case UP: + cpuret = cpu_up(cpu); + break; + } + if (cpuret) { + pr_debug("%s: cpu_%s for cpu#%d returned %d.\n", + __func__, + ((state == UP) ? "up" : "down"), + cpu, cpuret); + if (!ret) + ret = cpuret; + if (state == UP) { + /* clear bits for unchanged cpus, return */ + cpumask_shift_right(cpus, cpus, cpu); + cpumask_shift_left(cpus, cpus, cpu); + break; + } else { + /* clear bit for unchanged cpu, continue */ + cpumask_clear_cpu(cpu, cpus); + } + } + } + + return ret; } +#endif + +int rtas_online_cpus_mask(cpumask_var_t cpus) +{ + int ret; + + ret = rtas_cpu_state_change_mask(UP, cpus); + + if (ret) { + cpumask_var_t tmp_mask; + + if (!alloc_cpumask_var(&tmp_mask, GFP_TEMPORARY)) + return ret; + + /* Use tmp_mask to preserve cpus mask from first failure */ + cpumask_copy(tmp_mask, cpus); + rtas_offline_cpus_mask(tmp_mask); + free_cpumask_var(tmp_mask); + } + + return ret; +} +EXPORT_SYMBOL(rtas_online_cpus_mask); + +int rtas_offline_cpus_mask(cpumask_var_t cpus) +{ + return rtas_cpu_state_change_mask(DOWN, cpus); +} +EXPORT_SYMBOL(rtas_offline_cpus_mask); static int rtas_ibm_suspend_me(struct rtas_args *args) { @@ -773,6 +914,8 @@ static int rtas_ibm_suspend_me(struct rt unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; struct rtas_suspend_me_data data; DECLARE_COMPLETION_ONSTACK(done); + cpumask_var_t offline_mask; + int cpuret; if (!rtas_service_present("ibm,suspend-me")) return -ENOSYS; @@ -796,12 +939,26 @@ static int rtas_ibm_suspend_me(struct rt return 0; } + if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY)) + return -ENOMEM; + atomic_set(&data.working, 0); - atomic_set(&data.done, 0); - data.token = rtas_token("ibm,suspend-me"); + data.done = 0; data.error = 0; + data.token = rtas_token("ibm,suspend-me"); data.complete = &done; + /* All present CPUs must be online */ + cpumask_andnot(offline_mask, cpu_present_mask, cpu_online_mask); + cpuret = rtas_online_cpus_mask(offline_mask); + if (cpuret) { + pr_err("%s: Could not bring present CPUs online.\n", __func__); + data.error = cpuret; + goto out; + } + + stop_topology_update(); + /* Call function on all CPUs. One of us will make the * rtas call */ @@ -813,6 +970,16 @@ static int rtas_ibm_suspend_me(struct rt if (data.error != 0) printk(KERN_ERR "Error doing global join\n"); + start_topology_update(); + + /* Take down CPUs not online prior to suspend */ + cpuret = rtas_offline_cpus_mask(offline_mask); + if (cpuret) + pr_warn("%s: Could not restore CPUs to offline state.\n", + __func__); + +out: + free_cpumask_var(offline_mask); return data.error; } #else /* CONFIG_PPC_PSERIES */ @@ -822,6 +989,40 @@ static int rtas_ibm_suspend_me(struct rt } #endif +/** + * Find a specific pseries error log in an RTAS extended event log. + * @log: RTAS error/event log + * @section_id: two character section identifier + * + * Returns a pointer to the specified errorlog or NULL if not found. + */ +struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log, + uint16_t section_id) +{ + struct rtas_ext_event_log_v6 *ext_log = + (struct rtas_ext_event_log_v6 *)log->buffer; + struct pseries_errorlog *sect; + unsigned char *p, *log_end; + + /* Check that we understand the format */ + if (log->extended_log_length < sizeof(struct rtas_ext_event_log_v6) || + ext_log->log_format != RTAS_V6EXT_LOG_FORMAT_EVENT_LOG || + ext_log->company_id != RTAS_V6EXT_COMPANY_ID_IBM) + return NULL; + + log_end = log->buffer + log->extended_log_length; + p = ext_log->vendor_log; + + while (p < log_end) { + sect = (struct pseries_errorlog *)p; + if (sect->id == section_id) + return sect; + p += sect->length; + } + + return NULL; +} + asmlinkage int ppc_rtas(struct rtas_args __user *uargs) { struct rtas_args args; diff -uprN linux-2.6.32/arch/powerpc/kernel/rtas_flash.c linux-2.6.32.ovz/arch/powerpc/kernel/rtas_flash.c --- linux-2.6.32/arch/powerpc/kernel/rtas_flash.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/rtas_flash.c 2015-03-10 13:23:55.000000000 -0700 @@ -56,13 +56,31 @@ #define VALIDATE_READY -1001 /* Firmware image ready for validation */ #define VALIDATE_PARAM_ERR -3 /* RTAS Parameter Error */ #define VALIDATE_HW_ERR -1 /* RTAS Hardware Error */ -#define VALIDATE_TMP_UPDATE 0 /* Validate Return Status */ -#define VALIDATE_FLASH_AUTH 1 /* Validate Return Status */ -#define VALIDATE_INVALID_IMG 2 /* Validate Return Status */ -#define VALIDATE_CUR_UNKNOWN 3 /* Validate Return Status */ -#define VALIDATE_TMP_COMMIT_DL 4 /* Validate Return Status */ -#define VALIDATE_TMP_COMMIT 5 /* Validate Return Status */ -#define VALIDATE_TMP_UPDATE_DL 6 /* Validate Return Status */ + +/* ibm,validate-flash-image update result tokens */ +#define VALIDATE_TMP_UPDATE 0 /* T side will be updated */ +#define VALIDATE_FLASH_AUTH 1 /* Partition does not have authority */ +#define VALIDATE_INVALID_IMG 2 /* Candidate image is not valid */ +#define VALIDATE_CUR_UNKNOWN 3 /* Current fixpack level is unknown */ +/* + * Current T side will be committed to P side before being replace with new + * image, and the new image is downlevel from current image + */ +#define VALIDATE_TMP_COMMIT_DL 4 +/* + * Current T side will be committed to P side before being replaced with new + * image + */ +#define VALIDATE_TMP_COMMIT 5 +/* + * T side will be updated with a downlevel image + */ +#define VALIDATE_TMP_UPDATE_DL 6 +/* + * The candidate image's release date is later than the system's firmware + * service entitlement date - service warranty period has expired + */ +#define VALIDATE_OUT_OF_WRNTY 7 /* ibm,manage-flash-image operation tokens */ #define RTAS_REJECT_TMP_IMG 0 @@ -70,6 +88,7 @@ /* Array sizes */ #define VALIDATE_BUF_SIZE 4096 +#define VALIDATE_MSG_LEN 256 #define RTAS_MSG_MAXLEN 64 /* Quirk - RTAS requires 4k list length and block size */ @@ -93,12 +112,8 @@ struct flash_block_list { struct flash_block_list *next; struct flash_block blocks[FLASH_BLOCKS_PER_NODE]; }; -struct flash_block_list_header { /* just the header of flash_block_list */ - unsigned long num_blocks; - struct flash_block_list *next; -}; -static struct flash_block_list_header rtas_firmware_flash_list = {0, NULL}; +static struct flash_block_list *rtas_firmware_flash_list; /* Use slab cache to guarantee 4k alignment */ static struct kmem_cache *flash_block_cache = NULL; @@ -107,13 +122,14 @@ static struct kmem_cache *flash_block_ca /* Local copy of the flash block list. * We only allow one open of the flash proc file and create this - * list as we go. This list will be put in the - * rtas_firmware_flash_list var once it is fully read. + * list as we go. The rtas_firmware_flash_list varable will be + * set once the data is fully read. * * For convenience as we build the list we use virtual addrs, * we do not fill in the version number, and the length field * is treated as the number of entries currently in the block - * (i.e. not a byte count). This is all fixed on release. + * (i.e. not a byte count). This is all fixed when calling + * the flash routine. */ /* Status int must be first member of struct */ @@ -200,16 +216,16 @@ static int rtas_flash_release(struct ino if (uf->flist) { /* File was opened in write mode for a new flash attempt */ /* Clear saved list */ - if (rtas_firmware_flash_list.next) { - free_flash_list(rtas_firmware_flash_list.next); - rtas_firmware_flash_list.next = NULL; + if (rtas_firmware_flash_list) { + free_flash_list(rtas_firmware_flash_list); + rtas_firmware_flash_list = NULL; } if (uf->status != FLASH_AUTH) uf->status = flash_list_valid(uf->flist); if (uf->status == FLASH_IMG_READY) - rtas_firmware_flash_list.next = uf->flist; + rtas_firmware_flash_list = uf->flist; else free_flash_list(uf->flist); @@ -285,12 +301,6 @@ static ssize_t rtas_flash_read(struct fi return msglen; } -/* constructor for flash_block_cache */ -void rtas_block_ctor(void *ptr) -{ - memset(ptr, 0, RTAS_BLK_SIZE); -} - /* We could be much more efficient here. But to keep this function * simple we allocate a page to the block list no matter how small the * count is. If the system is low on memory it will be just as well @@ -315,7 +325,7 @@ static ssize_t rtas_flash_write(struct f * proc file */ if (uf->flist == NULL) { - uf->flist = kmem_cache_alloc(flash_block_cache, GFP_KERNEL); + uf->flist = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL); if (!uf->flist) return -ENOMEM; } @@ -326,7 +336,7 @@ static ssize_t rtas_flash_write(struct f next_free = fl->num_blocks; if (next_free == FLASH_BLOCKS_PER_NODE) { /* Need to allocate another block_list */ - fl->next = kmem_cache_alloc(flash_block_cache, GFP_KERNEL); + fl->next = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL); if (!fl->next) return -ENOMEM; fl = fl->next; @@ -335,7 +345,7 @@ static ssize_t rtas_flash_write(struct f if (count > RTAS_BLK_SIZE) count = RTAS_BLK_SIZE; - p = kmem_cache_alloc(flash_block_cache, GFP_KERNEL); + p = kmem_cache_zalloc(flash_block_cache, GFP_KERNEL); if (!p) return -ENOMEM; @@ -474,7 +484,7 @@ static void validate_flash(struct rtas_v } static int get_validate_flash_msg(struct rtas_validate_flash_t *args_buf, - char *msg) + char *msg, int msglen) { int n; @@ -482,7 +492,8 @@ static int get_validate_flash_msg(struct n = sprintf(msg, "%d\n", args_buf->update_results); if ((args_buf->update_results >= VALIDATE_CUR_UNKNOWN) || (args_buf->update_results == VALIDATE_TMP_UPDATE)) - n += sprintf(msg + n, "%s\n", args_buf->buf); + n += snprintf(msg + n, msglen - n, "%s\n", + args_buf->buf); } else { n = sprintf(msg, "%d\n", args_buf->status); } @@ -494,7 +505,7 @@ static ssize_t validate_flash_read(struc { struct proc_dir_entry *dp = PDE(file->f_path.dentry->d_inode); struct rtas_validate_flash_t *args_buf; - char msg[RTAS_MSG_MAXLEN]; + char msg[VALIDATE_MSG_LEN]; int msglen; args_buf = (struct rtas_validate_flash_t *) dp->data; @@ -502,7 +513,7 @@ static ssize_t validate_flash_read(struc if (ppos && *ppos != 0) return 0; /* be cheap */ - msglen = get_validate_flash_msg(args_buf, msg); + msglen = get_validate_flash_msg(args_buf, msg, VALIDATE_MSG_LEN); if (msglen > count) msglen = count; @@ -592,7 +603,7 @@ static void rtas_flash_firmware(int rebo unsigned long rtas_block_list; int i, status, update_token; - if (rtas_firmware_flash_list.next == NULL) + if (rtas_firmware_flash_list == NULL) return; /* nothing to do */ if (reboot_type != SYS_RESTART) { @@ -609,20 +620,31 @@ static void rtas_flash_firmware(int rebo return; } - /* NOTE: the "first" block list is a global var with no data - * blocks in the kernel data segment. We do this because - * we want to ensure this block_list addr is under 4GB. + /* + * Just before starting the firmware flash, cancel the event scan work + * to avoid any soft lockup issues. */ - rtas_firmware_flash_list.num_blocks = 0; - flist = (struct flash_block_list *)&rtas_firmware_flash_list; + rtas_cancel_event_scan(); + + /* + * NOTE: the "first" block must be under 4GB, so we create + * an entry with no data blocks in the reserved buffer in + * the kernel data segment. + */ + spin_lock(&rtas_data_buf_lock); + flist = (struct flash_block_list *)&rtas_data_buf[0]; + flist->num_blocks = 0; + flist->next = rtas_firmware_flash_list; rtas_block_list = virt_to_abs(flist); if (rtas_block_list >= 4UL*1024*1024*1024) { printk(KERN_ALERT "FLASH: kernel bug...flash list header addr above 4GB\n"); + spin_unlock(&rtas_data_buf_lock); return; } printk(KERN_ALERT "FLASH: preparing saved firmware image for flash\n"); /* Update the block_list in place. */ + rtas_firmware_flash_list = NULL; /* too hard to backout on error */ image_size = 0; for (f = flist; f; f = next) { /* Translate data addrs to absolute */ @@ -663,6 +685,7 @@ static void rtas_flash_firmware(int rebo printk(KERN_ALERT "FLASH: unknown flash return code %d\n", status); break; } + spin_unlock(&rtas_data_buf_lock); } static void remove_flash_pde(struct proc_dir_entry *dp) @@ -798,7 +821,7 @@ static int __init rtas_flash_init(void) flash_block_cache = kmem_cache_create("rtas_flash_cache", RTAS_BLK_SIZE, RTAS_BLK_SIZE, 0, - rtas_block_ctor); + NULL); if (!flash_block_cache) { printk(KERN_ERR "%s: failed to create block cache\n", __func__); @@ -820,6 +843,11 @@ static void __exit rtas_flash_cleanup(vo { rtas_flash_term_hook = NULL; + if (rtas_firmware_flash_list) { + free_flash_list(rtas_firmware_flash_list); + rtas_firmware_flash_list = NULL; + } + if (flash_block_cache) kmem_cache_destroy(flash_block_cache); diff -uprN linux-2.6.32/arch/powerpc/kernel/rtas_pci.c linux-2.6.32.ovz/arch/powerpc/kernel/rtas_pci.c --- linux-2.6.32/arch/powerpc/kernel/rtas_pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/rtas_pci.c 2015-03-10 13:23:53.000000000 -0700 @@ -276,7 +276,7 @@ void __init find_and_init_phbs(void) pci_devs_phb_init(); /* - * pci_probe_only and pci_assign_all_buses can be set via properties + * PCI_PROBE_ONLY and PCI_REASSIGN_ALL_BUS can be set via properties * in chosen. */ if (of_chosen) { @@ -284,14 +284,18 @@ void __init find_and_init_phbs(void) prop = of_get_property(of_chosen, "linux,pci-probe-only", NULL); - if (prop) - pci_probe_only = *prop; + if (prop) { + if (*prop) + pci_add_flags(PCI_PROBE_ONLY); + else + pci_clear_flags(PCI_PROBE_ONLY); + } #ifdef CONFIG_PPC32 /* Will be made generic soon */ prop = of_get_property(of_chosen, "linux,pci-assign-all-buses", NULL); if (prop && *prop) - ppc_pci_flags |= PPC_PCI_REASSIGN_ALL_BUS; + pci_add_flags(PCI_REASSIGN_ALL_BUS); #endif /* CONFIG_PPC32 */ } } diff -uprN linux-2.6.32/arch/powerpc/kernel/setup_64.c linux-2.6.32.ovz/arch/powerpc/kernel/setup_64.c --- linux-2.6.32/arch/powerpc/kernel/setup_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/setup_64.c 2015-03-10 13:22:14.000000000 -0700 @@ -96,7 +96,7 @@ int ucache_bsize; #ifdef CONFIG_SMP -static int smt_enabled_cmdline; +static char *smt_enabled_cmdline = NULL; /* Look for ibm,smt-enabled OF option */ static void check_smt_enabled(void) @@ -104,37 +104,46 @@ static void check_smt_enabled(void) struct device_node *dn; const char *smt_option; - /* Allow the command line to overrule the OF option */ - if (smt_enabled_cmdline) - return; - - dn = of_find_node_by_path("/options"); + /* Default to enabling all threads */ + smt_enabled_at_boot = threads_per_core; - if (dn) { - smt_option = of_get_property(dn, "ibm,smt-enabled", NULL); + /* Allow the command line to overrule the OF option */ + if (smt_enabled_cmdline) { + if (!strcmp(smt_enabled_cmdline, "on")) + smt_enabled_at_boot = threads_per_core; + else if (!strcmp(smt_enabled_cmdline, "off")) + smt_enabled_at_boot = 0; + else { + long smt; + int rc; + + rc = strict_strtol(smt_enabled_cmdline, 10, &smt); + if (!rc) + smt_enabled_at_boot = + min(threads_per_core, (int)smt); + } + } else { + dn = of_find_node_by_path("/options"); + if (dn) { + smt_option = of_get_property(dn, "ibm,smt-enabled", + NULL); + + if (smt_option) { + if (!strcmp(smt_option, "on")) + smt_enabled_at_boot = threads_per_core; + else if (!strcmp(smt_option, "off")) + smt_enabled_at_boot = 0; + } - if (smt_option) { - if (!strcmp(smt_option, "on")) - smt_enabled_at_boot = 1; - else if (!strcmp(smt_option, "off")) - smt_enabled_at_boot = 0; - } - } + of_node_put(dn); + } + } } /* Look for smt-enabled= cmdline option */ static int __init early_smt_enabled(char *p) { - smt_enabled_cmdline = 1; - - if (!p) - return 0; - - if (!strcmp(p, "on") || !strcmp(p, "1")) - smt_enabled_at_boot = 1; - else if (!strcmp(p, "off") || !strcmp(p, "0")) - smt_enabled_at_boot = 0; - + smt_enabled_cmdline = p; return 0; } early_param("smt-enabled", early_smt_enabled); @@ -398,8 +407,8 @@ void __init setup_system(void) */ xmon_setup(); - check_smt_enabled(); smp_setup_cpu_maps(); + check_smt_enabled(); #ifdef CONFIG_SMP /* Release secondary cpus out of their spinloops at 0x60 now that @@ -432,9 +441,18 @@ void __init setup_system(void) DBG(" <- setup_system()\n"); } +static u64 slb0_limit(void) +{ + if (cpu_has_feature(CPU_FTR_1T_SEGMENT)) { + return 1UL << SID_SHIFT_1T; + } + return 1UL << SID_SHIFT; +} + #ifdef CONFIG_IRQSTACKS static void __init irqstack_early_init(void) { + u64 limit = slb0_limit(); unsigned int i; /* @@ -444,10 +462,10 @@ static void __init irqstack_early_init(v for_each_possible_cpu(i) { softirq_ctx[i] = (struct thread_info *) __va(lmb_alloc_base(THREAD_SIZE, - THREAD_SIZE, 0x10000000)); + THREAD_SIZE, limit)); hardirq_ctx[i] = (struct thread_info *) __va(lmb_alloc_base(THREAD_SIZE, - THREAD_SIZE, 0x10000000)); + THREAD_SIZE, limit)); } } #else @@ -478,7 +496,7 @@ static void __init exc_lvl_early_init(vo */ static void __init emergency_stack_init(void) { - unsigned long limit; + u64 limit; unsigned int i; /* @@ -490,7 +508,7 @@ static void __init emergency_stack_init( * bringup, we need to get at them in real mode. This means they * must also be within the RMO region. */ - limit = min(0x10000000ULL, lmb.rmo_size); + limit = min(slb0_limit(), lmb.rmo_size); for_each_possible_cpu(i) { unsigned long sp; diff -uprN linux-2.6.32/arch/powerpc/kernel/setup-common.c linux-2.6.32.ovz/arch/powerpc/kernel/setup-common.c --- linux-2.6.32/arch/powerpc/kernel/setup-common.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/setup-common.c 2015-03-10 13:23:44.000000000 -0700 @@ -60,6 +60,7 @@ #include #include #include +#include #include "setup.h" @@ -102,6 +103,14 @@ EXPORT_SYMBOL(ppc_do_canonicalize_irqs); /* also used by kexec */ void machine_shutdown(void) { +#ifdef CONFIG_FA_DUMP + /* + * if fadump is active, cleanup the fadump registration before we + * shutdown. + */ + fadump_cleanup(); +#endif + if (ppc_md.machine_shutdown) ppc_md.machine_shutdown(); } @@ -603,6 +612,11 @@ EXPORT_SYMBOL(check_legacy_ioport); static int ppc_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { + /* + * If firmware-assisted dump has been registered then trigger + * firmware-assisted dump and let firmware handle everything else. + */ + crash_fadump(NULL, ptr); ppc_md.panic(ptr); /* May not return */ return NOTIFY_DONE; } diff -uprN linux-2.6.32/arch/powerpc/kernel/signal_32.c linux-2.6.32.ovz/arch/powerpc/kernel/signal_32.c --- linux-2.6.32/arch/powerpc/kernel/signal_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/signal_32.c 2015-03-10 13:24:04.000000000 -0700 @@ -444,6 +444,12 @@ static int save_user_regs(struct pt_regs #endif /* CONFIG_ALTIVEC */ if (copy_fpr_to_user(&frame->mc_fregs, current)) return 1; + + /* + * Clear the MSR VSX bit to indicate there is no valid state attached + * to this context, except in the specific case below where we set it. + */ + msr &= ~MSR_VSX; #ifdef CONFIG_VSX /* * Copy VSR 0-31 upper half from thread_struct to local diff -uprN linux-2.6.32/arch/powerpc/kernel/signal_64.c linux-2.6.32.ovz/arch/powerpc/kernel/signal_64.c --- linux-2.6.32/arch/powerpc/kernel/signal_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/signal_64.c 2015-03-10 13:24:04.000000000 -0700 @@ -116,6 +116,12 @@ static long setup_sigcontext(struct sigc flush_fp_to_thread(current); /* copy fpr regs and fpscr */ err |= copy_fpr_to_user(&sc->fp_regs, current); + + /* + * Clear the MSR VSX bit to indicate there is no valid state attached + * to this context, except in the specific case below where we set it. + */ + msr &= ~MSR_VSX; #ifdef CONFIG_VSX /* * Copy VSX low doubleword to local buffer for formatting, diff -uprN linux-2.6.32/arch/powerpc/kernel/signal.c linux-2.6.32.ovz/arch/powerpc/kernel/signal.c --- linux-2.6.32/arch/powerpc/kernel/signal.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/signal.c 2015-03-10 13:23:18.000000000 -0700 @@ -196,6 +196,8 @@ void do_signal(struct pt_regs *regs, uns if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } diff -uprN linux-2.6.32/arch/powerpc/kernel/smp.c linux-2.6.32.ovz/arch/powerpc/kernel/smp.c --- linux-2.6.32/arch/powerpc/kernel/smp.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/smp.c 2015-03-10 13:23:56.000000000 -0700 @@ -49,6 +49,7 @@ #ifdef CONFIG_PPC64 #include #endif +#include #ifdef DEBUG #include @@ -98,6 +99,7 @@ void smp_message_recv(int msg) break; case PPC_MSG_RESCHEDULE: /* we notice need_resched on exit */ + scheduler_ipi(); break; case PPC_MSG_CALL_FUNC_SINGLE: generic_smp_call_function_single_interrupt(); @@ -127,6 +129,7 @@ static irqreturn_t call_function_action( static irqreturn_t reschedule_action(int irq, void *data) { + scheduler_ipi(); /* we just need the return path side effect of checking need_resched */ return IRQ_HANDLED; } @@ -501,11 +504,8 @@ int __devinit start_secondary(void *unus if (smp_ops->take_timebase) smp_ops->take_timebase(); - if (system_state > SYSTEM_BOOTING) - snapshot_timebase(); - secondary_cpu_time_init(); - + vdso_getcpu_init(); ipi_call_lock(); notify_cpu_starting(cpu); set_cpu_online(cpu, true); @@ -565,8 +565,6 @@ void __init smp_cpus_done(unsigned int m set_cpus_allowed(current, old_mask); - snapshot_timebases(); - dump_numa_cpu_topology(); } diff -uprN linux-2.6.32/arch/powerpc/kernel/syscalls.c linux-2.6.32.ovz/arch/powerpc/kernel/syscalls.c --- linux-2.6.32/arch/powerpc/kernel/syscalls.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/syscalls.c 2015-03-10 13:21:11.000000000 -0700 @@ -140,7 +140,6 @@ static inline unsigned long do_mmap2(uns unsigned long prot, unsigned long flags, unsigned long fd, unsigned long off, int shift) { - struct file * file = NULL; unsigned long ret = -EINVAL; if (!arch_validate_prot(prot)) @@ -151,20 +150,8 @@ static inline unsigned long do_mmap2(uns goto out; off >>= shift; } - - ret = -EBADF; - if (!(flags & MAP_ANONYMOUS)) { - if (!(file = fget(fd))) - goto out; - } - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - down_write(¤t->mm->mmap_sem); - ret = do_mmap_pgoff(file, addr, len, prot, flags, off); - up_write(¤t->mm->mmap_sem); - if (file) - fput(file); + ret = sys_mmap_pgoff(addr, len, prot, flags, fd, off); out: return ret; } diff -uprN linux-2.6.32/arch/powerpc/kernel/sysfs.c linux-2.6.32.ovz/arch/powerpc/kernel/sysfs.c --- linux-2.6.32/arch/powerpc/kernel/sysfs.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/sysfs.c 2015-03-10 13:21:09.000000000 -0700 @@ -461,6 +461,25 @@ static void unregister_cpu_online(unsign cacheinfo_cpu_offline(cpu); } + +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE +ssize_t arch_cpu_probe(const char *buf, size_t count) +{ + if (ppc_md.cpu_probe) + return ppc_md.cpu_probe(buf, count); + + return -EINVAL; +} + +ssize_t arch_cpu_release(const char *buf, size_t count) +{ + if (ppc_md.cpu_release) + return ppc_md.cpu_release(buf, count); + + return -EINVAL; +} +#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ + #endif /* CONFIG_HOTPLUG_CPU */ static int __cpuinit sysfs_cpu_notify(struct notifier_block *self, diff -uprN linux-2.6.32/arch/powerpc/kernel/sys_ppc32.c linux-2.6.32.ovz/arch/powerpc/kernel/sys_ppc32.c --- linux-2.6.32/arch/powerpc/kernel/sys_ppc32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/sys_ppc32.c 2015-03-10 13:23:54.000000000 -0700 @@ -191,7 +191,7 @@ long compat_sys_execve(unsigned long a0, struct pt_regs *regs) { int error; - char * filename; + struct filename *filename; filename = getname((char __user *) a0); error = PTR_ERR(filename); @@ -200,7 +200,7 @@ long compat_sys_execve(unsigned long a0, flush_fp_to_thread(current); flush_altivec_to_thread(current); - error = compat_do_execve(filename, compat_ptr(a1), compat_ptr(a2), regs); + error = compat_do_execve(filename->name, compat_ptr(a1), compat_ptr(a2), regs); putname(filename); diff -uprN linux-2.6.32/arch/powerpc/kernel/time.c linux-2.6.32.ovz/arch/powerpc/kernel/time.c --- linux-2.6.32/arch/powerpc/kernel/time.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/time.c 2015-03-10 13:23:19.000000000 -0700 @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include @@ -186,6 +186,8 @@ u64 __cputime_jiffies_factor; EXPORT_SYMBOL(__cputime_jiffies_factor); u64 __cputime_msec_factor; EXPORT_SYMBOL(__cputime_msec_factor); +u64 __cputime_usec_factor; +EXPORT_SYMBOL(__cputime_usec_factor); u64 __cputime_sec_factor; EXPORT_SYMBOL(__cputime_sec_factor); u64 __cputime_clockt_factor; @@ -195,14 +197,16 @@ DEFINE_PER_CPU(unsigned long, cputime_sc cputime_t cputime_one_jiffy; +void (*dtl_consumer)(struct dtl_entry *, u64); + static void calc_cputime_factors(void) { struct div_result res; div128_by_32(HZ, 0, tb_ticks_per_sec, &res); __cputime_jiffies_factor = res.result_low; - div128_by_32(1000, 0, tb_ticks_per_sec, &res); - __cputime_msec_factor = res.result_low; + div128_by_32(1000000, 0, tb_ticks_per_sec, &res); + __cputime_usec_factor = res.result_low; div128_by_32(1, 0, tb_ticks_per_sec, &res); __cputime_sec_factor = res.result_low; div128_by_32(USER_HZ, 0, tb_ticks_per_sec, &res); @@ -210,62 +214,156 @@ static void calc_cputime_factors(void) } /* - * Read the PURR on systems that have it, otherwise the timebase. + * Read the SPURR on systems that have it, otherwise the PURR, + * or if that doesn't exist return the timebase value passed in. */ -static u64 read_purr(void) +static u64 read_spurr(u64 tb) { + if (cpu_has_feature(CPU_FTR_SPURR)) + return mfspr(SPRN_SPURR); if (cpu_has_feature(CPU_FTR_PURR)) return mfspr(SPRN_PURR); - return mftb(); + return tb; } +#ifdef CONFIG_PPC_SPLPAR + /* - * Read the SPURR on systems that have it, otherwise the purr + * Scan the dispatch trace log and count up the stolen time. + * Should be called with interrupts disabled. */ -static u64 read_spurr(u64 purr) +static u64 scan_dispatch_log(u64 stop_tb) { - /* - * cpus without PURR won't have a SPURR - * We already know the former when we use this, so tell gcc - */ - if (cpu_has_feature(CPU_FTR_PURR) && cpu_has_feature(CPU_FTR_SPURR)) - return mfspr(SPRN_SPURR); - return purr; + u64 i = local_paca->dtl_ridx; + struct dtl_entry *dtl = local_paca->dtl_curr; + struct dtl_entry *dtl_end = local_paca->dispatch_log_end; + struct lppaca *vpa = local_paca->lppaca_ptr; + u64 tb_delta; + u64 stolen = 0; + u64 dtb; + + if (!dtl) + return 0; + + if (i == vpa->dtl_idx) + return 0; + while (i < vpa->dtl_idx) { + if (dtl_consumer) + dtl_consumer(dtl, i); + dtb = dtl->timebase; + tb_delta = dtl->enqueue_to_dispatch_time + + dtl->ready_to_enqueue_time; + barrier(); + if (i + N_DISPATCH_LOG < vpa->dtl_idx) { + /* buffer has overflowed */ + i = vpa->dtl_idx - N_DISPATCH_LOG; + dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG); + continue; + } + if (dtb > stop_tb) + break; + stolen += tb_delta; + ++i; + ++dtl; + if (dtl == dtl_end) + dtl = local_paca->dispatch_log; + } + local_paca->dtl_ridx = i; + local_paca->dtl_curr = dtl; + return stolen; } /* + * Accumulate stolen time by scanning the dispatch trace log. + * Called on entry from user mode. + */ +void accumulate_stolen_time(void) +{ + u64 sst, ust; + + sst = scan_dispatch_log(get_paca()->starttime_user); + ust = scan_dispatch_log(get_paca()->starttime); + get_paca()->system_time -= sst; + get_paca()->user_time -= ust; + get_paca()->stolen_time += ust + sst; +} + +static inline u64 calculate_stolen_time(u64 stop_tb) +{ + u64 stolen = 0; + + if (get_paca()->dtl_ridx != get_paca()->lppaca_ptr->dtl_idx) { + stolen = scan_dispatch_log(stop_tb); + get_paca()->system_time -= stolen; + } + + stolen += get_paca()->stolen_time; + get_paca()->stolen_time = 0; + return stolen; +} + +#else /* CONFIG_PPC_SPLPAR */ +static inline u64 calculate_stolen_time(u64 stop_tb) +{ + return 0; +} + +#endif /* CONFIG_PPC_SPLPAR */ + +/* * Account time for a transition between system, hard irq * or soft irq state. */ void account_system_vtime(struct task_struct *tsk) { - u64 now, nowscaled, delta, deltascaled, sys_time; + u64 now, nowscaled, delta, deltascaled; unsigned long flags; + u64 stolen, udelta, sys_scaled, user_scaled; local_irq_save(flags); - now = read_purr(); + now = mftb(); nowscaled = read_spurr(now); - delta = now - get_paca()->startpurr; + get_paca()->system_time += now - get_paca()->starttime; + get_paca()->starttime = now; deltascaled = nowscaled - get_paca()->startspurr; - get_paca()->startpurr = now; get_paca()->startspurr = nowscaled; - if (!in_interrupt()) { - /* deltascaled includes both user and system time. - * Hence scale it based on the purr ratio to estimate - * the system time */ - sys_time = get_paca()->system_time; - if (get_paca()->user_time) - deltascaled = deltascaled * sys_time / - (sys_time + get_paca()->user_time); - delta += sys_time; - get_paca()->system_time = 0; + + stolen = calculate_stolen_time(now); + + delta = get_paca()->system_time; + get_paca()->system_time = 0; + udelta = get_paca()->user_time - get_paca()->utime_sspurr; + get_paca()->utime_sspurr = get_paca()->user_time; + + /* + * Because we don't read the SPURR on every kernel entry/exit, + * deltascaled includes both user and system SPURR ticks. + * Apportion these ticks to system SPURR ticks and user + * SPURR ticks in the same ratio as the system time (delta) + * and user time (udelta) values obtained from the timebase + * over the same interval. The system ticks get accounted here; + * the user ticks get saved up in paca->user_time_scaled to be + * used by account_process_tick. + */ + sys_scaled = delta; + user_scaled = udelta; + if (deltascaled != delta + udelta) { + if (udelta) { + sys_scaled = deltascaled * delta / (delta + udelta); + user_scaled = deltascaled - sys_scaled; + } else { + sys_scaled = deltascaled; + } + } + get_paca()->user_time_scaled += user_scaled; + + if (in_interrupt() || idle_task(smp_processor_id()) != tsk) { + account_system_time(tsk, 0, delta, sys_scaled); + if (stolen) + account_steal_time(stolen); + } else { + account_idle_time(delta + stolen); } - if (in_irq() || idle_task(smp_processor_id()) != tsk) - account_system_time(tsk, 0, delta, deltascaled); - else - account_idle_time(delta); - per_cpu(cputime_last_delta, smp_processor_id()) = delta; - per_cpu(cputime_scaled_last_delta, smp_processor_id()) = deltascaled; local_irq_restore(flags); } @@ -274,125 +372,26 @@ void account_system_vtime(struct task_st * by the exception entry and exit code to the generic process * user and system time records. * Must be called with interrupts disabled. + * Assumes that account_system_vtime() has been called recently + * (i.e. since the last entry from usermode) so that + * get_paca()->user_time_scaled is up to date. */ void account_process_tick(struct task_struct *tsk, int user_tick) { cputime_t utime, utimescaled; utime = get_paca()->user_time; + utimescaled = get_paca()->user_time_scaled; get_paca()->user_time = 0; - utimescaled = cputime_to_scaled(utime); + get_paca()->user_time_scaled = 0; + get_paca()->utime_sspurr = 0; account_user_time(tsk, utime, utimescaled); } -/* - * Stuff for accounting stolen time. - */ -struct cpu_purr_data { - int initialized; /* thread is running */ - u64 tb; /* last TB value read */ - u64 purr; /* last PURR value read */ - u64 spurr; /* last SPURR value read */ -}; - -/* - * Each entry in the cpu_purr_data array is manipulated only by its - * "owner" cpu -- usually in the timer interrupt but also occasionally - * in process context for cpu online. As long as cpus do not touch - * each others' cpu_purr_data, disabling local interrupts is - * sufficient to serialize accesses. - */ -static DEFINE_PER_CPU(struct cpu_purr_data, cpu_purr_data); - -static void snapshot_tb_and_purr(void *data) -{ - unsigned long flags; - struct cpu_purr_data *p = &__get_cpu_var(cpu_purr_data); - - local_irq_save(flags); - p->tb = get_tb_or_rtc(); - p->purr = mfspr(SPRN_PURR); - wmb(); - p->initialized = 1; - local_irq_restore(flags); -} - -/* - * Called during boot when all cpus have come up. - */ -void snapshot_timebases(void) -{ - if (!cpu_has_feature(CPU_FTR_PURR)) - return; - on_each_cpu(snapshot_tb_and_purr, NULL, 1); -} - -/* - * Must be called with interrupts disabled. - */ -void calculate_steal_time(void) -{ - u64 tb, purr; - s64 stolen; - struct cpu_purr_data *pme; - - pme = &__get_cpu_var(cpu_purr_data); - if (!pme->initialized) - return; /* !CPU_FTR_PURR or early in early boot */ - tb = mftb(); - purr = mfspr(SPRN_PURR); - stolen = (tb - pme->tb) - (purr - pme->purr); - if (stolen > 0) { - if (idle_task(smp_processor_id()) != current) - account_steal_time(stolen); - else - account_idle_time(stolen); - } - pme->tb = tb; - pme->purr = purr; -} - -#ifdef CONFIG_PPC_SPLPAR -/* - * Must be called before the cpu is added to the online map when - * a cpu is being brought up at runtime. - */ -static void snapshot_purr(void) -{ - struct cpu_purr_data *pme; - unsigned long flags; - - if (!cpu_has_feature(CPU_FTR_PURR)) - return; - local_irq_save(flags); - pme = &__get_cpu_var(cpu_purr_data); - pme->tb = mftb(); - pme->purr = mfspr(SPRN_PURR); - pme->initialized = 1; - local_irq_restore(flags); -} - -#endif /* CONFIG_PPC_SPLPAR */ - #else /* ! CONFIG_VIRT_CPU_ACCOUNTING */ #define calc_cputime_factors() -#define calculate_steal_time() do { } while (0) #endif -#if !(defined(CONFIG_VIRT_CPU_ACCOUNTING) && defined(CONFIG_PPC_SPLPAR)) -#define snapshot_purr() do { } while (0) -#endif - -/* - * Called when a cpu comes up after the system has finished booting, - * i.e. as a result of a hotplug cpu action. - */ -void snapshot_timebase(void) -{ - __get_cpu_var(last_jiffy) = get_tb_or_rtc(); - snapshot_purr(); -} - void __delay(unsigned long loops) { unsigned long start; @@ -422,7 +421,9 @@ void udelay(unsigned long usecs) EXPORT_SYMBOL(udelay); static inline void update_gtod(u64 new_tb_stamp, u64 new_stamp_xsec, - u64 new_tb_to_xs) + u64 new_tb_to_xs, struct timespec *now, + struct timespec *wall_time, + u32 frac_sec) { /* * tb_update_count is used to allow the userspace gettimeofday code @@ -438,9 +439,10 @@ static inline void update_gtod(u64 new_t vdso_data->tb_orig_stamp = new_tb_stamp; vdso_data->stamp_xsec = new_stamp_xsec; vdso_data->tb_to_xs = new_tb_to_xs; - vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec; - vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec; - vdso_data->stamp_xtime = xtime; + vdso_data->wtom_clock_sec = now->tv_sec; + vdso_data->wtom_clock_nsec = now->tv_nsec; + vdso_data->stamp_xtime = *wall_time; + vdso_data->stamp_sec_fraction = frac_sec; smp_wmb(); ++(vdso_data->tb_update_count); } @@ -530,25 +532,60 @@ void __init iSeries_time_init_early(void } #endif /* CONFIG_PPC_ISERIES */ -#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_PPC32) -DEFINE_PER_CPU(u8, perf_event_pending); +#ifdef CONFIG_IRQ_WORK -void set_perf_event_pending(void) +/* + * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable... + */ +#ifdef CONFIG_PPC64 +static inline unsigned long test_irq_work_pending(void) { - get_cpu_var(perf_event_pending) = 1; - set_dec(1); - put_cpu_var(perf_event_pending); + unsigned long x; + + asm volatile("lbz %0,%1(13)" + : "=r" (x) + : "i" (offsetof(struct paca_struct, irq_work_pending))); + return x; } -#define test_perf_event_pending() __get_cpu_var(perf_event_pending) -#define clear_perf_event_pending() __get_cpu_var(perf_event_pending) = 0 +static inline void set_irq_work_pending_flag(void) +{ + asm volatile("stb %0,%1(13)" : : + "r" (1), + "i" (offsetof(struct paca_struct, irq_work_pending))); +} -#else /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */ +static inline void clear_irq_work_pending(void) +{ + asm volatile("stb %0,%1(13)" : : + "r" (0), + "i" (offsetof(struct paca_struct, irq_work_pending))); +} -#define test_perf_event_pending() 0 -#define clear_perf_event_pending() +#else /* 32-bit */ -#endif /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */ +DEFINE_PER_CPU(u8, irq_work_pending); + +#define set_irq_work_pending_flag() __get_cpu_var(irq_work_pending) = 1 +#define test_irq_work_pending() __get_cpu_var(irq_work_pending) +#define clear_irq_work_pending() __get_cpu_var(irq_work_pending) = 0 + +#endif /* 32 vs 64 bit */ + +void set_irq_work_pending(void) +{ + preempt_disable(); + set_irq_work_pending_flag(); + set_dec(1); + preempt_enable(); +} + +#else /* CONFIG_IRQ_WORK */ + +#define test_irq_work_pending() 0 +#define clear_irq_work_pending() + +#endif /* CONFIG_IRQ_WORK */ /* * For iSeries shared processors, we have to let the hypervisor @@ -576,10 +613,6 @@ void timer_interrupt(struct pt_regs * re set_dec(DECREMENTER_MAX); #ifdef CONFIG_PPC32 - if (test_perf_event_pending()) { - clear_perf_event_pending(); - perf_event_do_pending(); - } if (atomic_read(&ppc_n_lost_interrupts) != 0) do_IRQ(regs); #endif @@ -595,7 +628,10 @@ void timer_interrupt(struct pt_regs * re old_regs = set_irq_regs(regs); irq_enter(); - calculate_steal_time(); + if (test_irq_work_pending()) { + clear_irq_work_pending(); + irq_work_run(); + } #ifdef CONFIG_PPC_ISERIES if (firmware_has_feature(FW_FEATURE_ISERIES)) @@ -828,9 +864,11 @@ static cycle_t timebase_read(struct cloc return (cycle_t)get_tb(); } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) +void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult) { u64 t2x, stamp_xsec; + u32 frac_sec; if (clock != &clocksource_timebase) return; @@ -841,11 +879,18 @@ void update_vsyscall(struct timespec *wa /* XXX this assumes clock->shift == 22 */ /* 4611686018 ~= 2^(20+64-22) / 1e9 */ - t2x = (u64) clock->mult * 4611686018ULL; - stamp_xsec = (u64) xtime.tv_nsec * XSEC_PER_SEC; + t2x = (u64) mult * 4611686018ULL; + stamp_xsec = (u64) wall_time->tv_nsec * XSEC_PER_SEC; do_div(stamp_xsec, 1000000000); - stamp_xsec += (u64) xtime.tv_sec * XSEC_PER_SEC; - update_gtod(clock->cycle_last, stamp_xsec, t2x); + stamp_xsec += (u64) wall_time->tv_sec * XSEC_PER_SEC; + + if (wall_time->tv_nsec >= 1000000000) { + ++wall_time->tv_sec; + wall_time->tv_nsec -= 1000000000; + } + /* this is tv_nsec / 1e9 as a 0.32 fraction */ + frac_sec = ((u64) wall_time->tv_nsec * 18446744073ULL) >> 32; + update_gtod(clock->cycle_last, stamp_xsec, t2x, wtm, wall_time, frac_sec); } void update_vsyscall_tz(void) @@ -1032,8 +1077,6 @@ void __init time_init(void) /* Save the current timebase to pretty up CONFIG_PRINTK_TIME */ boot_tb = get_tb_or_rtc(); - write_seqlock_irqsave(&xtime_lock, flags); - /* If platform provided a timezone (pmac), we correct the time */ if (timezone_offset) { sys_tz.tz_minuteswest = -timezone_offset / 60; @@ -1043,11 +1086,9 @@ void __init time_init(void) vdso_data->tb_orig_stamp = tb_last_jiffy; vdso_data->tb_update_count = 0; vdso_data->tb_ticks_per_sec = tb_ticks_per_sec; - vdso_data->stamp_xsec = (u64) xtime.tv_sec * XSEC_PER_SEC; + vdso_data->stamp_xsec = (u64) get_seconds() * XSEC_PER_SEC; vdso_data->tb_to_xs = tb_to_xs; - write_sequnlock_irqrestore(&xtime_lock, flags); - /* Start the decrementer on CPUs that have manual control * such as BookE */ diff -uprN linux-2.6.32/arch/powerpc/kernel/traps.c linux-2.6.32.ovz/arch/powerpc/kernel/traps.c --- linux-2.6.32/arch/powerpc/kernel/traps.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/traps.c 2015-03-10 13:23:44.000000000 -0700 @@ -58,6 +58,7 @@ #ifdef CONFIG_FSL_BOOKE #include #endif +#include #if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC) int (*__debugger)(struct pt_regs *regs); @@ -157,6 +158,8 @@ int die(const char *str, struct pt_regs add_taint(TAINT_DIE); spin_unlock_irqrestore(&die.lock, flags); + crash_fadump(regs, str); + if (kexec_should_crash(current) || kexec_sr_activated(smp_processor_id())) crash_kexec(regs); @@ -174,6 +177,15 @@ int die(const char *str, struct pt_regs return 0; } +void user_single_step_siginfo(struct task_struct *tsk, + struct pt_regs *regs, siginfo_t *info) +{ + memset(info, 0, sizeof(*info)); + info->si_signo = SIGTRAP; + info->si_code = TRAP_TRACE; + info->si_addr = (void __user *)regs->nip; +} + void _exception(int signr, struct pt_regs *regs, int code, unsigned long addr) { siginfo_t info; diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso32/getcpu.S linux-2.6.32.ovz/arch/powerpc/kernel/vdso32/getcpu.S --- linux-2.6.32/arch/powerpc/kernel/vdso32/getcpu.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso32/getcpu.S 2015-03-10 13:23:56.000000000 -0700 @@ -0,0 +1,45 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard + */ +#include +#include + + .text +/* + * Exact prototype of getcpu + * + * int __kernel_getcpu(unsigned *cpu, unsigned *node); + * + */ +V_FUNCTION_BEGIN(__kernel_getcpu) + .cfi_startproc + mfspr r5,SPRN_USPRG3 + cmpdi cr0,r3,0 + cmpdi cr1,r4,0 + clrlwi r6,r5,16 + rlwinm r7,r5,16,31-15,31-0 + beq cr0,1f + stw r6,0(r3) +1: beq cr1,2f + stw r7,0(r4) +2: crclr cr0*4+so + li r3,0 /* always success */ + blr + .cfi_endproc +V_FUNCTION_END(__kernel_getcpu) diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso32/gettimeofday.S linux-2.6.32.ovz/arch/powerpc/kernel/vdso32/gettimeofday.S --- linux-2.6.32/arch/powerpc/kernel/vdso32/gettimeofday.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso32/gettimeofday.S 2015-03-10 13:21:35.000000000 -0700 @@ -19,8 +19,10 @@ /* Offset for the low 32-bit part of a field of long type */ #ifdef CONFIG_PPC64 #define LOPART 4 +#define TSPEC_TV_SEC TSPC64_TV_SEC+LOPART #else #define LOPART 0 +#define TSPEC_TV_SEC TSPC32_TV_SEC #endif .text @@ -41,23 +43,11 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) mr r9, r3 /* datapage ptr in r9 */ cmplwi r10,0 /* check if tv is NULL */ beq 3f - bl __do_get_xsec@local /* get xsec from tb & kernel */ - bne- 2f /* out of line -> do syscall */ - - /* seconds are xsec >> 20 */ - rlwinm r5,r4,12,20,31 - rlwimi r5,r3,12,0,19 - stw r5,TVAL32_TV_SEC(r10) - - /* get remaining xsec and convert to usec. we scale - * up remaining xsec by 12 bits and get the top 32 bits - * of the multiplication - */ - rlwinm r5,r4,12,0,19 - lis r6,1000000@h - ori r6,r6,1000000@l - mulhwu r5,r5,r6 - stw r5,TVAL32_TV_USEC(r10) + lis r7,1000000@ha /* load up USEC_PER_SEC */ + addi r7,r7,1000000@l /* so we get microseconds in r4 */ + bl __do_get_tspec@local /* get sec/usec from tb & kernel */ + stw r3,TVAL32_TV_SEC(r10) + stw r4,TVAL32_TV_USEC(r10) 3: cmplwi r11,0 /* check if tz is NULL */ beq 1f @@ -70,14 +60,6 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) crclr cr0*4+so li r3,0 blr - -2: - mtlr r12 - mr r3,r10 - mr r4,r11 - li r0,__NR_gettimeofday - sc - blr .cfi_endproc V_FUNCTION_END(__kernel_gettimeofday) @@ -100,7 +82,8 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) mr r11,r4 /* r11 saves tp */ bl __get_datapage@local /* get data page */ mr r9,r3 /* datapage ptr in r9 */ - + lis r7,NSEC_PER_SEC@h /* want nanoseconds */ + ori r7,r7,NSEC_PER_SEC@l 50: bl __do_get_tspec@local /* get sec/nsec from tb & kernel */ bne cr1,80f /* not monotonic -> all done */ @@ -198,83 +181,12 @@ V_FUNCTION_END(__kernel_clock_getres) /* - * This is the core of gettimeofday() & friends, it returns the xsec - * value in r3 & r4 and expects the datapage ptr (non clobbered) - * in r9. clobbers r0,r4,r5,r6,r7,r8. - * When returning, r8 contains the counter value that can be reused - * by the monotonic clock implementation - */ -__do_get_xsec: - .cfi_startproc - /* Check for update count & load values. We use the low - * order 32 bits of the update count - */ -1: lwz r8,(CFG_TB_UPDATE_COUNT+LOPART)(r9) - andi. r0,r8,1 /* pending update ? loop */ - bne- 1b - xor r0,r8,r8 /* create dependency */ - add r9,r9,r0 - - /* Load orig stamp (offset to TB) */ - lwz r5,CFG_TB_ORIG_STAMP(r9) - lwz r6,(CFG_TB_ORIG_STAMP+4)(r9) - - /* Get a stable TB value */ -2: mftbu r3 - mftbl r4 - mftbu r0 - cmpl cr0,r3,r0 - bne- 2b - - /* Substract tb orig stamp. If the high part is non-zero, we jump to - * the slow path which call the syscall. - * If it's ok, then we have our 32 bits tb_ticks value in r7 - */ - subfc r7,r6,r4 - subfe. r0,r5,r3 - bne- 3f - - /* Load scale factor & do multiplication */ - lwz r5,CFG_TB_TO_XS(r9) /* load values */ - lwz r6,(CFG_TB_TO_XS+4)(r9) - mulhwu r4,r7,r5 - mulhwu r6,r7,r6 - mullw r0,r7,r5 - addc r6,r6,r0 - - /* At this point, we have the scaled xsec value in r4 + XER:CA - * we load & add the stamp since epoch - */ - lwz r5,CFG_STAMP_XSEC(r9) - lwz r6,(CFG_STAMP_XSEC+4)(r9) - adde r4,r4,r6 - addze r3,r5 - - /* We now have our result in r3,r4. We create a fake dependency - * on that result and re-check the counter - */ - or r6,r4,r3 - xor r0,r6,r6 - add r9,r9,r0 - lwz r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9) - cmpl cr0,r8,r0 /* check if updated */ - bne- 1b - - /* Warning ! The caller expects CR:EQ to be set to indicate a - * successful calculation (so it won't fallback to the syscall - * method). We have overriden that CR bit in the counter check, - * but fortunately, the loop exit condition _is_ CR:EQ set, so - * we can exit safely here. If you change this code, be careful - * of that side effect. - */ -3: blr - .cfi_endproc - -/* - * This is the core of clock_gettime(), it returns the current - * time in seconds and nanoseconds in r3 and r4. + * This is the core of clock_gettime() and gettimeofday(), + * it returns the current time in r3 (seconds) and r4. + * On entry, r7 gives the resolution of r4, either USEC_PER_SEC + * or NSEC_PER_SEC, giving r4 in microseconds or nanoseconds. * It expects the datapage ptr in r9 and doesn't clobber it. - * It clobbers r0, r5, r6, r10 and returns NSEC_PER_SEC in r7. + * It clobbers r0, r5 and r6. * On return, r8 contains the counter value that can be reused. * This clobbers cr0 but not any other cr field. */ @@ -297,70 +209,58 @@ __do_get_tspec: 2: mftbu r3 mftbl r4 mftbu r0 - cmpl cr0,r3,r0 + cmplw cr0,r3,r0 bne- 2b /* Subtract tb orig stamp and shift left 12 bits. */ - subfc r7,r6,r4 + subfc r4,r6,r4 subfe r0,r5,r3 slwi r0,r0,12 - rlwimi. r0,r7,12,20,31 - slwi r7,r7,12 + rlwimi. r0,r4,12,20,31 + slwi r4,r4,12 - /* Load scale factor & do multiplication */ + /* + * Load scale factor & do multiplication. + * We only use the high 32 bits of the tb_to_xs value. + * Even with a 1GHz timebase clock, the high 32 bits of + * tb_to_xs will be at least 4 million, so the error from + * ignoring the low 32 bits will be no more than 0.25ppm. + * The error will just make the clock run very very slightly + * slow until the next time the kernel updates the VDSO data, + * at which point the clock will catch up to the kernel's value, + * so there is no long-term error accumulation. + */ lwz r5,CFG_TB_TO_XS(r9) /* load values */ - lwz r6,(CFG_TB_TO_XS+4)(r9) - mulhwu r3,r7,r6 - mullw r10,r7,r5 - mulhwu r4,r7,r5 - addc r10,r3,r10 + mulhwu r4,r4,r5 li r3,0 beq+ 4f /* skip high part computation if 0 */ mulhwu r3,r0,r5 - mullw r7,r0,r5 - mulhwu r5,r0,r6 - mullw r6,r0,r6 - adde r4,r4,r7 - addze r3,r3 + mullw r5,r0,r5 addc r4,r4,r5 addze r3,r3 - addc r10,r10,r6 - -4: addze r4,r4 /* add in carry */ - lis r7,NSEC_PER_SEC@h - ori r7,r7,NSEC_PER_SEC@l - mulhwu r4,r4,r7 /* convert to nanoseconds */ - - /* At this point, we have seconds & nanoseconds since the xtime - * stamp in r3+CA and r4. Load & add the xtime stamp. - */ -#ifdef CONFIG_PPC64 - lwz r5,STAMP_XTIME+TSPC64_TV_SEC+LOPART(r9) - lwz r6,STAMP_XTIME+TSPC64_TV_NSEC+LOPART(r9) -#else - lwz r5,STAMP_XTIME+TSPC32_TV_SEC(r9) - lwz r6,STAMP_XTIME+TSPC32_TV_NSEC(r9) -#endif - add r4,r4,r6 +4: + /* At this point, we have seconds since the xtime stamp + * as a 32.32 fixed-point number in r3 and r4. + * Load & add the xtime stamp. + */ + lwz r5,STAMP_XTIME+TSPEC_TV_SEC(r9) + lwz r6,STAMP_SEC_FRAC(r9) + addc r4,r4,r6 adde r3,r3,r5 - /* We now have our result in r3,r4. We create a fake dependency - * on that result and re-check the counter + /* We create a fake dependency on the result in r3/r4 + * and re-check the counter */ or r6,r4,r3 xor r0,r6,r6 add r9,r9,r0 lwz r0,(CFG_TB_UPDATE_COUNT+LOPART)(r9) - cmpl cr0,r8,r0 /* check if updated */ + cmplw cr0,r8,r0 /* check if updated */ bne- 1b - /* check for nanosecond overflow and adjust if necessary */ - cmpw r4,r7 - bltlr /* all done if no overflow */ - subf r4,r7,r4 /* adjust if overflow */ - addi r3,r3,1 + mulhwu r4,r4,r7 /* convert to micro or nanoseconds */ blr .cfi_endproc diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso32/Makefile linux-2.6.32.ovz/arch/powerpc/kernel/vdso32/Makefile --- linux-2.6.32/arch/powerpc/kernel/vdso32/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso32/Makefile 2015-03-10 13:23:56.000000000 -0700 @@ -1,7 +1,9 @@ # List of files in the vdso, has to be asm only for now -obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o +obj-vdso32-$(CONFIG_PPC64) = getcpu.o +obj-vdso32 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o \ + $(obj-vdso32-y) # Build rules @@ -41,7 +43,8 @@ $(obj-vdso32): %.o: %.S # actual build commands quiet_cmd_vdso32ld = VDSO32L $@ - cmd_vdso32ld = $(CROSS32CC) $(c_flags) -Wl,-T $^ -o $@ + cmd_vdso32ld = $(CROSS32CC) $(c_flags) -Wl,-T $^ -o $@ \ + $(if $(AFTER_LINK),; $(AFTER_LINK)) quiet_cmd_vdso32as = VDSO32A $@ cmd_vdso32as = $(CROSS32CC) $(a_flags) -c -o $@ $< diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso32/vdso32.lds.S linux-2.6.32.ovz/arch/powerpc/kernel/vdso32/vdso32.lds.S --- linux-2.6.32/arch/powerpc/kernel/vdso32/vdso32.lds.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso32/vdso32.lds.S 2015-03-10 13:23:56.000000000 -0700 @@ -147,6 +147,9 @@ VERSION __kernel_sync_dicache_p5; __kernel_sigtramp32; __kernel_sigtramp_rt32; +#ifdef CONFIG_PPC64 + __kernel_getcpu; +#endif local: *; }; diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso64/getcpu.S linux-2.6.32.ovz/arch/powerpc/kernel/vdso64/getcpu.S --- linux-2.6.32/arch/powerpc/kernel/vdso64/getcpu.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso64/getcpu.S 2015-03-10 13:23:56.000000000 -0700 @@ -0,0 +1,45 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard + */ +#include +#include + + .text +/* + * Exact prototype of getcpu + * + * int __kernel_getcpu(unsigned *cpu, unsigned *node); + * + */ +V_FUNCTION_BEGIN(__kernel_getcpu) + .cfi_startproc + mfspr r5,SPRN_USPRG3 + cmpdi cr0,r3,0 + cmpdi cr1,r4,0 + clrlwi r6,r5,16 + rlwinm r7,r5,16,31-15,31-0 + beq cr0,1f + stw r6,0(r3) +1: beq cr1,2f + stw r7,0(r4) +2: crclr cr0*4+so + li r3,0 /* always success */ + blr + .cfi_endproc +V_FUNCTION_END(__kernel_getcpu) diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso64/gettimeofday.S linux-2.6.32.ovz/arch/powerpc/kernel/vdso64/gettimeofday.S --- linux-2.6.32/arch/powerpc/kernel/vdso64/gettimeofday.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso64/gettimeofday.S 2015-03-10 13:21:35.000000000 -0700 @@ -33,18 +33,11 @@ V_FUNCTION_BEGIN(__kernel_gettimeofday) bl V_LOCAL_FUNC(__get_datapage) /* get data page */ cmpldi r11,0 /* check if tv is NULL */ beq 2f - bl V_LOCAL_FUNC(__do_get_xsec) /* get xsec from tb & kernel */ - lis r7,15 /* r7 = 1000000 = USEC_PER_SEC */ - ori r7,r7,16960 - rldicl r5,r4,44,20 /* r5 = sec = xsec / XSEC_PER_SEC */ - rldicr r6,r5,20,43 /* r6 = sec * XSEC_PER_SEC */ - std r5,TVAL64_TV_SEC(r11) /* store sec in tv */ - subf r0,r6,r4 /* r0 = xsec = (xsec - r6) */ - mulld r0,r0,r7 /* usec = (xsec * USEC_PER_SEC) / - * XSEC_PER_SEC - */ - rldicl r0,r0,44,20 - std r0,TVAL64_TV_USEC(r11) /* store usec in tv */ + lis r7,1000000@ha /* load up USEC_PER_SEC */ + addi r7,r7,1000000@l + bl V_LOCAL_FUNC(__do_get_tspec) /* get sec/us from tb & kernel */ + std r4,TVAL64_TV_SEC(r11) /* store sec in tv */ + std r5,TVAL64_TV_USEC(r11) /* store usec in tv */ 2: cmpldi r10,0 /* check if tz is NULL */ beq 1f lwz r4,CFG_TZ_MINUTEWEST(r3)/* fill tz */ @@ -77,6 +70,8 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) .cfi_register lr,r12 mr r11,r4 /* r11 saves tp */ bl V_LOCAL_FUNC(__get_datapage) /* get data page */ + lis r7,NSEC_PER_SEC@h /* want nanoseconds */ + ori r7,r7,NSEC_PER_SEC@l 50: bl V_LOCAL_FUNC(__do_get_tspec) /* get time from tb & kernel */ bne cr1,80f /* if not monotonic, all done */ @@ -171,49 +166,12 @@ V_FUNCTION_END(__kernel_clock_getres) /* - * This is the core of gettimeofday(), it returns the xsec - * value in r4 and expects the datapage ptr (non clobbered) - * in r3. clobbers r0,r4,r5,r6,r7,r8 - * When returning, r8 contains the counter value that can be reused - */ -V_FUNCTION_BEGIN(__do_get_xsec) - .cfi_startproc - /* check for update count & load values */ -1: ld r8,CFG_TB_UPDATE_COUNT(r3) - andi. r0,r8,1 /* pending update ? loop */ - bne- 1b - xor r0,r8,r8 /* create dependency */ - add r3,r3,r0 - - /* Get TB & offset it. We use the MFTB macro which will generate - * workaround code for Cell. - */ - MFTB(r7) - ld r9,CFG_TB_ORIG_STAMP(r3) - subf r7,r9,r7 - - /* Scale result */ - ld r5,CFG_TB_TO_XS(r3) - mulhdu r7,r7,r5 - - /* Add stamp since epoch */ - ld r6,CFG_STAMP_XSEC(r3) - add r4,r6,r7 - - xor r0,r4,r4 - add r3,r3,r0 - ld r0,CFG_TB_UPDATE_COUNT(r3) - cmpld cr0,r0,r8 /* check if updated */ - bne- 1b - blr - .cfi_endproc -V_FUNCTION_END(__do_get_xsec) - -/* - * This is the core of clock_gettime(), it returns the current - * time in seconds and nanoseconds in r4 and r5. + * This is the core of clock_gettime() and gettimeofday(), + * it returns the current time in r4 (seconds) and r5. + * On entry, r7 gives the resolution of r5, either USEC_PER_SEC + * or NSEC_PER_SEC, giving r5 in microseconds or nanoseconds. * It expects the datapage ptr in r3 and doesn't clobber it. - * It clobbers r0 and r6 and returns NSEC_PER_SEC in r7. + * It clobbers r0, r6 and r9. * On return, r8 contains the counter value that can be reused. * This clobbers cr0 but not any other cr field. */ @@ -229,18 +187,18 @@ V_FUNCTION_BEGIN(__do_get_tspec) /* Get TB & offset it. We use the MFTB macro which will generate * workaround code for Cell. */ - MFTB(r7) + MFTB(r6) ld r9,CFG_TB_ORIG_STAMP(r3) - subf r7,r9,r7 + subf r6,r9,r6 /* Scale result */ ld r5,CFG_TB_TO_XS(r3) - sldi r7,r7,12 /* compute time since stamp_xtime */ - mulhdu r6,r7,r5 /* in units of 2^-32 seconds */ + sldi r6,r6,12 /* compute time since stamp_xtime */ + mulhdu r6,r6,r5 /* in units of 2^-32 seconds */ /* Add stamp since epoch */ ld r4,STAMP_XTIME+TSPC64_TV_SEC(r3) - ld r5,STAMP_XTIME+TSPC64_TV_NSEC(r3) + lwz r5,STAMP_SEC_FRAC(r3) or r0,r4,r5 or r0,r0,r6 xor r0,r0,r0 @@ -250,17 +208,11 @@ V_FUNCTION_BEGIN(__do_get_tspec) bne- 1b /* reload if so */ /* convert to seconds & nanoseconds and add to stamp */ - lis r7,NSEC_PER_SEC@h - ori r7,r7,NSEC_PER_SEC@l - mulhwu r0,r6,r7 /* compute nanoseconds and */ + add r6,r6,r5 /* add on fractional seconds of xtime */ + mulhwu r5,r6,r7 /* compute micro or nanoseconds and */ srdi r6,r6,32 /* seconds since stamp_xtime */ - clrldi r0,r0,32 - add r5,r5,r0 /* add nanoseconds together */ - cmpd r5,r7 /* overflow? */ + clrldi r5,r5,32 add r4,r4,r6 - bltlr /* all done if no overflow */ - subf r5,r7,r5 /* if overflow, adjust */ - addi r4,r4,1 blr .cfi_endproc V_FUNCTION_END(__do_get_tspec) diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso64/Makefile linux-2.6.32.ovz/arch/powerpc/kernel/vdso64/Makefile --- linux-2.6.32/arch/powerpc/kernel/vdso64/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso64/Makefile 2015-03-10 13:23:56.000000000 -0700 @@ -1,6 +1,6 @@ # List of files in the vdso, has to be asm only for now -obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o +obj-vdso64 = sigtramp.o gettimeofday.o datapage.o cacheflush.o note.o getcpu.o # Build rules @@ -36,7 +36,8 @@ $(obj-vdso64): %.o: %.S # actual build commands quiet_cmd_vdso64ld = VDSO64L $@ - cmd_vdso64ld = $(CC) $(c_flags) -Wl,-T $^ -o $@ + cmd_vdso64ld = $(CC) $(c_flags) -Wl,-T $^ -o $@ \ + $(if $(AFTER_LINK),; $(AFTER_LINK)) quiet_cmd_vdso64as = VDSO64A $@ cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $< diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso64/vdso64.lds.S linux-2.6.32.ovz/arch/powerpc/kernel/vdso64/vdso64.lds.S --- linux-2.6.32/arch/powerpc/kernel/vdso64/vdso64.lds.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso64/vdso64.lds.S 2015-03-10 13:23:56.000000000 -0700 @@ -146,6 +146,7 @@ VERSION __kernel_sync_dicache; __kernel_sync_dicache_p5; __kernel_sigtramp_rt64; + __kernel_getcpu; local: *; }; diff -uprN linux-2.6.32/arch/powerpc/kernel/vdso.c linux-2.6.32.ovz/arch/powerpc/kernel/vdso.c --- linux-2.6.32/arch/powerpc/kernel/vdso.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vdso.c 2015-03-10 13:23:56.000000000 -0700 @@ -264,17 +264,11 @@ int arch_setup_additional_pages(struct l * the "data" page of the vDSO or you'll stop getting kernel updates * and your nice userland gettimeofday will be totally dead. * It's fine to use that for setting breakpoints in the vDSO code - * pages though - * - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. + * pages though. */ rc = install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdso_pagelist); if (rc) { current->mm->context.vdso_base = 0; @@ -714,6 +708,31 @@ static void __init vdso_setup_syscall_ma } } +#ifdef CONFIG_PPC64 +int __cpuinit vdso_getcpu_init(void) +{ + unsigned long cpu, node, val; + + /* + * SPRG3 contains the CPU in the bottom 16 bits and the NUMA node in + * the next 16 bits. The VDSO uses this to implement getcpu(). + */ + cpu = get_cpu(); + WARN_ON_ONCE(cpu > 0xffff); + + node = cpu_to_node(cpu); + WARN_ON_ONCE(node > 0xffff); + + val = (cpu & 0xfff) | ((node & 0xffff) << 16); + mtspr(SPRN_SPRG3, val); + + put_cpu(); + + return 0; +} +/* We need to call this before SMP init */ +early_initcall(vdso_getcpu_init); +#endif static int __init vdso_init(void) { @@ -820,17 +839,17 @@ static int __init vdso_init(void) } arch_initcall(vdso_init); -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return 0; } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area(struct mm_struct *mm, unsigned long addr) { return 0; } -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { return NULL; } diff -uprN linux-2.6.32/arch/powerpc/kernel/vector.S linux-2.6.32.ovz/arch/powerpc/kernel/vector.S --- linux-2.6.32/arch/powerpc/kernel/vector.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vector.S 2015-03-10 13:21:04.000000000 -0700 @@ -58,7 +58,7 @@ _GLOBAL(load_up_altivec) * all 1's */ mfspr r4,SPRN_VRSAVE - cmpdi 0,r4,0 + cmpwi 0,r4,0 bne+ 1f li r4,-1 mtspr SPRN_VRSAVE,r4 diff -uprN linux-2.6.32/arch/powerpc/kernel/vio.c linux-2.6.32.ovz/arch/powerpc/kernel/vio.c --- linux-2.6.32/arch/powerpc/kernel/vio.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vio.c 2015-03-10 13:23:53.000000000 -0700 @@ -14,7 +14,9 @@ * 2 of the License, or (at your option) any later version. */ +#include #include +#include #include #include #include @@ -699,13 +701,26 @@ static int vio_cmo_bus_probe(struct vio_ struct vio_driver *viodrv = to_vio_driver(dev->driver); unsigned long flags; size_t size; + bool dma_capable = false; - /* - * Check to see that device has a DMA window and configure - * entitlement for the device. - */ - if (of_get_property(viodev->dev.archdata.of_node, - "ibm,my-dma-window", NULL)) { + /* A device requires entitlement if it has a DMA window property */ + switch (viodev->family) { + case VDEVICE: + if (of_get_property(viodev->dev.archdata.of_node, + "ibm,my-dma-window", NULL)) + dma_capable = true; + break; + case PFO: + dma_capable = false; + break; + default: + dev_warn(dev, "unknown device family: %d\n", viodev->family); + BUG(); + break; + } + + /* Configure entitlement for the device. */ + if (dma_capable) { /* Check that the driver is CMO enabled and get desired DMA */ if (!viodrv->get_desired_dma) { dev_err(dev, "%s: device driver does not support CMO\n", @@ -1039,6 +1054,94 @@ static void vio_cmo_sysfs_init(void) { } EXPORT_SYMBOL(vio_cmo_entitlement_update); EXPORT_SYMBOL(vio_cmo_set_dev_desired); + +/* + * Platform Facilities Option (PFO) support + */ + +/** + * vio_h_cop_sync - Perform a synchronous PFO co-processor operation + * + * @vdev - Pointer to a struct vio_dev for device + * @op - Pointer to a struct vio_pfo_op for the operation parameters + * + * Calls the hypervisor to synchronously perform the PFO operation + * described in @op. In the case of a busy response from the hypervisor, + * the operation will be re-submitted indefinitely unless a non-zero timeout + * is specified or an error occurs. The timeout places a limit on when to + * stop re-submitting a operation, the total time can be exceeded if an + * operation is in progress. + * + * If op->hcall_ret is not NULL, this will be set to the return from the + * last h_cop_op call or it will be 0 if an error not involving the h_call + * was encountered. + * + * Returns: + * 0 on success, + * -EINVAL if the h_call fails due to an invalid parameter, + * -E2BIG if the h_call can not be performed synchronously, + * -EBUSY if a timeout is specified and has elapsed, + * -EACCES if the memory area for data/status has been rescinded, or + * -EPERM if a hardware fault has been indicated + */ +int vio_h_cop_sync(struct vio_dev *vdev, struct vio_pfo_op *op) +{ + struct device *dev = &vdev->dev; + unsigned long deadline = 0; + long hret = 0; + int ret = 0; + + if (op->timeout) + deadline = jiffies + msecs_to_jiffies(op->timeout); + + while (true) { + hret = plpar_hcall_norets(H_COP, op->flags, + vdev->resource_id, + op->in, op->inlen, op->out, + op->outlen, op->csbcpb); + + if (hret == H_SUCCESS || + (hret != H_NOT_ENOUGH_RESOURCES && + hret != H_BUSY && hret != H_RESOURCE) || + (op->timeout && time_after(deadline, jiffies))) + break; + + dev_dbg(dev, "%s: hcall ret(%ld), retrying.\n", __func__, hret); + } + + switch (hret) { + case H_SUCCESS: + ret = 0; + break; + case H_OP_MODE: + case H_TOO_BIG: + ret = -E2BIG; + break; + case H_RESCINDED: + ret = -EACCES; + break; + case H_HARDWARE: + ret = -EPERM; + break; + case H_NOT_ENOUGH_RESOURCES: + case H_RESOURCE: + case H_BUSY: + ret = -EBUSY; + break; + default: + ret = -EINVAL; + break; + } + + if (ret) + dev_dbg(dev, "%s: Sync h_cop_op failure (ret:%d) (hret:%ld)\n", + __func__, ret, hret); + + op->hcall_err = hret; + return ret; +} +EXPORT_SYMBOL(vio_h_cop_sync); + static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) { const unsigned char *dma_window; @@ -1053,7 +1156,7 @@ static struct iommu_table *vio_build_iom if (!dma_window) return NULL; - tbl = kmalloc(sizeof(*tbl), GFP_KERNEL); + tbl = kzalloc(sizeof(*tbl), GFP_KERNEL); if (tbl == NULL) return NULL; @@ -1066,6 +1169,7 @@ static struct iommu_table *vio_build_iom tbl->it_offset = offset >> IOMMU_PAGE_SHIFT; tbl->it_busno = 0; tbl->it_type = TCE_VB; + tbl->it_blocksize = 16; return iommu_init_table(tbl, -1); } @@ -1194,35 +1298,87 @@ static void __devinit vio_dev_release(st struct vio_dev *vio_register_device_node(struct device_node *of_node) { struct vio_dev *viodev; + struct device_node *parent_node; const unsigned int *unit_address; + const unsigned int *pfo_resid = NULL; + enum vio_dev_family family; + const char *of_node_name = of_node->name ? of_node->name : ""; - /* we need the 'device_type' property, in order to match with drivers */ - if (of_node->type == NULL) { - printk(KERN_WARNING "%s: node %s missing 'device_type'\n", - __func__, - of_node->name ? of_node->name : ""); + /* + * Determine if this node is a under the /vdevice node or under the + * /ibm,platform-facilities node. This decides the device's family. + */ + parent_node = of_get_parent(of_node); + if (parent_node) { + if (!strcmp(parent_node->full_name, "/ibm,platform-facilities")) + family = PFO; + else if (!strcmp(parent_node->full_name, "/vdevice")) + family = VDEVICE; + else { + pr_warn("%s: parent(%s) of %s not recognized.\n", + __func__, + parent_node->full_name, + of_node_name); + of_node_put(parent_node); + return NULL; + } + of_node_put(parent_node); + } else { + pr_warn("%s: could not determine the parent of node %s.\n", + __func__, of_node_name); return NULL; } - unit_address = of_get_property(of_node, "reg", NULL); - if (unit_address == NULL) { - printk(KERN_WARNING "%s: node %s missing 'reg'\n", - __func__, - of_node->name ? of_node->name : ""); - return NULL; + if (family == PFO) { + if (of_get_property(of_node, "interrupt-controller", NULL)) { + pr_debug("%s: Skipping the interrupt controller %s.\n", + __func__, of_node_name); + return NULL; + } } /* allocate a vio_dev for this node */ viodev = kzalloc(sizeof(struct vio_dev), GFP_KERNEL); - if (viodev == NULL) + if (viodev == NULL) { + pr_warn("%s: allocation failure for VIO device.\n", __func__); return NULL; + } - viodev->irq = irq_of_parse_and_map(of_node, 0); + /* we need the 'device_type' property, in order to match with drivers */ + viodev->family = family; + if (viodev->family == VDEVICE) { + if (of_node->type != NULL) + viodev->type = of_node->type; + else { + pr_warn("%s: node %s is missing the 'device_type' " + "property.\n", __func__, of_node_name); + goto out; + } + + unit_address = of_get_property(of_node, "reg", NULL); + if (unit_address == NULL) { + pr_warn("%s: node %s missing 'reg'\n", + __func__, of_node_name); + goto out; + } + dev_set_name(&viodev->dev, "%x", *unit_address); + viodev->irq = irq_of_parse_and_map(of_node, 0); + viodev->unit_address = *unit_address; + } else { + /* PFO devices need their resource_id for submitting COP_OPs + * This is an optional field for devices, but is required when + * performing synchronous ops */ + pfo_resid = of_get_property(of_node, "ibm,resource-id", NULL); + if (pfo_resid != NULL) + viodev->resource_id = *pfo_resid; + + unit_address = NULL; + dev_set_name(&viodev->dev, "%s", of_node_name); + viodev->type = of_node_name; + viodev->irq = 0; + } - dev_set_name(&viodev->dev, "%x", *unit_address); viodev->name = of_node->name; - viodev->type = of_node->type; - viodev->unit_address = *unit_address; if (firmware_has_feature(FW_FEATURE_ISERIES)) { unit_address = of_get_property(of_node, "linux,unit_address", NULL); @@ -1231,11 +1387,6 @@ struct vio_dev *vio_register_device_node } viodev->dev.archdata.of_node = of_node_get(of_node); - if (firmware_has_feature(FW_FEATURE_CMO)) - vio_cmo_set_dma_ops(viodev); - else - viodev->dev.archdata.dma_ops = &dma_iommu_ops; - set_iommu_table_base(&viodev->dev, vio_build_iommu_table(viodev)); set_dev_node(&viodev->dev, of_node_to_nid(of_node)); /* init generic 'struct device' fields: */ @@ -1243,6 +1394,21 @@ struct vio_dev *vio_register_device_node viodev->dev.bus = &vio_bus_type; viodev->dev.release = vio_dev_release; + if (of_get_property(viodev->dev.archdata.of_node, + "ibm,my-dma-window", NULL)) { + if (firmware_has_feature(FW_FEATURE_CMO)) + vio_cmo_set_dma_ops(viodev); + else + set_dma_ops(&viodev->dev, &dma_iommu_ops); + + set_iommu_table_base(&viodev->dev, + vio_build_iommu_table(viodev)); + + /* needed to ensure proper operation of coherent allocations + * later, in case driver doesn't set it explicitly */ + dma_set_mask(&viodev->dev, DMA_BIT_MASK(64)); + dma_set_coherent_mask(&viodev->dev, DMA_BIT_MASK(64)); + } /* register with generic device framework */ if (device_register(&viodev->dev)) { printk(KERN_ERR "%s: failed to register device %s\n", @@ -1253,16 +1419,51 @@ struct vio_dev *vio_register_device_node } return viodev; + +out: /* Use this exit point for any return prior to device_register */ + kfree(viodev); + + return NULL; } EXPORT_SYMBOL(vio_register_device_node); +/* + * vio_bus_scan_for_devices - Scan OF and register each child device + * @root_name - OF node name for the root of the subtree to search. + * This must be non-NULL + * + * Starting from the root node provide, register the device node for + * each child beneath the root. + */ +static void vio_bus_scan_register_devices(char *root_name) +{ + struct device_node *node_root, *node_child; + + if (!root_name) + return; + + node_root = of_find_node_by_name(NULL, root_name); + if (node_root) { + + /* + * Create struct vio_devices for each virtual device in + * the device tree. Drivers will associate with them later. + */ + node_child = of_get_next_child(node_root, NULL); + while (node_child) { + vio_register_device_node(node_child); + node_child = of_get_next_child(node_root, node_child); + } + of_node_put(node_root); + } +} + /** * vio_bus_init: - Initialize the virtual IO bus */ static int __init vio_bus_init(void) { int err; - struct device_node *node_vroot; if (firmware_has_feature(FW_FEATURE_CMO)) vio_cmo_sysfs_init(); @@ -1287,19 +1488,8 @@ static int __init vio_bus_init(void) if (firmware_has_feature(FW_FEATURE_CMO)) vio_cmo_bus_init(); - node_vroot = of_find_node_by_name(NULL, "vdevice"); - if (node_vroot) { - struct device_node *of_node; - - /* - * Create struct vio_devices for each virtual device in - * the device tree. Drivers will associate with them later. - */ - for (of_node = node_vroot->child; of_node != NULL; - of_node = of_node->sibling) - vio_register_device_node(of_node); - of_node_put(node_vroot); - } + vio_bus_scan_register_devices("vdevice"); + vio_bus_scan_register_devices("ibm,platform-facilities"); return 0; } @@ -1319,9 +1509,27 @@ static ssize_t devspec_show(struct devic return sprintf(buf, "%s\n", of_node ? of_node->full_name : "none"); } +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + const struct vio_dev *vio_dev = to_vio_dev(dev); + struct device_node *dn; + const char *cp; + + dn = dev->archdata.of_node; + if (!dn) + return -ENODEV; + cp = of_get_property(dn, "compatible", NULL); + if (!cp) + return -ENODEV; + + return sprintf(buf, "vio:T%sS%s\n", vio_dev->type, cp); +} + static struct device_attribute vio_dev_attrs[] = { __ATTR_RO(name), __ATTR_RO(devspec), + __ATTR_RO(modalias), __ATTR_NULL }; @@ -1357,6 +1565,29 @@ static int vio_hotplug(struct device *de return 0; } +static int vio_pm_suspend(struct device *dev) +{ + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + + if (pm && pm->suspend) + return pm->suspend(dev); + return 0; +} + +static int vio_pm_resume(struct device *dev) +{ + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + + if (pm && pm->resume) + return pm->resume(dev); + return 0; +} + +const struct dev_pm_ops vio_dev_pm_ops = { + .suspend = vio_pm_suspend, + .resume = vio_pm_resume, +}; + static struct bus_type vio_bus_type = { .name = "vio", .dev_attrs = vio_dev_attrs, @@ -1364,6 +1595,7 @@ static struct bus_type vio_bus_type = { .match = vio_bus_match, .probe = vio_bus_probe, .remove = vio_bus_remove, + .pm = &vio_dev_pm_ops, }; /** @@ -1404,12 +1636,28 @@ struct vio_dev *vio_find_node(struct dev { const uint32_t *unit_address; char kobj_name[20]; + struct device_node *vnode_parent; + const char *dev_type; + + vnode_parent = of_get_parent(vnode); + if (!vnode_parent) + return NULL; + + dev_type = of_get_property(vnode_parent, "device_type", NULL); + of_node_put(vnode_parent); + if (!dev_type) + return NULL; /* construct the kobject name from the device node */ - unit_address = of_get_property(vnode, "reg", NULL); - if (!unit_address) + if (!strcmp(dev_type, "vdevice")) { + unit_address = of_get_property(vnode, "reg", NULL); + if (!unit_address) + return NULL; + snprintf(kobj_name, sizeof(kobj_name), "%x", *unit_address); + } else if (!strcmp(dev_type, "ibm,platform-facilities")) + snprintf(kobj_name, sizeof(kobj_name), "%s", vnode->name); + else return NULL; - snprintf(kobj_name, sizeof(kobj_name), "%x", *unit_address); return vio_find_name(kobj_name); } diff -uprN linux-2.6.32/arch/powerpc/kernel/vmlinux.lds.S linux-2.6.32.ovz/arch/powerpc/kernel/vmlinux.lds.S --- linux-2.6.32/arch/powerpc/kernel/vmlinux.lds.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kernel/vmlinux.lds.S 2015-03-10 13:21:12.000000000 -0700 @@ -38,6 +38,9 @@ jiffies = jiffies_64 + 4; #endif SECTIONS { + . = 0; + reloc_start = .; + . = KERNELBASE; /* diff -uprN linux-2.6.32/arch/powerpc/kvm/Kconfig linux-2.6.32.ovz/arch/powerpc/kvm/Kconfig --- linux-2.6.32/arch/powerpc/kvm/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kvm/Kconfig 2015-03-10 13:20:59.000000000 -0700 @@ -58,6 +58,7 @@ config KVM_E500 If unsure, say N. +source drivers/vhost/Kconfig source drivers/virtio/Kconfig endif # VIRTUALIZATION diff -uprN linux-2.6.32/arch/powerpc/kvm/powerpc.c linux-2.6.32.ovz/arch/powerpc/kvm/powerpc.c --- linux-2.6.32/arch/powerpc/kvm/powerpc.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/kvm/powerpc.c 2015-03-10 13:22:06.000000000 -0700 @@ -78,8 +78,9 @@ int kvmppc_emulate_mmio(struct kvm_run * return r; } -void kvm_arch_hardware_enable(void *garbage) +int kvm_arch_hardware_enable(void *garbage) { + return 0; } void kvm_arch_hardware_disable(void *garbage) @@ -135,6 +136,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm { kvmppc_free_vcpus(kvm); kvm_free_physmem(kvm); + cleanup_srcu_struct(&kvm->srcu); kfree(kvm); } @@ -160,14 +162,24 @@ long kvm_arch_dev_ioctl(struct file *fil return -EINVAL; } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc) +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, + int user_alloc) { return 0; } +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + return; +} + + void kvm_arch_flush_shadow(struct kvm *kvm) { } diff -uprN linux-2.6.32/arch/powerpc/lib/checksum_64.S linux-2.6.32.ovz/arch/powerpc/lib/checksum_64.S --- linux-2.6.32/arch/powerpc/lib/checksum_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/checksum_64.S 2015-03-10 13:23:55.000000000 -0700 @@ -65,165 +65,393 @@ _GLOBAL(csum_tcpudp_magic) srwi r3,r3,16 blr +#define STACKFRAMESIZE 256 +#define STK_REG(i) (112 + ((i)-14)*8) + /* * Computes the checksum of a memory block at buff, length len, * and adds in "sum" (32-bit). * - * This code assumes at least halfword alignment, though the length - * can be any number of bytes. The sum is accumulated in r5. - * * csum_partial(r3=buff, r4=len, r5=sum) */ _GLOBAL(csum_partial) - subi r3,r3,8 /* we'll offset by 8 for the loads */ - srdi. r6,r4,3 /* divide by 8 for doubleword count */ - addic r5,r5,0 /* clear carry */ - beq 3f /* if we're doing < 8 bytes */ - andi. r0,r3,2 /* aligned on a word boundary already? */ - beq+ 1f - lhz r6,8(r3) /* do 2 bytes to get aligned */ - addi r3,r3,2 - subi r4,r4,2 - addc r5,r5,r6 - srdi. r6,r4,3 /* recompute number of doublewords */ - beq 3f /* any left? */ -1: mtctr r6 -2: ldu r6,8(r3) /* main sum loop */ - adde r5,r5,r6 - bdnz 2b - andi. r4,r4,7 /* compute bytes left to sum after doublewords */ -3: cmpwi 0,r4,4 /* is at least a full word left? */ - blt 4f - lwz r6,8(r3) /* sum this word */ + addic r0,r5,0 /* clear carry */ + + srdi. r6,r4,3 /* less than 8 bytes? */ + beq .Lcsum_tail_word + + /* + * If only halfword aligned, align to a double word. Since odd + * aligned addresses should be rare and they would require more + * work to calculate the correct checksum, we ignore that case + * and take the potential slowdown of unaligned loads. + */ + rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */ + beq .Lcsum_aligned + + li r7,4 + sub r6,r7,r6 + mtctr r6 + +1: + lhz r6,0(r3) /* align to doubleword */ + subi r4,r4,2 + addi r3,r3,2 + adde r0,r0,r6 + bdnz 1b + +.Lcsum_aligned: + /* + * We unroll the loop such that each iteration is 64 bytes with an + * entry and exit limb of 64 bytes, meaning a minimum size of + * 128 bytes. + */ + srdi. r6,r4,7 + beq .Lcsum_tail_doublewords /* len < 128 */ + + srdi r6,r4,6 + subi r6,r6,1 + mtctr r6 + + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + + ld r6,0(r3) + ld r9,8(r3) + + ld r10,16(r3) + ld r11,24(r3) + + /* + * On POWER6 and POWER7 back to back addes take 2 cycles because of + * the XER dependency. This means the fastest this loop can go is + * 16 cycles per iteration. The scheduling of the loop below has + * been shown to hit this on both POWER6 and POWER7. + */ + .align 5 +2: + adde r0,r0,r6 + ld r12,32(r3) + ld r14,40(r3) + + adde r0,r0,r9 + ld r15,48(r3) + ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 + + adde r0,r0,r11 + + adde r0,r0,r12 + + adde r0,r0,r14 + + adde r0,r0,r15 + ld r6,0(r3) + ld r9,8(r3) + + adde r0,r0,r16 + ld r10,16(r3) + ld r11,24(r3) + bdnz 2b + + + adde r0,r0,r6 + ld r12,32(r3) + ld r14,40(r3) + + adde r0,r0,r9 + ld r15,48(r3) + ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 + adde r0,r0,r11 + adde r0,r0,r12 + adde r0,r0,r14 + adde r0,r0,r15 + adde r0,r0,r16 + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + addi r1,r1,STACKFRAMESIZE + + andi. r4,r4,63 + +.Lcsum_tail_doublewords: /* Up to 127 bytes to go */ + srdi. r6,r4,3 + beq .Lcsum_tail_word + + mtctr r6 +3: + ld r6,0(r3) + addi r3,r3,8 + adde r0,r0,r6 + bdnz 3b + + andi. r4,r4,7 + +.Lcsum_tail_word: /* Up to 7 bytes to go */ + srdi. r6,r4,2 + beq .Lcsum_tail_halfword + + lwz r6,0(r3) addi r3,r3,4 + adde r0,r0,r6 subi r4,r4,4 - adde r5,r5,r6 -4: cmpwi 0,r4,2 /* is at least a halfword left? */ - blt+ 5f - lhz r6,8(r3) /* sum this halfword */ - addi r3,r3,2 - subi r4,r4,2 - adde r5,r5,r6 -5: cmpwi 0,r4,1 /* is at least a byte left? */ - bne+ 6f - lbz r6,8(r3) /* sum this byte */ - slwi r6,r6,8 /* this byte is assumed to be the upper byte of a halfword */ - adde r5,r5,r6 -6: addze r5,r5 /* add in final carry */ - rldicl r4,r5,32,0 /* fold two 32-bit halves together */ - add r3,r4,r5 - srdi r3,r3,32 - blr + +.Lcsum_tail_halfword: /* Up to 3 bytes to go */ + srdi. r6,r4,1 + beq .Lcsum_tail_byte + + lhz r6,0(r3) + addi r3,r3,2 + adde r0,r0,r6 + subi r4,r4,2 + +.Lcsum_tail_byte: /* Up to 1 byte to go */ + andi. r6,r4,1 + beq .Lcsum_finish + + lbz r6,0(r3) + sldi r9,r6,8 /* Pad the byte out to 16 bits */ + adde r0,r0,r9 + +.Lcsum_finish: + addze r0,r0 /* add in final carry */ + rldicl r4,r0,32,0 /* fold two 32 bit halves together */ + add r3,r4,r0 + srdi r3,r3,32 + blr + + + .macro source +100: + .section __ex_table,"a" + .align 3 + .llong 100b,.Lsrc_error + .previous + .endm + + .macro dest +200: + .section __ex_table,"a" + .align 3 + .llong 200b,.Ldest_error + .previous + .endm /* * Computes the checksum of a memory block at src, length len, * and adds in "sum" (32-bit), while copying the block to dst. * If an access exception occurs on src or dst, it stores -EFAULT - * to *src_err or *dst_err respectively, and (for an error on - * src) zeroes the rest of dst. - * - * This code needs to be reworked to take advantage of 64 bit sum+copy. - * However, due to tokenring halfword alignment problems this will be very - * tricky. For now we'll leave it until we instrument it somehow. + * to *src_err or *dst_err respectively. The caller must take any action + * required in this case (zeroing memory, recalculating partial checksum etc). * * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err) */ _GLOBAL(csum_partial_copy_generic) - addic r0,r6,0 - subi r3,r3,4 - subi r4,r4,4 - srwi. r6,r5,2 - beq 3f /* if we're doing < 4 bytes */ - andi. r9,r4,2 /* Align dst to longword boundary */ - beq+ 1f -81: lhz r6,4(r3) /* do 2 bytes to get aligned */ - addi r3,r3,2 + addic r0,r6,0 /* clear carry */ + + srdi. r6,r5,3 /* less than 8 bytes? */ + beq .Lcopy_tail_word + + /* + * If only halfword aligned, align to a double word. Since odd + * aligned addresses should be rare and they would require more + * work to calculate the correct checksum, we ignore that case + * and take the potential slowdown of unaligned loads. + * + * If the source and destination are relatively unaligned we only + * align the source. This keeps things simple. + */ + rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */ + beq .Lcopy_aligned + + li r7,4 + sub r6,r7,r6 + mtctr r6 + +1: +source; lhz r6,0(r3) /* align to doubleword */ subi r5,r5,2 -91: sth r6,4(r4) - addi r4,r4,2 - addc r0,r0,r6 - srwi. r6,r5,2 /* # words to do */ - beq 3f -1: mtctr r6 -82: lwzu r6,4(r3) /* the bdnz has zero overhead, so it should */ -92: stwu r6,4(r4) /* be unnecessary to unroll this loop */ - adde r0,r0,r6 - bdnz 82b - andi. r5,r5,3 -3: cmpwi 0,r5,2 - blt+ 4f -83: lhz r6,4(r3) addi r3,r3,2 - subi r5,r5,2 -93: sth r6,4(r4) + adde r0,r0,r6 +dest; sth r6,0(r4) addi r4,r4,2 + bdnz 1b + +.Lcopy_aligned: + /* + * We unroll the loop such that each iteration is 64 bytes with an + * entry and exit limb of 64 bytes, meaning a minimum size of + * 128 bytes. + */ + srdi. r6,r5,7 + beq .Lcopy_tail_doublewords /* len < 128 */ + + srdi r6,r5,6 + subi r6,r6,1 + mtctr r6 + + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + +source; ld r6,0(r3) +source; ld r9,8(r3) + +source; ld r10,16(r3) +source; ld r11,24(r3) + + /* + * On POWER6 and POWER7 back to back addes take 2 cycles because of + * the XER dependency. This means the fastest this loop can go is + * 16 cycles per iteration. The scheduling of the loop below has + * been shown to hit this on both POWER6 and POWER7. + */ + .align 5 +2: adde r0,r0,r6 -4: cmpwi 0,r5,1 - bne+ 5f -84: lbz r6,4(r3) -94: stb r6,4(r4) - slwi r6,r6,8 /* Upper byte of word */ - adde r0,r0,r6 -5: addze r3,r0 /* add in final carry (unlikely with 64-bit regs) */ - rldicl r4,r3,32,0 /* fold 64 bit value */ - add r3,r4,r3 - srdi r3,r3,32 - blr +source; ld r12,32(r3) +source; ld r14,40(r3) -/* These shouldn't go in the fixup section, since that would - cause the ex_table addresses to get out of order. */ + adde r0,r0,r9 +source; ld r15,48(r3) +source; ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 +dest; std r6,0(r4) +dest; std r9,8(r4) + + adde r0,r0,r11 +dest; std r10,16(r4) +dest; std r11,24(r4) + + adde r0,r0,r12 +dest; std r12,32(r4) +dest; std r14,40(r4) + + adde r0,r0,r14 +dest; std r15,48(r4) +dest; std r16,56(r4) + addi r4,r4,64 + + adde r0,r0,r15 +source; ld r6,0(r3) +source; ld r9,8(r3) + + adde r0,r0,r16 +source; ld r10,16(r3) +source; ld r11,24(r3) + bdnz 2b + + + adde r0,r0,r6 +source; ld r12,32(r3) +source; ld r14,40(r3) + + adde r0,r0,r9 +source; ld r15,48(r3) +source; ld r16,56(r3) + addi r3,r3,64 + + adde r0,r0,r10 +dest; std r6,0(r4) +dest; std r9,8(r4) + + adde r0,r0,r11 +dest; std r10,16(r4) +dest; std r11,24(r4) + + adde r0,r0,r12 +dest; std r12,32(r4) +dest; std r14,40(r4) + + adde r0,r0,r14 +dest; std r15,48(r4) +dest; std r16,56(r4) + addi r4,r4,64 + + adde r0,r0,r15 + adde r0,r0,r16 + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + addi r1,r1,STACKFRAMESIZE + + andi. r5,r5,63 + +.Lcopy_tail_doublewords: /* Up to 127 bytes to go */ + srdi. r6,r5,3 + beq .Lcopy_tail_word - .globl src_error_1 -src_error_1: - li r6,0 - subi r5,r5,2 -95: sth r6,4(r4) - addi r4,r4,2 - srwi. r6,r5,2 - beq 3f mtctr r6 - .globl src_error_2 -src_error_2: - li r6,0 -96: stwu r6,4(r4) - bdnz 96b -3: andi. r5,r5,3 - beq src_error - .globl src_error_3 -src_error_3: - li r6,0 - mtctr r5 - addi r4,r4,3 -97: stbu r6,1(r4) - bdnz 97b - .globl src_error -src_error: +3: +source; ld r6,0(r3) + addi r3,r3,8 + adde r0,r0,r6 +dest; std r6,0(r4) + addi r4,r4,8 + bdnz 3b + + andi. r5,r5,7 + +.Lcopy_tail_word: /* Up to 7 bytes to go */ + srdi. r6,r5,2 + beq .Lcopy_tail_halfword + +source; lwz r6,0(r3) + addi r3,r3,4 + adde r0,r0,r6 +dest; stw r6,0(r4) + addi r4,r4,4 + subi r5,r5,4 + +.Lcopy_tail_halfword: /* Up to 3 bytes to go */ + srdi. r6,r5,1 + beq .Lcopy_tail_byte + +source; lhz r6,0(r3) + addi r3,r3,2 + adde r0,r0,r6 +dest; sth r6,0(r4) + addi r4,r4,2 + subi r5,r5,2 + +.Lcopy_tail_byte: /* Up to 1 byte to go */ + andi. r6,r5,1 + beq .Lcopy_finish + +source; lbz r6,0(r3) + sldi r9,r6,8 /* Pad the byte out to 16 bits */ + adde r0,r0,r9 +dest; stb r6,0(r4) + +.Lcopy_finish: + addze r0,r0 /* add in final carry */ + rldicl r4,r0,32,0 /* fold two 32 bit halves together */ + add r3,r4,r0 + srdi r3,r3,32 + blr + +.Lsrc_error: cmpdi 0,r7,0 - beq 1f + beqlr li r6,-EFAULT stw r6,0(r7) -1: addze r3,r0 blr - .globl dst_error -dst_error: +.Ldest_error: cmpdi 0,r8,0 - beq 1f + beqlr li r6,-EFAULT stw r6,0(r8) -1: addze r3,r0 blr - -.section __ex_table,"a" - .align 3 - .llong 81b,src_error_1 - .llong 91b,dst_error - .llong 82b,src_error_2 - .llong 92b,dst_error - .llong 83b,src_error_3 - .llong 93b,dst_error - .llong 84b,src_error_3 - .llong 94b,dst_error - .llong 95b,dst_error - .llong 96b,dst_error - .llong 97b,dst_error diff -uprN linux-2.6.32/arch/powerpc/lib/checksum_wrappers_64.c linux-2.6.32.ovz/arch/powerpc/lib/checksum_wrappers_64.c --- linux-2.6.32/arch/powerpc/lib/checksum_wrappers_64.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/checksum_wrappers_64.c 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,102 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2010 + * + * Author: Anton Blanchard + */ +#include +#include +#include +#include +#include + +__wsum csum_and_copy_from_user(const void __user *src, void *dst, + int len, __wsum sum, int *err_ptr) +{ + unsigned int csum; + + might_sleep(); + + *err_ptr = 0; + + if (!len) { + csum = 0; + goto out; + } + + if (unlikely((len < 0) || !access_ok(VERIFY_READ, src, len))) { + *err_ptr = -EFAULT; + csum = (__force unsigned int)sum; + goto out; + } + + csum = csum_partial_copy_generic((void __force *)src, dst, + len, sum, err_ptr, NULL); + + if (unlikely(*err_ptr)) { + int missing = __copy_from_user(dst, src, len); + + if (missing) { + memset(dst + len - missing, 0, missing); + *err_ptr = -EFAULT; + } else { + *err_ptr = 0; + } + + csum = csum_partial(dst, len, sum); + } + +out: + return (__force __wsum)csum; +} +EXPORT_SYMBOL(csum_and_copy_from_user); + +__wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, + __wsum sum, int *err_ptr) +{ + unsigned int csum; + + might_sleep(); + + *err_ptr = 0; + + if (!len) { + csum = 0; + goto out; + } + + if (unlikely((len < 0) || !access_ok(VERIFY_WRITE, dst, len))) { + *err_ptr = -EFAULT; + csum = -1; /* invalid checksum */ + goto out; + } + + csum = csum_partial_copy_generic(src, (void __force *)dst, + len, sum, NULL, err_ptr); + + if (unlikely(*err_ptr)) { + csum = csum_partial(src, len, sum); + + if (copy_to_user(dst, src, len)) { + *err_ptr = -EFAULT; + csum = -1; /* invalid checksum */ + } + } + +out: + return (__force __wsum)csum; +} +EXPORT_SYMBOL(csum_and_copy_to_user); diff -uprN linux-2.6.32/arch/powerpc/lib/copypage_64.S linux-2.6.32.ovz/arch/powerpc/lib/copypage_64.S --- linux-2.6.32/arch/powerpc/lib/copypage_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/copypage_64.S 2015-03-10 13:23:55.000000000 -0700 @@ -6,6 +6,7 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#include #include #include #include @@ -15,9 +16,13 @@ PPC64_CACHES: .tc ppc64_caches[TC],ppc64_caches .section ".text" - -_GLOBAL(copy_4K_page) - li r5,4096 /* 4K page size */ +_GLOBAL(copy_page) +BEGIN_FTR_SECTION + lis r5,PAGE_SIZE@h +FTR_SECTION_ELSE + b .copypage_power7 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) + ori r5,r5,PAGE_SIZE@l BEGIN_FTR_SECTION ld r10,PPC64_CACHES@toc(r2) lwz r11,DCACHEL1LOGLINESIZE(r10) /* log2 of cache line size */ diff -uprN linux-2.6.32/arch/powerpc/lib/copypage_power7.S linux-2.6.32.ovz/arch/powerpc/lib/copypage_power7.S --- linux-2.6.32/arch/powerpc/lib/copypage_power7.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/copypage_power7.S 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,168 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard + */ +#include +#include + +#define STACKFRAMESIZE 256 +#define STK_REG(i) (112 + ((i)-14)*8) + +_GLOBAL(copypage_power7) + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. Since source and destination are page + * aligned we don't need to clear the bottom 7 bits of either + * address. + */ + ori r9,r3,1 /* stream=1 */ + +#ifdef CONFIG_PPC_64K_PAGES + lis r7,0x0E01 /* depth=7, units=512 */ +#else + lis r7,0x0E00 /* depth=7 */ + ori r7,r7,0x1000 /* units=32 */ +#endif + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + +.machine push +.machine "power4" + dcbt r0,r4,0b01000 + dcbt r0,r7,0b01010 + dcbtst r0,r9,0b01000 + dcbtst r0,r10,0b01010 + eieio + dcbt r0,r8,0b01010 /* GO */ +.machine pop + +#ifdef CONFIG_ALTIVEC + mflr r0 + std r3,48(r1) + std r4,56(r1) + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl .enter_vmx_copy + cmpwi r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STACKFRAMESIZE+48(r1) + ld r4,STACKFRAMESIZE+56(r1) + mtlr r0 + + li r0,(PAGE_SIZE/128) + mtctr r0 + + beq .Lnonvmx_copy + + addi r1,r1,STACKFRAMESIZE + + li r6,16 + li r7,32 + li r8,48 + li r9,64 + li r10,80 + li r11,96 + li r12,112 + + .align 5 +1: lvx vr7,r0,r4 + lvx vr6,r4,r6 + lvx vr5,r4,r7 + lvx vr4,r4,r8 + lvx vr3,r4,r9 + lvx vr2,r4,r10 + lvx vr1,r4,r11 + lvx vr0,r4,r12 + addi r4,r4,128 + stvx vr7,r0,r3 + stvx vr6,r3,r6 + stvx vr5,r3,r7 + stvx vr4,r3,r8 + stvx vr3,r3,r9 + stvx vr2,r3,r10 + stvx vr1,r3,r11 + stvx vr0,r3,r12 + addi r3,r3,128 + bdnz 1b + + b .exit_vmx_copy /* tail call optimise */ + +#else + li r0,(PAGE_SIZE/128) + mtctr r0 + + stdu r1,-STACKFRAMESIZE(r1) +#endif + +.Lnonvmx_copy: + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + std r17,STK_REG(r17)(r1) + std r18,STK_REG(r18)(r1) + std r19,STK_REG(r19)(r1) + std r20,STK_REG(r20)(r1) + +1: ld r0,0(r4) + ld r5,8(r4) + ld r6,16(r4) + ld r7,24(r4) + ld r8,32(r4) + ld r9,40(r4) + ld r10,48(r4) + ld r11,56(r4) + ld r12,64(r4) + ld r14,72(r4) + ld r15,80(r4) + ld r16,88(r4) + ld r17,96(r4) + ld r18,104(r4) + ld r19,112(r4) + ld r20,120(r4) + addi r4,r4,128 + std r0,0(r3) + std r5,8(r3) + std r6,16(r3) + std r7,24(r3) + std r8,32(r3) + std r9,40(r3) + std r10,48(r3) + std r11,56(r3) + std r12,64(r3) + std r14,72(r3) + std r15,80(r3) + std r16,88(r3) + std r17,96(r3) + std r18,104(r3) + std r19,112(r3) + std r20,120(r3) + addi r3,r3,128 + bdnz 1b + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + ld r17,STK_REG(r17)(r1) + ld r18,STK_REG(r18)(r1) + ld r19,STK_REG(r19)(r1) + ld r20,STK_REG(r20)(r1) + addi r1,r1,STACKFRAMESIZE + blr diff -uprN linux-2.6.32/arch/powerpc/lib/copyuser_64.S linux-2.6.32.ovz/arch/powerpc/lib/copyuser_64.S --- linux-2.6.32/arch/powerpc/lib/copyuser_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/copyuser_64.S 2015-03-10 13:22:52.000000000 -0700 @@ -11,6 +11,12 @@ .align 7 _GLOBAL(__copy_tofrom_user) +BEGIN_FTR_SECTION + nop +FTR_SECTION_ELSE + b __copy_tofrom_user_power7 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) +_GLOBAL(__copy_tofrom_user_base) /* first check for a whole page copy on a page boundary */ cmpldi cr1,r5,16 cmpdi cr6,r5,4096 @@ -44,37 +50,55 @@ BEGIN_FTR_SECTION andi. r0,r4,7 bne .Lsrc_unaligned END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_LD_STD) - srdi r7,r5,4 -20: ld r9,0(r4) - addi r4,r4,-8 - mtctr r7 - andi. r5,r5,7 - bf cr7*4+0,22f - addi r3,r3,8 - addi r4,r4,8 - mr r8,r9 - blt cr1,72f -21: ld r9,8(r4) -70: std r8,8(r3) -22: ldu r8,16(r4) -71: stdu r9,16(r3) + blt cr1,.Ldo_tail /* if < 16 bytes to copy */ + srdi r0,r5,5 + cmpdi cr1,r0,0 +20: ld r7,0(r4) +220: ld r6,8(r4) + addi r4,r4,16 + mtctr r0 + andi. r0,r5,0x10 + beq 22f + addi r3,r3,16 + addi r4,r4,-16 + mr r9,r7 + mr r8,r6 + beq cr1,72f +21: ld r7,16(r4) +221: ld r6,24(r4) + addi r4,r4,32 +70: std r9,0(r3) +270: std r8,8(r3) +22: ld r9,0(r4) +222: ld r8,8(r4) +71: std r7,16(r3) +271: std r6,24(r3) + addi r3,r3,32 bdnz 21b -72: std r8,8(r3) +72: std r9,0(r3) +272: std r8,8(r3) + andi. r5,r5,0xf beq+ 3f - addi r3,r3,16 + addi r4,r4,16 .Ldo_tail: - bf cr7*4+1,1f -23: lwz r9,8(r4) + addi r3,r3,16 + bf cr7*4+0,246f +244: ld r9,0(r4) + addi r4,r4,8 +245: std r9,0(r3) + addi r3,r3,8 +246: bf cr7*4+1,1f +23: lwz r9,0(r4) addi r4,r4,4 73: stw r9,0(r3) addi r3,r3,4 1: bf cr7*4+2,2f -44: lhz r9,8(r4) +44: lhz r9,0(r4) addi r4,r4,2 74: sth r9,0(r3) addi r3,r3,2 2: bf cr7*4+3,3f -45: lbz r9,8(r4) +45: lbz r9,0(r4) 75: stb r9,0(r3) 3: li r3,0 blr @@ -220,7 +244,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_ 131: addi r3,r3,8 120: +320: 122: +322: 124: 125: 126: @@ -229,9 +255,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_ 129: 133: addi r3,r3,8 -121: 132: addi r3,r3,8 +121: +321: +344: 134: 135: 138: @@ -303,18 +331,22 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_ 183: add r3,r3,r7 b 1f +371: 180: addi r3,r3,8 171: 177: addi r3,r3,8 -170: -172: +370: +372: 176: 178: addi r3,r3,4 185: addi r3,r3,4 +170: +172: +345: 173: 174: 175: @@ -341,11 +373,19 @@ END_FTR_SECTION_IFCLR(CPU_FTR_UNALIGNED_ .section __ex_table,"a" .align 3 .llong 20b,120b + .llong 220b,320b .llong 21b,121b + .llong 221b,321b .llong 70b,170b + .llong 270b,370b .llong 22b,122b + .llong 222b,322b .llong 71b,171b + .llong 271b,371b .llong 72b,172b + .llong 272b,372b + .llong 244b,344b + .llong 245b,345b .llong 23b,123b .llong 73b,173b .llong 44b,144b diff -uprN linux-2.6.32/arch/powerpc/lib/copyuser_power7.S linux-2.6.32.ovz/arch/powerpc/lib/copyuser_power7.S --- linux-2.6.32/arch/powerpc/lib/copyuser_power7.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/copyuser_power7.S 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,714 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2011 + * + * Author: Anton Blanchard + */ +#include + +#define STACKFRAMESIZE 256 +#define STK_REG(i) (112 + ((i)-14)*8) + + .macro err1 +100: + .section __ex_table,"a" + .align 3 + .llong 100b,.Ldo_err1 + .previous + .endm + + .macro err2 +200: + .section __ex_table,"a" + .align 3 + .llong 200b,.Ldo_err2 + .previous + .endm + +#ifdef CONFIG_ALTIVEC + .macro err3 +300: + .section __ex_table,"a" + .align 3 + .llong 300b,.Ldo_err3 + .previous + .endm + + .macro err4 +400: + .section __ex_table,"a" + .align 3 + .llong 400b,.Ldo_err4 + .previous + .endm + + +.Ldo_err4: + ld r16,STK_REG(r16)(r1) + ld r15,STK_REG(r15)(r1) + ld r14,STK_REG(r14)(r1) +.Ldo_err3: + bl .exit_vmx_usercopy + ld r0,STACKFRAMESIZE+16(r1) + mtlr r0 + b .Lexit +#endif /* CONFIG_ALTIVEC */ + +.Ldo_err2: + ld r22,STK_REG(r22)(r1) + ld r21,STK_REG(r21)(r1) + ld r20,STK_REG(r20)(r1) + ld r19,STK_REG(r19)(r1) + ld r18,STK_REG(r18)(r1) + ld r17,STK_REG(r17)(r1) + ld r16,STK_REG(r16)(r1) + ld r15,STK_REG(r15)(r1) + ld r14,STK_REG(r14)(r1) +.Lexit: + addi r1,r1,STACKFRAMESIZE +.Ldo_err1: + ld r3,48(r1) + ld r4,56(r1) + ld r5,64(r1) + b __copy_tofrom_user_base + + +_GLOBAL(__copy_tofrom_user_power7) +#ifdef CONFIG_ALTIVEC + cmpldi r5,16 + cmpldi cr1,r5,4096 + + std r3,48(r1) + std r4,56(r1) + std r5,64(r1) + + blt .Lshort_copy + bgt cr1,.Lvmx_copy +#else + cmpldi r5,16 + + std r3,48(r1) + std r4,56(r1) + std r5,64(r1) + + blt .Lshort_copy +#endif + +.Lnonvmx_copy: + /* Get the source 8B aligned */ + neg r6,r4 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + bf cr7*4+3,1f +err1; lbz r0,0(r4) + addi r4,r4,1 +err1; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err1; lhz r0,0(r4) + addi r4,r4,2 +err1; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err1; lwz r0,0(r4) + addi r4,r4,4 +err1; stw r0,0(r3) + addi r3,r3,4 + +3: sub r5,r5,r6 + cmpldi r5,128 + blt 5f + + mflr r0 + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + std r17,STK_REG(r17)(r1) + std r18,STK_REG(r18)(r1) + std r19,STK_REG(r19)(r1) + std r20,STK_REG(r20)(r1) + std r21,STK_REG(r21)(r1) + std r22,STK_REG(r22)(r1) + std r0,STACKFRAMESIZE+16(r1) + + srdi r6,r5,7 + mtctr r6 + + /* Now do cacheline (128B) sized loads and stores. */ + .align 5 +4: +err2; ld r0,0(r4) +err2; ld r6,8(r4) +err2; ld r7,16(r4) +err2; ld r8,24(r4) +err2; ld r9,32(r4) +err2; ld r10,40(r4) +err2; ld r11,48(r4) +err2; ld r12,56(r4) +err2; ld r14,64(r4) +err2; ld r15,72(r4) +err2; ld r16,80(r4) +err2; ld r17,88(r4) +err2; ld r18,96(r4) +err2; ld r19,104(r4) +err2; ld r20,112(r4) +err2; ld r21,120(r4) + addi r4,r4,128 +err2; std r0,0(r3) +err2; std r6,8(r3) +err2; std r7,16(r3) +err2; std r8,24(r3) +err2; std r9,32(r3) +err2; std r10,40(r3) +err2; std r11,48(r3) +err2; std r12,56(r3) +err2; std r14,64(r3) +err2; std r15,72(r3) +err2; std r16,80(r3) +err2; std r17,88(r3) +err2; std r18,96(r3) +err2; std r19,104(r3) +err2; std r20,112(r3) +err2; std r21,120(r3) + addi r3,r3,128 + bdnz 4b + + clrldi r5,r5,(64-7) + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + ld r17,STK_REG(r17)(r1) + ld r18,STK_REG(r18)(r1) + ld r19,STK_REG(r19)(r1) + ld r20,STK_REG(r20)(r1) + ld r21,STK_REG(r21)(r1) + ld r22,STK_REG(r22)(r1) + addi r1,r1,STACKFRAMESIZE + + /* Up to 127B to go */ +5: srdi r6,r5,4 + mtocrf 0x01,r6 + +6: bf cr7*4+1,7f +err1; ld r0,0(r4) +err1; ld r6,8(r4) +err1; ld r7,16(r4) +err1; ld r8,24(r4) +err1; ld r9,32(r4) +err1; ld r10,40(r4) +err1; ld r11,48(r4) +err1; ld r12,56(r4) + addi r4,r4,64 +err1; std r0,0(r3) +err1; std r6,8(r3) +err1; std r7,16(r3) +err1; std r8,24(r3) +err1; std r9,32(r3) +err1; std r10,40(r3) +err1; std r11,48(r3) +err1; std r12,56(r3) + addi r3,r3,64 + + /* Up to 63B to go */ +7: bf cr7*4+2,8f +err1; ld r0,0(r4) +err1; ld r6,8(r4) +err1; ld r7,16(r4) +err1; ld r8,24(r4) + addi r4,r4,32 +err1; std r0,0(r3) +err1; std r6,8(r3) +err1; std r7,16(r3) +err1; std r8,24(r3) + addi r3,r3,32 + + /* Up to 31B to go */ +8: bf cr7*4+3,9f +err1; ld r0,0(r4) +err1; ld r6,8(r4) + addi r4,r4,16 +err1; std r0,0(r3) +err1; std r6,8(r3) + addi r3,r3,16 + +9: clrldi r5,r5,(64-4) + + /* Up to 15B to go */ +.Lshort_copy: + mtocrf 0x01,r5 + bf cr7*4+0,12f +err1; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err1; lwz r6,4(r4) + addi r4,r4,8 +err1; stw r0,0(r3) +err1; stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err1; lwz r0,0(r4) + addi r4,r4,4 +err1; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err1; lhz r0,0(r4) + addi r4,r4,2 +err1; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err1; lbz r0,0(r4) +err1; stb r0,0(r3) + +15: li r3,0 + blr + +.Lunwind_stack_nonvmx_copy: + addi r1,r1,STACKFRAMESIZE + b .Lnonvmx_copy + +#ifdef CONFIG_ALTIVEC +.Lvmx_copy: + mflr r0 + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl .enter_vmx_usercopy + cmpwi cr1,r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STACKFRAMESIZE+48(r1) + ld r4,STACKFRAMESIZE+56(r1) + ld r5,STACKFRAMESIZE+64(r1) + mtlr r0 + + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. + */ + clrrdi r6,r4,7 + clrrdi r9,r3,7 + ori r9,r9,1 /* stream=1 */ + + srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */ + cmpldi r7,0x3FF + ble 1f + li r7,0x3FF +1: lis r0,0x0E00 /* depth=7 */ + sldi r7,r7,7 + or r7,r7,r0 + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + +.machine push +.machine "power4" + dcbt r0,r6,0b01000 + dcbt r0,r7,0b01010 + dcbtst r0,r9,0b01000 + dcbtst r0,r10,0b01010 + eieio + dcbt r0,r8,0b01010 /* GO */ +.machine pop + + beq cr1,.Lunwind_stack_nonvmx_copy + + /* + * If source and destination are not relatively aligned we use a + * slower permute loop. + */ + xor r6,r4,r3 + rldicl. r6,r6,0,(64-4) + bne .Lvmx_unaligned_copy + + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f +err3; lbz r0,0(r4) + addi r4,r4,1 +err3; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f +err3; ld r0,0(r4) + addi r4,r4,8 +err3; std r0,0(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + bf cr7*4+3,5f +err3; lvx vr1,r0,r4 + addi r4,r4,16 +err3; stvx vr1,r0,r3 + addi r3,r3,16 + +5: bf cr7*4+2,6f +err3; lvx vr1,r0,r4 +err3; lvx vr0,r4,r9 + addi r4,r4,32 +err3; stvx vr1,r0,r3 +err3; stvx vr0,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f +err3; lvx vr3,r0,r4 +err3; lvx vr2,r4,r9 +err3; lvx vr1,r4,r10 +err3; lvx vr0,r4,r11 + addi r4,r4,64 +err3; stvx vr3,r0,r3 +err3; stvx vr2,r3,r9 +err3; stvx vr1,r3,r10 +err3; stvx vr0,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: +err4; lvx vr7,r0,r4 +err4; lvx vr6,r4,r9 +err4; lvx vr5,r4,r10 +err4; lvx vr4,r4,r11 +err4; lvx vr3,r4,r12 +err4; lvx vr2,r4,r14 +err4; lvx vr1,r4,r15 +err4; lvx vr0,r4,r16 + addi r4,r4,128 +err4; stvx vr7,r0,r3 +err4; stvx vr6,r3,r9 +err4; stvx vr5,r3,r10 +err4; stvx vr4,r3,r11 +err4; stvx vr3,r3,r12 +err4; stvx vr2,r3,r14 +err4; stvx vr1,r3,r15 +err4; stvx vr0,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f +err3; lvx vr3,r0,r4 +err3; lvx vr2,r4,r9 +err3; lvx vr1,r4,r10 +err3; lvx vr0,r4,r11 + addi r4,r4,64 +err3; stvx vr3,r0,r3 +err3; stvx vr2,r3,r9 +err3; stvx vr1,r3,r10 +err3; stvx vr0,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f +err3; lvx vr1,r0,r4 +err3; lvx vr0,r4,r9 + addi r4,r4,32 +err3; stvx vr1,r0,r3 +err3; stvx vr0,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f +err3; lvx vr1,r0,r4 + addi r4,r4,16 +err3; stvx vr1,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + mtocrf 0x01,r5 + bf cr7*4+0,12f +err3; ld r0,0(r4) + addi r4,r4,8 +err3; std r0,0(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err3; lbz r0,0(r4) +err3; stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + b .exit_vmx_usercopy /* tail call optimise */ + +.Lvmx_unaligned_copy: + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f +err3; lbz r0,0(r4) + addi r4,r4,1 +err3; stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f +err3; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err3; lwz r7,4(r4) + addi r4,r4,8 +err3; stw r0,0(r3) +err3; stw r7,4(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + lvsl vr16,0,r4 /* Setup permute control vector */ +err3; lvx vr0,0,r4 + addi r4,r4,16 + + bf cr7*4+3,5f +err3; lvx vr1,r0,r4 + vperm vr8,vr0,vr1,vr16 + addi r4,r4,16 +err3; stvx vr8,r0,r3 + addi r3,r3,16 + vor vr0,vr1,vr1 + +5: bf cr7*4+2,6f +err3; lvx vr1,r0,r4 + vperm vr8,vr0,vr1,vr16 +err3; lvx vr0,r4,r9 + vperm vr9,vr1,vr0,vr16 + addi r4,r4,32 +err3; stvx vr8,r0,r3 +err3; stvx vr9,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f +err3; lvx vr3,r0,r4 + vperm vr8,vr0,vr3,vr16 +err3; lvx vr2,r4,r9 + vperm vr9,vr3,vr2,vr16 +err3; lvx vr1,r4,r10 + vperm vr10,vr2,vr1,vr16 +err3; lvx vr0,r4,r11 + vperm vr11,vr1,vr0,vr16 + addi r4,r4,64 +err3; stvx vr8,r0,r3 +err3; stvx vr9,r3,r9 +err3; stvx vr10,r3,r10 +err3; stvx vr11,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: +err4; lvx vr7,r0,r4 + vperm vr8,vr0,vr7,vr16 +err4; lvx vr6,r4,r9 + vperm vr9,vr7,vr6,vr16 +err4; lvx vr5,r4,r10 + vperm vr10,vr6,vr5,vr16 +err4; lvx vr4,r4,r11 + vperm vr11,vr5,vr4,vr16 +err4; lvx vr3,r4,r12 + vperm vr12,vr4,vr3,vr16 +err4; lvx vr2,r4,r14 + vperm vr13,vr3,vr2,vr16 +err4; lvx vr1,r4,r15 + vperm vr14,vr2,vr1,vr16 +err4; lvx vr0,r4,r16 + vperm vr15,vr1,vr0,vr16 + addi r4,r4,128 +err4; stvx vr8,r0,r3 +err4; stvx vr9,r3,r9 +err4; stvx vr10,r3,r10 +err4; stvx vr11,r3,r11 +err4; stvx vr12,r3,r12 +err4; stvx vr13,r3,r14 +err4; stvx vr14,r3,r15 +err4; stvx vr15,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f +err3; lvx vr3,r0,r4 + vperm vr8,vr0,vr3,vr16 +err3; lvx vr2,r4,r9 + vperm vr9,vr3,vr2,vr16 +err3; lvx vr1,r4,r10 + vperm vr10,vr2,vr1,vr16 +err3; lvx vr0,r4,r11 + vperm vr11,vr1,vr0,vr16 + addi r4,r4,64 +err3; stvx vr8,r0,r3 +err3; stvx vr9,r3,r9 +err3; stvx vr10,r3,r10 +err3; stvx vr11,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f +err3; lvx vr1,r0,r4 + vperm vr8,vr0,vr1,vr16 +err3; lvx vr0,r4,r9 + vperm vr9,vr1,vr0,vr16 + addi r4,r4,32 +err3; stvx vr8,r0,r3 +err3; stvx vr9,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f +err3; lvx vr1,r0,r4 + vperm vr8,vr0,vr1,vr16 + addi r4,r4,16 +err3; stvx vr8,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + addi r4,r4,-16 /* Unwind the +16 load offset */ + mtocrf 0x01,r5 + bf cr7*4+0,12f +err3; lwz r0,0(r4) /* Less chance of a reject with word ops */ +err3; lwz r6,4(r4) + addi r4,r4,8 +err3; stw r0,0(r3) +err3; stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f +err3; lwz r0,0(r4) + addi r4,r4,4 +err3; stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f +err3; lhz r0,0(r4) + addi r4,r4,2 +err3; sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f +err3; lbz r0,0(r4) +err3; stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + b .exit_vmx_usercopy /* tail call optimise */ +#endif /* CONFiG_ALTIVEC */ diff -uprN linux-2.6.32/arch/powerpc/lib/locks.c linux-2.6.32.ovz/arch/powerpc/lib/locks.c --- linux-2.6.32/arch/powerpc/lib/locks.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/locks.c 2015-03-10 13:24:15.000000000 -0700 @@ -84,12 +84,16 @@ void __rw_yield(raw_rwlock_t *rw) void __raw_spin_unlock_wait(raw_spinlock_t *lock) { + smp_mb(); + while (lock->slock) { HMT_low(); if (SHARED_PROCESSOR) __spin_yield(lock); } HMT_medium(); + + smp_mb(); } EXPORT_SYMBOL(__raw_spin_unlock_wait); diff -uprN linux-2.6.32/arch/powerpc/lib/Makefile linux-2.6.32.ovz/arch/powerpc/lib/Makefile --- linux-2.6.32/arch/powerpc/lib/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/Makefile 2015-03-10 13:23:55.000000000 -0700 @@ -17,12 +17,15 @@ obj-$(CONFIG_PPC32) += div64.o copy_32.o obj-$(CONFIG_HAS_IOMEM) += devres.o obj-$(CONFIG_PPC64) += copypage_64.o copyuser_64.o \ - memcpy_64.o usercopy_64.o mem_64.o string.o + memcpy_64.o usercopy_64.o mem_64.o string.o \ + checksum_wrappers_64.o copyuser_power7.o \ + copypage_power7.o memcpy_power7.o obj-$(CONFIG_XMON) += sstep.o obj-$(CONFIG_KPROBES) += sstep.o ifeq ($(CONFIG_PPC64),y) obj-$(CONFIG_SMP) += locks.o +obj-$(CONFIG_ALTIVEC) += vmx-helper.o endif obj-$(CONFIG_PPC_LIB_RHEAP) += rheap.o diff -uprN linux-2.6.32/arch/powerpc/lib/memcpy_64.S linux-2.6.32.ovz/arch/powerpc/lib/memcpy_64.S --- linux-2.6.32/arch/powerpc/lib/memcpy_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/memcpy_64.S 2015-03-10 13:23:55.000000000 -0700 @@ -11,7 +11,11 @@ .align 7 _GLOBAL(memcpy) +BEGIN_FTR_SECTION std r3,48(r1) /* save destination pointer for return value */ +FTR_SECTION_ELSE + b memcpy_power7 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) PPC_MTOCRF 0x01,r5 cmpldi cr1,r5,16 neg r6,r3 # LS 3 bits = # bytes to 8-byte dest bdry diff -uprN linux-2.6.32/arch/powerpc/lib/memcpy_power7.S linux-2.6.32.ovz/arch/powerpc/lib/memcpy_power7.S --- linux-2.6.32/arch/powerpc/lib/memcpy_power7.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/memcpy_power7.S 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,650 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2012 + * + * Author: Anton Blanchard + */ +#include + +#define STACKFRAMESIZE 256 +#define STK_REG(i) (112 + ((i)-14)*8) + +_GLOBAL(memcpy_power7) +#ifdef CONFIG_ALTIVEC + cmpldi r5,16 + cmpldi cr1,r5,4096 + + std r3,48(r1) + + blt .Lshort_copy + bgt cr1,.Lvmx_copy +#else + cmpldi r5,16 + + std r3,48(r1) + + blt .Lshort_copy +#endif + +.Lnonvmx_copy: + /* Get the source 8B aligned */ + neg r6,r4 + mtocrf 0x01,r6 + clrldi r6,r6,(64-3) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: sub r5,r5,r6 + cmpldi r5,128 + blt 5f + + mflr r0 + stdu r1,-STACKFRAMESIZE(r1) + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + std r17,STK_REG(r17)(r1) + std r18,STK_REG(r18)(r1) + std r19,STK_REG(r19)(r1) + std r20,STK_REG(r20)(r1) + std r21,STK_REG(r21)(r1) + std r22,STK_REG(r22)(r1) + std r0,STACKFRAMESIZE+16(r1) + + srdi r6,r5,7 + mtctr r6 + + /* Now do cacheline (128B) sized loads and stores. */ + .align 5 +4: + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + ld r9,32(r4) + ld r10,40(r4) + ld r11,48(r4) + ld r12,56(r4) + ld r14,64(r4) + ld r15,72(r4) + ld r16,80(r4) + ld r17,88(r4) + ld r18,96(r4) + ld r19,104(r4) + ld r20,112(r4) + ld r21,120(r4) + addi r4,r4,128 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + std r9,32(r3) + std r10,40(r3) + std r11,48(r3) + std r12,56(r3) + std r14,64(r3) + std r15,72(r3) + std r16,80(r3) + std r17,88(r3) + std r18,96(r3) + std r19,104(r3) + std r20,112(r3) + std r21,120(r3) + addi r3,r3,128 + bdnz 4b + + clrldi r5,r5,(64-7) + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + ld r17,STK_REG(r17)(r1) + ld r18,STK_REG(r18)(r1) + ld r19,STK_REG(r19)(r1) + ld r20,STK_REG(r20)(r1) + ld r21,STK_REG(r21)(r1) + ld r22,STK_REG(r22)(r1) + addi r1,r1,STACKFRAMESIZE + + /* Up to 127B to go */ +5: srdi r6,r5,4 + mtocrf 0x01,r6 + +6: bf cr7*4+1,7f + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + ld r9,32(r4) + ld r10,40(r4) + ld r11,48(r4) + ld r12,56(r4) + addi r4,r4,64 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + std r9,32(r3) + std r10,40(r3) + std r11,48(r3) + std r12,56(r3) + addi r3,r3,64 + + /* Up to 63B to go */ +7: bf cr7*4+2,8f + ld r0,0(r4) + ld r6,8(r4) + ld r7,16(r4) + ld r8,24(r4) + addi r4,r4,32 + std r0,0(r3) + std r6,8(r3) + std r7,16(r3) + std r8,24(r3) + addi r3,r3,32 + + /* Up to 31B to go */ +8: bf cr7*4+3,9f + ld r0,0(r4) + ld r6,8(r4) + addi r4,r4,16 + std r0,0(r3) + std r6,8(r3) + addi r3,r3,16 + +9: clrldi r5,r5,(64-4) + + /* Up to 15B to go */ +.Lshort_copy: + mtocrf 0x01,r5 + bf cr7*4+0,12f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r6,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: ld r3,48(r1) + blr + +.Lunwind_stack_nonvmx_copy: + addi r1,r1,STACKFRAMESIZE + b .Lnonvmx_copy + +#ifdef CONFIG_ALTIVEC +.Lvmx_copy: + mflr r0 + std r4,56(r1) + std r5,64(r1) + std r0,16(r1) + stdu r1,-STACKFRAMESIZE(r1) + bl .enter_vmx_copy + cmpwi cr1,r3,0 + ld r0,STACKFRAMESIZE+16(r1) + ld r3,STACKFRAMESIZE+48(r1) + ld r4,STACKFRAMESIZE+56(r1) + ld r5,STACKFRAMESIZE+64(r1) + mtlr r0 + + /* + * We prefetch both the source and destination using enhanced touch + * instructions. We use a stream ID of 0 for the load side and + * 1 for the store side. + */ + clrrdi r6,r4,7 + clrrdi r9,r3,7 + ori r9,r9,1 /* stream=1 */ + + srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */ + cmpldi r7,0x3FF + ble 1f + li r7,0x3FF +1: lis r0,0x0E00 /* depth=7 */ + sldi r7,r7,7 + or r7,r7,r0 + ori r10,r7,1 /* stream=1 */ + + lis r8,0x8000 /* GO=1 */ + clrldi r8,r8,32 + +.machine push +.machine "power4" + dcbt r0,r6,0b01000 + dcbt r0,r7,0b01010 + dcbtst r0,r9,0b01000 + dcbtst r0,r10,0b01010 + eieio + dcbt r0,r8,0b01010 /* GO */ +.machine pop + + beq cr1,.Lunwind_stack_nonvmx_copy + + /* + * If source and destination are not relatively aligned we use a + * slower permute loop. + */ + xor r6,r4,r3 + rldicl. r6,r6,0,(64-4) + bne .Lvmx_unaligned_copy + + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f + ld r0,0(r4) + addi r4,r4,8 + std r0,0(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + bf cr7*4+3,5f + lvx vr1,r0,r4 + addi r4,r4,16 + stvx vr1,r0,r3 + addi r3,r3,16 + +5: bf cr7*4+2,6f + lvx vr1,r0,r4 + lvx vr0,r4,r9 + addi r4,r4,32 + stvx vr1,r0,r3 + stvx vr0,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f + lvx vr3,r0,r4 + lvx vr2,r4,r9 + lvx vr1,r4,r10 + lvx vr0,r4,r11 + addi r4,r4,64 + stvx vr3,r0,r3 + stvx vr2,r3,r9 + stvx vr1,r3,r10 + stvx vr0,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: + lvx vr7,r0,r4 + lvx vr6,r4,r9 + lvx vr5,r4,r10 + lvx vr4,r4,r11 + lvx vr3,r4,r12 + lvx vr2,r4,r14 + lvx vr1,r4,r15 + lvx vr0,r4,r16 + addi r4,r4,128 + stvx vr7,r0,r3 + stvx vr6,r3,r9 + stvx vr5,r3,r10 + stvx vr4,r3,r11 + stvx vr3,r3,r12 + stvx vr2,r3,r14 + stvx vr1,r3,r15 + stvx vr0,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f + lvx vr3,r0,r4 + lvx vr2,r4,r9 + lvx vr1,r4,r10 + lvx vr0,r4,r11 + addi r4,r4,64 + stvx vr3,r0,r3 + stvx vr2,r3,r9 + stvx vr1,r3,r10 + stvx vr0,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f + lvx vr1,r0,r4 + lvx vr0,r4,r9 + addi r4,r4,32 + stvx vr1,r0,r3 + stvx vr0,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f + lvx vr1,r0,r4 + addi r4,r4,16 + stvx vr1,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + mtocrf 0x01,r5 + bf cr7*4+0,12f + ld r0,0(r4) + addi r4,r4,8 + std r0,0(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + ld r3,48(r1) + b .exit_vmx_copy /* tail call optimise */ + +.Lvmx_unaligned_copy: + /* Get the destination 16B aligned */ + neg r6,r3 + mtocrf 0x01,r6 + clrldi r6,r6,(64-4) + + bf cr7*4+3,1f + lbz r0,0(r4) + addi r4,r4,1 + stb r0,0(r3) + addi r3,r3,1 + +1: bf cr7*4+2,2f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +2: bf cr7*4+1,3f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +3: bf cr7*4+0,4f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r7,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r7,4(r3) + addi r3,r3,8 + +4: sub r5,r5,r6 + + /* Get the desination 128B aligned */ + neg r6,r3 + srdi r7,r6,4 + mtocrf 0x01,r7 + clrldi r6,r6,(64-7) + + li r9,16 + li r10,32 + li r11,48 + + lvsl vr16,0,r4 /* Setup permute control vector */ + lvx vr0,0,r4 + addi r4,r4,16 + + bf cr7*4+3,5f + lvx vr1,r0,r4 + vperm vr8,vr0,vr1,vr16 + addi r4,r4,16 + stvx vr8,r0,r3 + addi r3,r3,16 + vor vr0,vr1,vr1 + +5: bf cr7*4+2,6f + lvx vr1,r0,r4 + vperm vr8,vr0,vr1,vr16 + lvx vr0,r4,r9 + vperm vr9,vr1,vr0,vr16 + addi r4,r4,32 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + addi r3,r3,32 + +6: bf cr7*4+1,7f + lvx vr3,r0,r4 + vperm vr8,vr0,vr3,vr16 + lvx vr2,r4,r9 + vperm vr9,vr3,vr2,vr16 + lvx vr1,r4,r10 + vperm vr10,vr2,vr1,vr16 + lvx vr0,r4,r11 + vperm vr11,vr1,vr0,vr16 + addi r4,r4,64 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + stvx vr10,r3,r10 + stvx vr11,r3,r11 + addi r3,r3,64 + +7: sub r5,r5,r6 + srdi r6,r5,7 + + std r14,STK_REG(r14)(r1) + std r15,STK_REG(r15)(r1) + std r16,STK_REG(r16)(r1) + + li r12,64 + li r14,80 + li r15,96 + li r16,112 + + mtctr r6 + + /* + * Now do cacheline sized loads and stores. By this stage the + * cacheline stores are also cacheline aligned. + */ + .align 5 +8: + lvx vr7,r0,r4 + vperm vr8,vr0,vr7,vr16 + lvx vr6,r4,r9 + vperm vr9,vr7,vr6,vr16 + lvx vr5,r4,r10 + vperm vr10,vr6,vr5,vr16 + lvx vr4,r4,r11 + vperm vr11,vr5,vr4,vr16 + lvx vr3,r4,r12 + vperm vr12,vr4,vr3,vr16 + lvx vr2,r4,r14 + vperm vr13,vr3,vr2,vr16 + lvx vr1,r4,r15 + vperm vr14,vr2,vr1,vr16 + lvx vr0,r4,r16 + vperm vr15,vr1,vr0,vr16 + addi r4,r4,128 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + stvx vr10,r3,r10 + stvx vr11,r3,r11 + stvx vr12,r3,r12 + stvx vr13,r3,r14 + stvx vr14,r3,r15 + stvx vr15,r3,r16 + addi r3,r3,128 + bdnz 8b + + ld r14,STK_REG(r14)(r1) + ld r15,STK_REG(r15)(r1) + ld r16,STK_REG(r16)(r1) + + /* Up to 127B to go */ + clrldi r5,r5,(64-7) + srdi r6,r5,4 + mtocrf 0x01,r6 + + bf cr7*4+1,9f + lvx vr3,r0,r4 + vperm vr8,vr0,vr3,vr16 + lvx vr2,r4,r9 + vperm vr9,vr3,vr2,vr16 + lvx vr1,r4,r10 + vperm vr10,vr2,vr1,vr16 + lvx vr0,r4,r11 + vperm vr11,vr1,vr0,vr16 + addi r4,r4,64 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + stvx vr10,r3,r10 + stvx vr11,r3,r11 + addi r3,r3,64 + +9: bf cr7*4+2,10f + lvx vr1,r0,r4 + vperm vr8,vr0,vr1,vr16 + lvx vr0,r4,r9 + vperm vr9,vr1,vr0,vr16 + addi r4,r4,32 + stvx vr8,r0,r3 + stvx vr9,r3,r9 + addi r3,r3,32 + +10: bf cr7*4+3,11f + lvx vr1,r0,r4 + vperm vr8,vr0,vr1,vr16 + addi r4,r4,16 + stvx vr8,r0,r3 + addi r3,r3,16 + + /* Up to 15B to go */ +11: clrldi r5,r5,(64-4) + addi r4,r4,-16 /* Unwind the +16 load offset */ + mtocrf 0x01,r5 + bf cr7*4+0,12f + lwz r0,0(r4) /* Less chance of a reject with word ops */ + lwz r6,4(r4) + addi r4,r4,8 + stw r0,0(r3) + stw r6,4(r3) + addi r3,r3,8 + +12: bf cr7*4+1,13f + lwz r0,0(r4) + addi r4,r4,4 + stw r0,0(r3) + addi r3,r3,4 + +13: bf cr7*4+2,14f + lhz r0,0(r4) + addi r4,r4,2 + sth r0,0(r3) + addi r3,r3,2 + +14: bf cr7*4+3,15f + lbz r0,0(r4) + stb r0,0(r3) + +15: addi r1,r1,STACKFRAMESIZE + ld r3,48(r1) + b .exit_vmx_copy /* tail call optimise */ +#endif /* CONFiG_ALTIVEC */ diff -uprN linux-2.6.32/arch/powerpc/lib/string.S linux-2.6.32.ovz/arch/powerpc/lib/string.S --- linux-2.6.32/arch/powerpc/lib/string.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/string.S 2015-03-10 13:21:32.000000000 -0700 @@ -28,7 +28,7 @@ _GLOBAL(strcpy) /* This clears out any unused part of the destination buffer, just as the libc version does. -- paulus */ _GLOBAL(strncpy) - cmpwi 0,r5,0 + PPC_LCMPI 0,r5,0 beqlr mtctr r5 addi r6,r3,-1 @@ -39,7 +39,7 @@ _GLOBAL(strncpy) bdnzf 2,1b /* dec ctr, branch if ctr != 0 && !cr0.eq */ bnelr /* if we didn't hit a null char, we're done */ mfctr r5 - cmpwi 0,r5,0 /* any space left in destination buffer? */ + PPC_LCMPI 0,r5,0 /* any space left in destination buffer? */ beqlr /* we know r0 == 0 here */ 2: stbu r0,1(r6) /* clear it out if so */ bdnz 2b @@ -70,8 +70,8 @@ _GLOBAL(strcmp) blr _GLOBAL(strncmp) - PPC_LCMPI r5,0 - beqlr + PPC_LCMPI 0,r5,0 + beq- 2f mtctr r5 addi r5,r3,-1 addi r4,r4,-1 @@ -82,6 +82,8 @@ _GLOBAL(strncmp) beqlr 1 bdnzt eq,1b blr +2: li r3,0 + blr _GLOBAL(strlen) addi r4,r3,-1 @@ -92,8 +94,8 @@ _GLOBAL(strlen) blr _GLOBAL(memcmp) - cmpwi 0,r5,0 - ble- 2f + PPC_LCMPI 0,r5,0 + beq- 2f mtctr r5 addi r6,r3,-1 addi r4,r4,-1 @@ -106,8 +108,8 @@ _GLOBAL(memcmp) blr _GLOBAL(memchr) - cmpwi 0,r5,0 - ble- 2f + PPC_LCMPI 0,r5,0 + beq- 2f mtctr r5 addi r3,r3,-1 1: lbzu r0,1(r3) diff -uprN linux-2.6.32/arch/powerpc/lib/vmx-helper.c linux-2.6.32.ovz/arch/powerpc/lib/vmx-helper.c --- linux-2.6.32/arch/powerpc/lib/vmx-helper.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/lib/vmx-helper.c 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,73 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2011 + * + * Authors: Sukadev Bhattiprolu + * Anton Blanchard + */ +#include +#include + +int enter_vmx_usercopy(void) +{ + if (in_interrupt()) + return 0; + + /* This acts as preempt_disable() as well and will make + * enable_kernel_altivec(). We need to disable page faults + * as they can call schedule and thus make us lose the VMX + * context. So on page faults, we just fail which will cause + * a fallback to the normal non-vmx copy. + */ + pagefault_disable(); + + enable_kernel_altivec(); + + return 1; +} + +/* + * This function must return 0 because we tail call optimise when calling + * from __copy_tofrom_user_power7 which returns 0 on success. + */ +int exit_vmx_usercopy(void) +{ + pagefault_enable(); + return 0; +} + +int enter_vmx_copy(void) +{ + if (in_interrupt()) + return 0; + + preempt_disable(); + + enable_kernel_altivec(); + + return 1; +} + +/* + * All calls to this function will be optimised into tail calls. We are + * passed a pointer to the destination which we return as required by a + * memcpy implementation. + */ +void *exit_vmx_copy(void *dest) +{ + preempt_enable(); + return dest; +} diff -uprN linux-2.6.32/arch/powerpc/Makefile linux-2.6.32.ovz/arch/powerpc/Makefile --- linux-2.6.32/arch/powerpc/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/Makefile 2015-03-10 13:21:53.000000000 -0700 @@ -158,9 +158,11 @@ drivers-$(CONFIG_OPROFILE) += arch/power # Default to zImage, override when needed all: zImage -BOOT_TARGETS = zImage zImage.initrd uImage zImage% dtbImage% treeImage.% cuImage.% simpleImage.% +# With make 3.82 we cannot mix normal and wildcard targets +BOOT_TARGETS1 := zImage zImage.initrd uImaged +BOOT_TARGETS2 := zImage% dtbImage% treeImage.% cuImage.% simpleImage.% -PHONY += $(BOOT_TARGETS) +PHONY += $(BOOT_TARGETS1) $(BOOT_TARGETS2) boot := arch/$(ARCH)/boot @@ -175,10 +177,16 @@ relocs_check: arch/powerpc/relocs_check. zImage: relocs_check endif -$(BOOT_TARGETS): vmlinux +$(BOOT_TARGETS1): vmlinux + $(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@) +$(BOOT_TARGETS2): vmlinux + $(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@) + + +bootwrapper_install: $(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@) -bootwrapper_install %.dtb: +%.dtb: $(Q)$(MAKE) ARCH=ppc64 $(build)=$(boot) $(patsubst %,$(boot)/%,$@) define archhelp diff -uprN linux-2.6.32/arch/powerpc/mm/fault.c linux-2.6.32.ovz/arch/powerpc/mm/fault.c --- linux-2.6.32/arch/powerpc/mm/fault.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/fault.c 2015-03-10 13:24:15.000000000 -0700 @@ -125,6 +125,7 @@ int __kprobes do_page_fault(struct pt_re int is_write = 0, ret; int trap = TRAP(regs); int is_exec = trap == 0x400; + unsigned int flags = 0; #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) /* @@ -141,6 +142,12 @@ int __kprobes do_page_fault(struct pt_re is_write = error_code & ESR_DST; #endif /* CONFIG_4xx || CONFIG_BOOKE */ + if (is_write) + flags |= FAULT_FLAG_WRITE; + + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; + if (notify_page_fault(regs)) return 0; @@ -171,7 +178,7 @@ int __kprobes do_page_fault(struct pt_re die("Weird page fault", regs, SIGSEGV); } - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -302,7 +309,7 @@ good_area: * the fault. */ survive: - ret = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0); + ret = handle_mm_fault(mm, vma, address, flags); if (unlikely(ret & VM_FAULT_ERROR)) { if (ret & VM_FAULT_OOM) goto out_of_memory; @@ -312,7 +319,7 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { @@ -323,7 +330,7 @@ good_area: #endif } else { current->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } up_read(&mm->mmap_sem); @@ -353,15 +360,10 @@ bad_area_nosemaphore: */ out_of_memory: up_read(&mm->mmap_sem); - if (is_global_init(current)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk("VM: killing process %s\n", current->comm); - if (user_mode(regs)) - do_group_exit(SIGKILL); - return SIGKILL; + if (!user_mode(regs)) + return SIGKILL; + pagefault_out_of_memory(); + return 0; do_sigbus: up_read(&mm->mmap_sem); diff -uprN linux-2.6.32/arch/powerpc/mm/fsl_booke_mmu.c linux-2.6.32.ovz/arch/powerpc/mm/fsl_booke_mmu.c --- linux-2.6.32/arch/powerpc/mm/fsl_booke_mmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/fsl_booke_mmu.c 2015-03-10 13:21:39.000000000 -0700 @@ -131,15 +131,10 @@ void settlbcam(int index, unsigned long TLBCAM[index].MAS3 = (phys & PAGE_MASK) | MAS3_SX | MAS3_SR; TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_SW : 0); -#ifndef CONFIG_KGDB /* want user access for breakpoints */ if (flags & _PAGE_USER) { TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0); } -#else - TLBCAM[index].MAS3 |= MAS3_UX | MAS3_UR; - TLBCAM[index].MAS3 |= ((flags & _PAGE_RW) ? MAS3_UW : 0); -#endif tlbcam_addrs[index].start = virt; tlbcam_addrs[index].limit = virt + size - 1; diff -uprN linux-2.6.32/arch/powerpc/mm/gup.c linux-2.6.32.ovz/arch/powerpc/mm/gup.c --- linux-2.6.32/arch/powerpc/mm/gup.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/gup.c 2015-03-10 13:22:51.000000000 -0700 @@ -62,7 +62,7 @@ static noinline int gup_huge_pte(pte_t * { unsigned long mask; unsigned long pte_end; - struct page *head, *page; + struct page *head, *page, *tail; pte_t pte; int refs; @@ -82,6 +82,7 @@ static noinline int gup_huge_pte(pte_t * refs = 0; head = pte_page(pte); page = head + ((*addr & ~huge_page_mask(hstate)) >> PAGE_SHIFT); + tail = page; do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; @@ -96,10 +97,20 @@ static noinline int gup_huge_pte(pte_t * } if (unlikely(pte_val(pte) != pte_val(*ptep))) { /* Could be optimized better */ - while (*nr) { - put_page(page); - (*nr)--; - } + *nr -= refs; + while (refs--) + put_page(head); + return 0; + } + + /* + * Any tail page need their mapcount reference taken before we + * return. + */ + while (refs--) { + if (PageTail(tail)) + get_huge_page_tail(tail); + tail++; } return 1; diff -uprN linux-2.6.32/arch/powerpc/mm/hash_utils_64.c linux-2.6.32.ovz/arch/powerpc/mm/hash_utils_64.c --- linux-2.6.32/arch/powerpc/mm/hash_utils_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/hash_utils_64.c 2015-03-10 13:23:44.000000000 -0700 @@ -53,6 +53,7 @@ #include #include #include +#include #ifdef DEBUG #define DBG(fmt...) udbg_printf(fmt) @@ -541,11 +542,11 @@ static unsigned long __init htab_get_tab } #ifdef CONFIG_MEMORY_HOTPLUG -void create_section_mapping(unsigned long start, unsigned long end) +int create_section_mapping(unsigned long start, unsigned long end) { - BUG_ON(htab_bolt_mapping(start, end, __pa(start), + return htab_bolt_mapping(start, end, __pa(start), pgprot_val(PAGE_KERNEL), mmu_linear_psize, - mmu_kernel_ssize)); + mmu_kernel_ssize); } int remove_section_mapping(unsigned long start, unsigned long end) @@ -625,6 +626,16 @@ static void __init htab_initialize(void) /* Using a hypervisor which owns the htab */ htab_address = NULL; _SDR1 = 0; +#ifdef CONFIG_FA_DUMP + /* + * If firmware assisted dump is active firmware preserves + * the contents of htab along with entire partition memory. + * Clear the htab if firmware assisted dump is active so + * that we dont end up using old mappings. + */ + if (is_fadump_active() && ppc_md.hpte_clear_all) + ppc_md.hpte_clear_all(); +#endif } else { /* Find storage for the HPT. Must be contiguous in * the absolute address space. On cell we want it to be @@ -879,6 +890,18 @@ static inline int subpage_protection(pgd } #endif +void hash_failure_debug(unsigned long ea, unsigned long access, + unsigned long vsid, unsigned long trap, + int ssize, int psize, unsigned long pte) +{ + if (!printk_ratelimit()) + return; + pr_info("mm: Hashing failure ! EA=0x%lx access=0x%lx current=%s\n", + ea, access, current->comm); + pr_info(" trap=0x%lx vsid=0x%lx ssize=%d psize=%d pte=0x%lx\n", + trap, vsid, ssize, psize, pte); +} + /* Result code is: * 0 - handled * 1 - normal page fault @@ -1039,6 +1062,12 @@ int hash_page(unsigned long ea, unsigned local, ssize, spp); } + /* Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, psize, + pte_val(*ptep)); #ifndef CONFIG_PPC_64K_PAGES DBG_LOW(" o-pte: %016lx\n", pte_val(*ptep)); #else @@ -1057,8 +1086,7 @@ void hash_preload(struct mm_struct *mm, void *pgdir; pte_t *ptep; unsigned long flags; - int local = 0; - int ssize; + int rc, ssize, local = 0; BUG_ON(REGION_ID(ea) != USER_REGION_ID); @@ -1104,11 +1132,18 @@ void hash_preload(struct mm_struct *mm, /* Hash it in */ #ifdef CONFIG_PPC_HAS_HASH_64K if (mm->context.user_psize == MMU_PAGE_64K) - __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize); + rc = __hash_page_64K(ea, access, vsid, ptep, trap, local, ssize); else #endif /* CONFIG_PPC_HAS_HASH_64K */ - __hash_page_4K(ea, access, vsid, ptep, trap, local, ssize, - subpage_protection(pgdir, ea)); + rc = __hash_page_4K(ea, access, vsid, ptep, trap, local, ssize, + subpage_protection(pgdir, ea)); + + /* Dump some info in case of hash insertion failure, they should + * never happen so it is really useful to know if/when they do + */ + if (rc == -1) + hash_failure_debug(ea, access, vsid, trap, ssize, + mm->context.user_psize, pte_val(*ptep)); local_irq_restore(flags); } diff -uprN linux-2.6.32/arch/powerpc/mm/hugetlbpage.c linux-2.6.32.ovz/arch/powerpc/mm/hugetlbpage.c --- linux-2.6.32/arch/powerpc/mm/hugetlbpage.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/hugetlbpage.c 2015-03-10 13:23:25.000000000 -0700 @@ -226,7 +226,7 @@ pte_t *huge_pte_offset(struct mm_struct } pte_t *huge_pte_alloc(struct mm_struct *mm, - unsigned long addr, unsigned long sz) + unsigned long addr, unsigned long sz, bool *shared) { pgd_t *pg; pud_t *pu; @@ -556,32 +556,27 @@ int hash_huge_page(struct mm_struct *mm, unsigned long old_pte, new_pte; unsigned long va, rflags, pa, sz; long slot; - int err = 1; int ssize = user_segment_size(ea); unsigned int mmu_psize; int shift; + mmu_psize = get_slice_psize(mm, ea); if (!mmu_huge_psizes[mmu_psize]) - goto out; + return 1; ptep = huge_pte_offset(mm, ea); /* Search the Linux page table for a match with va */ va = hpt_va(ea, vsid, ssize); - /* - * If no pte found or not present, send the problem up to - * do_page_fault + /* If no pte found send the problem up to do_page_fault */ - if (unlikely(!ptep || pte_none(*ptep))) - goto out; + if (unlikely(!ptep)) + return 1; + + /* We need _PAGE_PRESENT as part of our access check */ + access |= _PAGE_PRESENT; - /* - * Check the user's access rights to the page. If access should be - * prevented then send the problem up to do_page_fault. - */ - if (unlikely(access & ~pte_val(*ptep))) - goto out; /* * At this point, we have a pte (old_pte) which can be used to build * or update an HPTE. There are 2 cases: @@ -593,13 +588,19 @@ int hash_huge_page(struct mm_struct *mm, * because we are doing software DIRTY bit management and the * page is currently not DIRTY. */ - - do { old_pte = pte_val(*ptep); - if (old_pte & _PAGE_BUSY) - goto out; + /* If PTE busy, retry the access */ + if (unlikely(old_pte & _PAGE_BUSY)) + return 0; + /* If PTE permissions don't match, take page fault */ + if (unlikely(access & ~old_pte)) + return 1; + /* Try to lock the PTE, add ACCESSED and DIRTY if it was + * a write access */ new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED; + if (access & _PAGE_RW) + new_pte |= _PAGE_DIRTY; } while(old_pte != __cmpxchg_u64((unsigned long *)ptep, old_pte, new_pte)); @@ -671,8 +672,16 @@ repeat: } } - if (unlikely(slot == -2)) - panic("hash_huge_page: pte_insert failed\n"); + /* + * Hypervisor failure. Restore old pte and return -1 + * similar to __hash_page_* + */ + if (unlikely(slot == -2)) { + *ptep = __pte(old_pte); + hash_failure_debug(ea, access, vsid, trap, ssize, + mmu_psize, old_pte); + return -1; + } new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX); } @@ -681,11 +690,7 @@ repeat: * No need to use ldarx/stdcx here */ *ptep = __pte(new_pte & ~_PAGE_BUSY); - - err = 0; - - out: - return err; + return 0; } static void __init set_huge_psize(int psize) diff -uprN linux-2.6.32/arch/powerpc/mm/init_64.c linux-2.6.32.ovz/arch/powerpc/mm/init_64.c --- linux-2.6.32/arch/powerpc/mm/init_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/init_64.c 2015-03-10 13:24:03.000000000 -0700 @@ -77,7 +77,9 @@ #endif /* CONFIG_PPC_STD_MMU_64 */ phys_addr_t memstart_addr = ~0; +EXPORT_SYMBOL_GPL(memstart_addr); phys_addr_t kernstart_addr; +EXPORT_SYMBOL_GPL(kernstart_addr); void free_initmem(void) { @@ -217,19 +219,55 @@ static void __meminit vmemmap_create_map } #endif /* CONFIG_PPC_BOOK3E */ -int __meminit vmemmap_populate(struct page *start_page, - unsigned long nr_pages, int node) +struct vmemmap_backing *vmemmap_list; + +static __meminit struct vmemmap_backing * vmemmap_list_alloc(int node) +{ + static struct vmemmap_backing *next; + static int num_left; + + /* allocate a page when required and hand out chunks */ + if (!next || !num_left) { + next = vmemmap_alloc_block(PAGE_SIZE, node); + if (unlikely(!next)) { + WARN_ON(1); + return NULL; + } + num_left = PAGE_SIZE / sizeof(struct vmemmap_backing); + } + + num_left--; + + return next++; +} + +static __meminit void vmemmap_list_populate(unsigned long phys, + unsigned long start, + int node) +{ + struct vmemmap_backing *vmem_back; + + vmem_back = vmemmap_list_alloc(node); + if (unlikely(!vmem_back)) { + WARN_ON(1); + return; + } + + vmem_back->phys = phys; + vmem_back->virt_addr = start; + vmem_back->list = vmemmap_list; + + vmemmap_list = vmem_back; +} + +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) { - unsigned long start = (unsigned long)start_page; - unsigned long end = (unsigned long)(start_page + nr_pages); unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift; /* Align to the page size of the linear mapping. */ start = _ALIGN_DOWN(start, page_size); - pr_debug("vmemmap_populate page %p, %ld pages, node %d\n", - start_page, nr_pages, node); - pr_debug(" -> map %lx..%lx\n", start, end); + pr_debug("vmemmap_populate %lx..%lx, node %d\n", start, end, node); for (; start < end; start += page_size) { void *p; @@ -241,6 +279,8 @@ int __meminit vmemmap_populate(struct pa if (!p) return -ENOMEM; + vmemmap_list_populate(__pa(p), start, node); + pr_debug(" * %016lx..%016lx allocated at %p\n", start, start + page_size, p); diff -uprN linux-2.6.32/arch/powerpc/mm/mem.c linux-2.6.32.ovz/arch/powerpc/mm/mem.c --- linux-2.6.32/arch/powerpc/mm/mem.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/mem.c 2015-03-10 13:23:48.000000000 -0700 @@ -47,6 +47,7 @@ #include #include #include +#include #include "mmu_decl.h" @@ -57,7 +58,7 @@ int init_bootmem_done; int mem_init_done; -phys_addr_t memory_limit; +unsigned long long memory_limit; #ifdef CONFIG_HIGHMEM pte_t *kmap_pte; @@ -127,7 +128,8 @@ int arch_add_memory(int nid, u64 start, pgdata = NODE_DATA(nid); start = (unsigned long)__va(start); - create_section_mapping(start, start + size); + if (create_section_mapping(start, start + size)) + return -EINVAL; /* this should work for most non-highmem platforms */ zone = pgdata->node_zones; @@ -511,3 +513,23 @@ void update_mmu_cache(struct vm_area_str hash_preload(vma->vm_mm, address, access, trap); #endif /* CONFIG_PPC_STD_MMU */ } + +#ifdef CONFIG_STRICT_DEVMEM +/* + * devmem_is_allowed(): check to see if /dev/mem access to a certain address + * is valid. The argument is a physical page number. + * + * Access has to be given to non-kernel-ram areas as well, these contain the + * PCI mmio resources as well as potential bios/acpi data regions. + */ +int devmem_is_allowed(unsigned long pfn) +{ + if (iomem_is_exclusive(pfn << PAGE_SHIFT)) + return 0; + if (!page_is_ram(pfn)) + return 1; + if (page_is_rtas_user_buf(pfn)) + return 1; + return 0; +} +#endif /* CONFIG_STRICT_DEVMEM */ diff -uprN linux-2.6.32/arch/powerpc/mm/numa.c linux-2.6.32.ovz/arch/powerpc/mm/numa.c --- linux-2.6.32/arch/powerpc/mm/numa.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/numa.c 2015-03-10 13:24:10.000000000 -0700 @@ -20,10 +20,15 @@ #include #include #include +#include +#include #include #include #include #include +#include +#include +#include static int numa_enabled = 1; @@ -42,6 +47,12 @@ EXPORT_SYMBOL(node_data); static int min_common_depth; static int n_mem_addr_cells, n_mem_size_cells; +static int form1_affinity; + +#define MAX_DISTANCE_REF_POINTS 4 +static int distance_ref_points_depth; +static const unsigned int *distance_ref_points; +static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS]; static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn, unsigned int *nid) @@ -132,7 +143,7 @@ static void __init get_node_active_regio work_with_active_regions(nid, get_active_region_work_fn, node_ar); } -static void __cpuinit map_cpu_to_node(int cpu, int node) +static void map_cpu_to_node(int cpu, int node) { numa_cpu_lookup_table[cpu] = node; @@ -142,7 +153,7 @@ static void __cpuinit map_cpu_to_node(in cpu_set(cpu, numa_cpumask_lookup_table[node]); } -#ifdef CONFIG_HOTPLUG_CPU +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR) static void unmap_cpu_from_node(unsigned long cpu) { int node = numa_cpu_lookup_table[cpu]; @@ -156,7 +167,7 @@ static void unmap_cpu_from_node(unsigned cpu, node); } } -#endif /* CONFIG_HOTPLUG_CPU */ +#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */ /* must hold reference to node during call */ static const int *of_get_associativity(struct device_node *dev) @@ -179,31 +190,78 @@ static const u32 *of_get_usable_memory(s return prop; } +int __node_distance(int a, int b) +{ + int i; + int distance = LOCAL_DISTANCE; + + if (!form1_affinity) + return distance; + + for (i = 0; i < distance_ref_points_depth; i++) { + if (distance_lookup_table[a][i] == distance_lookup_table[b][i]) + break; + + /* Double the distance for each NUMA level */ + distance *= 2; + } + + return distance; +} +EXPORT_SYMBOL(__node_distance); + +static void initialize_distance_lookup_table(int nid, + const unsigned int *associativity) +{ + int i; + + if (!form1_affinity) + return; + + for (i = 0; i < distance_ref_points_depth; i++) { + distance_lookup_table[nid][i] = + associativity[distance_ref_points[i]]; + } +} + /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa * info is found. */ -static int of_node_to_nid_single(struct device_node *device) +static int associativity_to_nid(const unsigned int *associativity) { int nid = -1; - const unsigned int *tmp; if (min_common_depth == -1) goto out; - tmp = of_get_associativity(device); - if (!tmp) - goto out; - - if (tmp[0] >= min_common_depth) - nid = tmp[min_common_depth]; + if (associativity[0] >= min_common_depth) + nid = associativity[min_common_depth]; /* POWER4 LPAR uses 0xffff as invalid node */ if (nid == 0xffff || nid >= MAX_NUMNODES) nid = -1; + + if (nid > 0 && associativity[0] >= distance_ref_points_depth) + initialize_distance_lookup_table(nid, associativity); + out: return nid; } +/* Returns the nid associated with the given device tree node, + * or -1 if not found. + */ +static int of_node_to_nid_single(struct device_node *device) +{ + int nid = -1; + const unsigned int *tmp; + + tmp = of_get_associativity(device); + if (tmp) + nid = associativity_to_nid(tmp); + return nid; +} + /* Walk the device tree upwards, looking for an associativity id */ int of_node_to_nid(struct device_node *device) { @@ -226,26 +284,12 @@ int of_node_to_nid(struct device_node *d } EXPORT_SYMBOL_GPL(of_node_to_nid); -/* - * In theory, the "ibm,associativity" property may contain multiple - * associativity lists because a resource may be multiply connected - * into the machine. This resource then has different associativity - * characteristics relative to its multiple connections. We ignore - * this for now. We also assume that all cpu and memory sets have - * their distances represented at a common level. This won't be - * true for hierarchical NUMA. - * - * In any case the ibm,associativity-reference-points should give - * the correct depth for a normal NUMA system. - * - * - Dave Hansen - */ static int __init find_min_common_depth(void) { int depth; - const unsigned int *ref_points; struct device_node *rtas_root; - unsigned int len; + struct device_node *chosen; + const char *vec5; rtas_root = of_find_node_by_path("/rtas"); @@ -253,23 +297,67 @@ static int __init find_min_common_depth( return -1; /* - * this property is 2 32-bit integers, each representing a level of - * depth in the associativity nodes. The first is for an SMP - * configuration (should be all 0's) and the second is for a normal - * NUMA configuration. + * This property is a set of 32-bit integers, each representing + * an index into the ibm,associativity nodes. + * + * With form 0 affinity the first integer is for an SMP configuration + * (should be all 0's) and the second is for a normal NUMA + * configuration. We have only one level of NUMA. + * + * With form 1 affinity the first integer is the most significant + * NUMA boundary and the following are progressively less significant + * boundaries. There can be more than one level of NUMA. */ - ref_points = of_get_property(rtas_root, - "ibm,associativity-reference-points", &len); + distance_ref_points = of_get_property(rtas_root, + "ibm,associativity-reference-points", + &distance_ref_points_depth); - if ((len >= 2 * sizeof(unsigned int)) && ref_points) { - depth = ref_points[1]; - } else { + if (!distance_ref_points) { dbg("NUMA: ibm,associativity-reference-points not found.\n"); - depth = -1; + goto err; + } + + distance_ref_points_depth /= sizeof(int); + +#define VEC5_AFFINITY_BYTE 5 +#define VEC5_AFFINITY 0x80 + chosen = of_find_node_by_path("/chosen"); + if (chosen) { + vec5 = of_get_property(chosen, "ibm,architecture-vec-5", NULL); + if (vec5 && (vec5[VEC5_AFFINITY_BYTE] & VEC5_AFFINITY)) { + dbg("Using form 1 affinity\n"); + form1_affinity = 1; + } + } + + if (form1_affinity) { + depth = distance_ref_points[0]; + } else { + if (distance_ref_points_depth < 2) { + printk(KERN_WARNING "NUMA: " + "short ibm,associativity-reference-points\n"); + goto err; + } + + depth = distance_ref_points[1]; + } + + /* + * Warn and cap if the hardware supports more than + * MAX_DISTANCE_REF_POINTS domains. + */ + if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) { + printk(KERN_WARNING "NUMA: distance array capped at " + "%d entries\n", MAX_DISTANCE_REF_POINTS); + distance_ref_points_depth = MAX_DISTANCE_REF_POINTS; } - of_node_put(rtas_root); + of_node_put(rtas_root); return depth; + +err: + of_node_put(rtas_root); + return -1; } static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells) @@ -1141,4 +1229,270 @@ int hot_add_scn_to_nid(unsigned long scn return nid; } +static u64 hot_add_drconf_memory_max(void) +{ + struct device_node *memory = NULL; + unsigned int drconf_cell_cnt = 0; + u64 lmb_size = 0; + const u32 *dm = 0; + + memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (memory) { + drconf_cell_cnt = of_get_drconf_memory(memory, &dm); + lmb_size = of_get_lmb_size(memory); + of_node_put(memory); + } + return lmb_size * drconf_cell_cnt; +} + +/* + * memory_hotplug_max - return max address of memory that may be added + * + * This is currently only used on systems that support drconfig memory + * hotplug. + */ +u64 memory_hotplug_max(void) +{ + return max(hot_add_drconf_memory_max(), lmb_end_of_DRAM()); +} #endif /* CONFIG_MEMORY_HOTPLUG */ + +/* Virtual Processor Home Node (VPHN) support */ +#ifdef CONFIG_PPC_SPLPAR +#define VPHN_NR_CHANGE_CTRS (8) +static u8 vphn_cpu_change_counts[NR_CPUS][VPHN_NR_CHANGE_CTRS]; +static cpumask_t cpu_associativity_changes_mask; +static int vphn_enabled; +static void set_topology_timer(void); + +/* + * Store the current values of the associativity change counters in the + * hypervisor. + */ +static void setup_cpu_associativity_change_counters(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + int i; + u8 *counts = vphn_cpu_change_counts[cpu]; + volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; + + for (i = 0; i < VPHN_NR_CHANGE_CTRS; i++) + counts[i] = hypervisor_counts[i]; + } +} + +/* + * The hypervisor maintains a set of 8 associativity change counters in + * the VPA of each cpu that correspond to the associativity levels in the + * ibm,associativity-reference-points property. When an associativity + * level changes, the corresponding counter is incremented. + * + * Set a bit in cpu_associativity_changes_mask for each cpu whose home + * node associativity levels have changed. + * + * Returns the number of cpus with unhandled associativity changes. + */ +static int update_cpu_associativity_changes_mask(void) +{ + int cpu, nr_cpus = 0; + cpumask_t *changes = &cpu_associativity_changes_mask; + + cpumask_clear(changes); + + for_each_possible_cpu(cpu) { + int i, changed = 0; + u8 *counts = vphn_cpu_change_counts[cpu]; + volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts; + + for (i = 0; i < VPHN_NR_CHANGE_CTRS; i++) { + if (hypervisor_counts[i] > counts[i]) { + counts[i] = hypervisor_counts[i]; + changed = 1; + } + } + if (changed) { + cpumask_set_cpu(cpu, changes); + nr_cpus++; + } + } + + return nr_cpus; +} + +static void topology_work_fn(struct work_struct *work) +{ + rebuild_sched_domains(); +} +static DECLARE_WORK(topology_work, topology_work_fn); + +void topology_schedule_update(void) +{ + schedule_work(&topology_work); +} + +static void topology_timer_fn(unsigned long ignored) +{ + if (!vphn_enabled) + return; + if (update_cpu_associativity_changes_mask() > 0) + topology_schedule_update(); + set_topology_timer(); +} +static struct timer_list topology_timer = + TIMER_INITIALIZER(topology_timer_fn, 0, 0); + +static void set_topology_timer(void) +{ + topology_timer.data = 0; + topology_timer.expires = jiffies + 60 * HZ; + add_timer(&topology_timer); +} + +/* + * Start polling for VPHN associativity changes. + */ +int start_topology_update(void) +{ + int rc = 0; + + /* Disabled until races with load balancing are fixed */ + if (0 && firmware_has_feature(FW_FEATURE_VPHN)) { + vphn_enabled = 1; + setup_cpu_associativity_change_counters(); + init_timer_deferrable(&topology_timer); + set_topology_timer(); + rc = 1; + } + + return rc; +} +__initcall(start_topology_update); + +/* + * Disable polling for VPHN associativity changes. + */ +int stop_topology_update(void) +{ + vphn_enabled = 0; + return del_timer_sync(&topology_timer); +} +#endif /* CONFIG_PPC_SPLPAR */ + +/* 6 64-bit registers unpacked into 12 32-bit associativity values */ +#define VPHN_ASSOC_BUFSIZE (6*sizeof(u64)/sizeof(u32)) + +/* + * Convert the associativity domain numbers returned from the hypervisor + * to the sequence they would appear in the ibm,associativity property. + */ +static int vphn_unpack_associativity(const long *packed, unsigned int *unpacked) +{ + int i, nr_assoc_doms = 0; + const u16 *field = (const u16*) packed; + +#define VPHN_FIELD_UNUSED (0xffff) +#define VPHN_FIELD_MSB (0x8000) +#define VPHN_FIELD_MASK (~VPHN_FIELD_MSB) + + for (i = 0; i < VPHN_ASSOC_BUFSIZE; i++) { + if (*field == VPHN_FIELD_UNUSED) { + /* All significant fields processed, and remaining + * fields contain the reserved value of all 1's. + * Just store them. + */ + unpacked[i] = *((u32*)field); + field += 2; + } else if (*field & VPHN_FIELD_MSB) { + /* Data is in the lower 15 bits of this field */ + unpacked[i] = *field & VPHN_FIELD_MASK; + field++; + nr_assoc_doms++; + } else { + /* Data is in the lower 15 bits of this field + * concatenated with the next 16 bit field + */ + unpacked[i] = *((u32*)field); + field += 2; + nr_assoc_doms++; + } + } + + return nr_assoc_doms; +} + +/* + * Retrieve the new associativity information for a virtual processor's + * home node. + */ +static long hcall_vphn(unsigned long cpu, unsigned int *associativity) +{ + long rc; + long retbuf[PLPAR_HCALL9_BUFSIZE] = {0}; + u64 flags = 1; + int hwcpu = get_hard_smp_processor_id(cpu); + + rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu); + vphn_unpack_associativity(retbuf, associativity); + + return rc; +} + +static long vphn_get_associativity(unsigned long cpu, + unsigned int *associativity) +{ + long rc = hcall_vphn(cpu, associativity); + + switch (rc) { + case H_FUNCTION: + printk(KERN_INFO + "VPHN is not supported. Disabling polling...\n"); + stop_topology_update(); + break; + case H_HARDWARE: + printk(KERN_ERR + "hcall_vphn() experienced a hardware fault " + "preventing VPHN. Disabling polling...\n"); + stop_topology_update(); + } + + return rc; +} + +/* + * Update the node maps and sysfs entries for each cpu whose home node + * has changed. + */ +int arch_update_cpu_topology(void) +{ + int cpu, nid, old_nid; + unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0}; + struct sys_device *sysdev; + + for_each_cpu_mask(cpu, cpu_associativity_changes_mask) { + vphn_get_associativity(cpu, associativity); + nid = associativity_to_nid(associativity); + + if (nid < 0 || !node_online(nid)) + nid = first_online_node; + + old_nid = numa_cpu_lookup_table[cpu]; + + /* Disable hotplug while we update the cpu + * masks and sysfs. + */ + get_online_cpus(); + unregister_cpu_under_node(cpu, old_nid); + unmap_cpu_from_node(cpu); + map_cpu_to_node(cpu, nid); + register_cpu_under_node(cpu, nid); + put_online_cpus(); + + sysdev = get_cpu_sysdev(cpu); + if (sysdev) + kobject_uevent(&sysdev->kobj, KOBJ_CHANGE); + } + + return 1; +} diff -uprN linux-2.6.32/arch/powerpc/mm/slice.c linux-2.6.32.ovz/arch/powerpc/mm/slice.c --- linux-2.6.32/arch/powerpc/mm/slice.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/mm/slice.c 2015-03-10 13:24:05.000000000 -0700 @@ -230,9 +230,10 @@ static unsigned long slice_find_area_bot unsigned long start_addr, addr; struct slice_mask mask; int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned int unmap_factor = sysctl_unmap_area_factor; if (use_cache) { - if (len <= mm->cached_hole_size) { + if (len <= mm->cached_hole_size && !unmap_factor) { start_addr = addr = TASK_UNMAPPED_BASE; mm->cached_hole_size = 0; } else @@ -264,7 +265,9 @@ full_search: mm->free_area_cache = addr + len; return addr; } - if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start) + + if (use_cache && !unmap_factor && + (addr + mm->cached_hole_size) < vma->vm_start) mm->cached_hole_size = vma->vm_start - addr; addr = vma->vm_end; } @@ -272,7 +275,8 @@ full_search: /* Make sure we didn't miss any holes */ if (use_cache && start_addr != TASK_UNMAPPED_BASE) { start_addr = addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; + if (likely(!unmap_factor)) + mm->cached_hole_size = 0; goto full_search; } return -ENOMEM; @@ -287,10 +291,13 @@ static unsigned long slice_find_area_top unsigned long addr; struct slice_mask mask; int pshift = max_t(int, mmu_psize_defs[psize].shift, PAGE_SHIFT); + unsigned int unmap_factor = sysctl_unmap_area_factor; + int firsttime = 1; + again: /* check if free_area_cache is useful for us */ if (use_cache) { - if (len <= mm->cached_hole_size) { + if (len <= mm->cached_hole_size && !unmap_factor) { mm->cached_hole_size = 0; mm->free_area_cache = mm->mmap_base; } @@ -344,7 +351,8 @@ static unsigned long slice_find_area_top } /* remember the largest hole we saw so far */ - if (use_cache && (addr + mm->cached_hole_size) < vma->vm_start) + if (use_cache && !unmap_factor && + (addr + mm->cached_hole_size) < vma->vm_start) mm->cached_hole_size = vma->vm_start - addr; /* try just below the current vma->vm_start */ @@ -352,6 +360,18 @@ static unsigned long slice_find_area_top } /* + * Using the next-fit algorithm, it is possible we started + * searching below usable address space holes. Go back to the + * top and start over. + */ + if (unmap_factor && firsttime) { + mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = 0; + firsttime = 0; + goto again; + } + + /* * A failed mmap() very likely causes application failure, * so fall back to the bottom-up function here. This scenario * can happen with large stack limits and large mmap() @@ -364,7 +384,8 @@ static unsigned long slice_find_area_top */ if (use_cache) { mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = ~0UL; + if (likely(!unmap_factor)) + mm->cached_hole_size = ~0UL; } return addr; @@ -424,7 +445,7 @@ unsigned long slice_get_unmapped_area(un if (fixed && (addr & ((1ul << pshift) - 1))) return -EINVAL; if (fixed && addr > (mm->task_size - len)) - return -EINVAL; + return -ENOMEM; /* If hint, make sure it matches our alignment restrictions */ if (!fixed && addr) { diff -uprN linux-2.6.32/arch/powerpc/oprofile/op_model_power4.c linux-2.6.32.ovz/arch/powerpc/oprofile/op_model_power4.c --- linux-2.6.32/arch/powerpc/oprofile/op_model_power4.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/oprofile/op_model_power4.c 2015-03-10 13:23:53.000000000 -0700 @@ -22,6 +22,13 @@ #include #define dbg(args...) +#define OPROFILE_PM_PMCSEL_MSK 0xffULL +#define OPROFILE_PM_UNIT_SHIFT 60 +#define OPROFILE_PM_UNIT_MSK 0xfULL +#define OPROFILE_MAX_PMC_NUM 3 +#define OPROFILE_PMSEL_FIELD_WIDTH 8 +#define OPROFILE_UNIT_FIELD_WIDTH 4 +#define MMCRA_SIAR_VALID_MASK 0x10000000ULL static unsigned long reset_value[OP_MAX_COUNTER]; @@ -32,6 +39,61 @@ static int use_slot_nums; static u32 mmcr0_val; static u64 mmcr1_val; static u64 mmcra_val; +static u32 cntr_marked_events; + +static int power7_marked_instr_event(u64 mmcr1) +{ + u64 psel, unit; + int pmc, cntr_marked_events = 0; + + /* Given the MMCR1 value, look at the field for each counter to + * determine if it is a marked event. Code based on the function + * power7_marked_instr_event() in file arch/powerpc/perf/power7-pmu.c. + */ + for (pmc = 0; pmc < 4; pmc++) { + psel = mmcr1 & (OPROFILE_PM_PMCSEL_MSK + << (OPROFILE_MAX_PMC_NUM - pmc) + * OPROFILE_PMSEL_FIELD_WIDTH); + psel = (psel >> ((OPROFILE_MAX_PMC_NUM - pmc) + * OPROFILE_PMSEL_FIELD_WIDTH)) & ~1ULL; + unit = mmcr1 & (OPROFILE_PM_UNIT_MSK + << (OPROFILE_PM_UNIT_SHIFT + - (pmc * OPROFILE_PMSEL_FIELD_WIDTH))); + unit = unit >> (OPROFILE_PM_UNIT_SHIFT + - (pmc * OPROFILE_PMSEL_FIELD_WIDTH)); + + switch (psel >> 4) { + case 2: + cntr_marked_events |= (pmc == 1 || pmc == 3) << pmc; + break; + case 3: + if (psel == 0x3c) { + cntr_marked_events |= (pmc == 0) << pmc; + break; + } + + if (psel == 0x3e) { + cntr_marked_events |= (pmc != 1) << pmc; + break; + } + + cntr_marked_events |= 1 << pmc; + break; + case 4: + case 5: + cntr_marked_events |= (unit == 0xd) << pmc; + break; + case 6: + if (psel == 0x64) + cntr_marked_events |= (pmc >= 2) << pmc; + break; + case 8: + cntr_marked_events |= (unit == 0xd) << pmc; + break; + } + } + return cntr_marked_events; +} static int power4_reg_setup(struct op_counter_config *ctr, struct op_system_config *sys, @@ -48,6 +110,23 @@ static int power4_reg_setup(struct op_co mmcr1_val = sys->mmcr1; mmcra_val = sys->mmcra; + /* Power 7+ and newer architectures: + * Determine which counter events in the group (the group of events is + * specified by the bit settings in the MMCR1 register) are marked + * events for use in the interrupt handler. Do the calculation once + * before OProfile starts. Information is used in the interrupt + * handler. Starting with Power 7+ we only record the sample for + * marked events if the SIAR valid bit is set. For non marked events + * the sample is always recorded. + */ + if (__is_processor(PVR_POWER7p)) + cntr_marked_events = power7_marked_instr_event(mmcr1_val); + else + cntr_marked_events = 0; /* For older processors, set the bit map + * to zero so the sample will always be + * be recorded. + */ + for (i = 0; i < cur_cpu_spec->num_pmcs; ++i) reset_value[i] = 0x80000000UL - ctr[i].count; @@ -261,6 +340,28 @@ static int get_kernel(unsigned long pc, return is_kernel; } +static bool pmc_overflow(unsigned long val) +{ + if ((int)val < 0) + return true; + + /* + * Events on POWER7 can roll back if a speculative event doesn't + * eventually complete. Unfortunately in some rare cases they will + * raise a performance monitor exception. We need to catch this to + * ensure we reset the PMC. In all cases the PMC will be 256 or less + * cycles from overflow. + * + * We only do this if the first pass fails to find any overflowing + * PMCs because a user might set a period of less than 256 and we + * don't want to mistakenly reset them. + */ + if (__is_processor(PV_POWER7) && ((0x80000000 - val) <= 256)) + return true; + + return false; +} + static void power4_handle_interrupt(struct pt_regs *regs, struct op_counter_config *ctr) { @@ -270,6 +371,7 @@ static void power4_handle_interrupt(stru int i; unsigned int mmcr0; unsigned long mmcra; + bool siar_valid = false; mmcra = mfspr(SPRN_MMCRA); @@ -279,11 +381,29 @@ static void power4_handle_interrupt(stru /* set the PMM bit (see comment below) */ mtmsrd(mfmsr() | MSR_PMM); + /* Check that the SIAR valid bit in MMCRA is set to 1. */ + if ((mmcra & MMCRA_SIAR_VALID_MASK) == MMCRA_SIAR_VALID_MASK) + siar_valid = true; + for (i = 0; i < cur_cpu_spec->num_pmcs; ++i) { val = classic_ctr_read(i); - if (val < 0) { + if (pmc_overflow(val)) { if (oprofile_running && ctr[i].enabled) { - oprofile_add_ext_sample(pc, regs, i, is_kernel); + /* Power 7+ and newer architectures: + * If the event is a marked event, then only + * save the sample if the SIAR valid bit is + * set. If the event is not marked, then + * always save the sample. + * Note, the Sample enable bit in the MMCRA + * register must be set to 1 if the group + * contains a marked event. + */ + if ((siar_valid && + (cntr_marked_events & (1 << i))) + || !(cntr_marked_events & (1 << i))) + oprofile_add_ext_sample(pc, regs, i, + is_kernel); + classic_ctr_write(i, reset_value[i]); } else { classic_ctr_write(i, 0); diff -uprN linux-2.6.32/arch/powerpc/platforms/40x/ep405.c linux-2.6.32.ovz/arch/powerpc/platforms/40x/ep405.c --- linux-2.6.32/arch/powerpc/platforms/40x/ep405.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/40x/ep405.c 2015-03-10 13:23:53.000000000 -0700 @@ -100,7 +100,7 @@ static void __init ep405_setup_arch(void /* Find & init the BCSR CPLD */ ep405_init_bcsr(); - ppc_pci_set_flags(PPC_PCI_REASSIGN_ALL_RSRC); + pci_set_flags(PCI_REASSIGN_ALL_RSRC); } static int __init ep405_probe(void) diff -uprN linux-2.6.32/arch/powerpc/platforms/40x/ppc40x_simple.c linux-2.6.32.ovz/arch/powerpc/platforms/40x/ppc40x_simple.c --- linux-2.6.32/arch/powerpc/platforms/40x/ppc40x_simple.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/40x/ppc40x_simple.c 2015-03-10 13:23:53.000000000 -0700 @@ -65,7 +65,7 @@ static int __init ppc40x_probe(void) for (i = 0; i < ARRAY_SIZE(board); i++) { if (of_flat_dt_is_compatible(root, board[i])) { - ppc_pci_set_flags(PPC_PCI_REASSIGN_ALL_RSRC); + pci_set_flags(PCI_REASSIGN_ALL_RSRC); return 1; } } diff -uprN linux-2.6.32/arch/powerpc/platforms/40x/walnut.c linux-2.6.32.ovz/arch/powerpc/platforms/40x/walnut.c --- linux-2.6.32/arch/powerpc/platforms/40x/walnut.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/40x/walnut.c 2015-03-10 13:23:53.000000000 -0700 @@ -51,7 +51,7 @@ static int __init walnut_probe(void) if (!of_flat_dt_is_compatible(root, "ibm,walnut")) return 0; - ppc_pci_flags = PPC_PCI_REASSIGN_ALL_RSRC; + pci_set_flags(PCI_REASSIGN_ALL_RSRC); return 1; } diff -uprN linux-2.6.32/arch/powerpc/platforms/44x/ebony.c linux-2.6.32.ovz/arch/powerpc/platforms/44x/ebony.c --- linux-2.6.32/arch/powerpc/platforms/44x/ebony.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/44x/ebony.c 2015-03-10 13:23:53.000000000 -0700 @@ -54,7 +54,7 @@ static int __init ebony_probe(void) if (!of_flat_dt_is_compatible(root, "ibm,ebony")) return 0; - ppc_pci_set_flags(PPC_PCI_REASSIGN_ALL_RSRC); + pci_set_flags(PCI_REASSIGN_ALL_RSRC); return 1; } diff -uprN linux-2.6.32/arch/powerpc/platforms/44x/ppc44x_simple.c linux-2.6.32.ovz/arch/powerpc/platforms/44x/ppc44x_simple.c --- linux-2.6.32/arch/powerpc/platforms/44x/ppc44x_simple.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/44x/ppc44x_simple.c 2015-03-10 13:23:53.000000000 -0700 @@ -71,7 +71,7 @@ static int __init ppc44x_probe(void) for (i = 0; i < ARRAY_SIZE(board); i++) { if (of_flat_dt_is_compatible(root, board[i])) { - ppc_pci_set_flags(PPC_PCI_REASSIGN_ALL_RSRC); + pci_set_flags(PCI_REASSIGN_ALL_RSRC); return 1; } } diff -uprN linux-2.6.32/arch/powerpc/platforms/44x/sam440ep.c linux-2.6.32.ovz/arch/powerpc/platforms/44x/sam440ep.c --- linux-2.6.32/arch/powerpc/platforms/44x/sam440ep.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/44x/sam440ep.c 2015-03-10 13:23:53.000000000 -0700 @@ -51,7 +51,7 @@ static int __init sam440ep_probe(void) if (!of_flat_dt_is_compatible(root, "acube,sam440ep")) return 0; - ppc_pci_set_flags(PPC_PCI_REASSIGN_ALL_RSRC); + pci_set_flags(PCI_REASSIGN_ALL_RSRC); return 1; } diff -uprN linux-2.6.32/arch/powerpc/platforms/52xx/mpc52xx_pci.c linux-2.6.32.ovz/arch/powerpc/platforms/52xx/mpc52xx_pci.c --- linux-2.6.32/arch/powerpc/platforms/52xx/mpc52xx_pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/52xx/mpc52xx_pci.c 2015-03-10 13:23:53.000000000 -0700 @@ -371,7 +371,7 @@ mpc52xx_add_bridge(struct device_node *n pr_debug("Adding MPC52xx PCI host bridge %s\n", node->full_name); - ppc_pci_add_flags(PPC_PCI_REASSIGN_ALL_BUS); + pci_add_flags(PCI_REASSIGN_ALL_BUS); if (of_address_to_resource(node, 0, &rsrc) != 0) { printk(KERN_ERR "Can't get %s resources\n", node->full_name); diff -uprN linux-2.6.32/arch/powerpc/platforms/82xx/pq2.c linux-2.6.32.ovz/arch/powerpc/platforms/82xx/pq2.c --- linux-2.6.32/arch/powerpc/platforms/82xx/pq2.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/82xx/pq2.c 2015-03-10 13:23:53.000000000 -0700 @@ -53,7 +53,7 @@ static void __init pq2_pci_add_bridge(st if (of_address_to_resource(np, 0, &r) || r.end - r.start < 0x10b) goto err; - ppc_pci_add_flags(PPC_PCI_REASSIGN_ALL_BUS); + pci_add_flags(PCI_REASSIGN_ALL_BUS); hose = pcibios_alloc_controller(np); if (!hose) diff -uprN linux-2.6.32/arch/powerpc/platforms/cell/iommu.c linux-2.6.32.ovz/arch/powerpc/platforms/cell/iommu.c --- linux-2.6.32/arch/powerpc/platforms/cell/iommu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/cell/iommu.c 2015-03-10 13:23:58.000000000 -0700 @@ -476,7 +476,7 @@ cell_iommu_setup_window(struct cbe_iommu ioid = cell_iommu_get_ioid(np); - window = kmalloc_node(sizeof(*window), GFP_KERNEL, iommu->nid); + window = kzalloc_node(sizeof(*window), GFP_KERNEL, iommu->nid); BUG_ON(window == NULL); window->offset = offset; @@ -518,7 +518,6 @@ cell_iommu_setup_window(struct cbe_iommu __set_bit(0, window->table.it_map); tce_build_cell(&window->table, window->table.it_offset, 1, (unsigned long)iommu->pad_page, DMA_TO_DEVICE, NULL); - window->table.it_hint = window->table.it_blocksize; return window; } diff -uprN linux-2.6.32/arch/powerpc/platforms/cell/spufs/syscalls.c linux-2.6.32.ovz/arch/powerpc/platforms/cell/spufs/syscalls.c --- linux-2.6.32/arch/powerpc/platforms/cell/spufs/syscalls.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/cell/spufs/syscalls.c 2015-03-10 13:23:54.000000000 -0700 @@ -61,7 +61,7 @@ out: static long do_spu_create(const char __user *pathname, unsigned int flags, mode_t mode, struct file *neighbor) { - char *tmp; + struct filename *tmp; int ret; tmp = getname(pathname); @@ -69,7 +69,7 @@ static long do_spu_create(const char __u if (!IS_ERR(tmp)) { struct nameidata nd; - ret = path_lookup(tmp, LOOKUP_PARENT, &nd); + ret = path_lookup(tmp->name, LOOKUP_PARENT, &nd); if (!ret) { nd.flags |= LOOKUP_OPEN | LOOKUP_CREATE; ret = spufs_create(&nd, flags, mode, neighbor); diff -uprN linux-2.6.32/arch/powerpc/platforms/chrp/pci.c linux-2.6.32.ovz/arch/powerpc/platforms/chrp/pci.c --- linux-2.6.32/arch/powerpc/platforms/chrp/pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/chrp/pci.c 2015-03-10 13:23:53.000000000 -0700 @@ -199,7 +199,7 @@ static void __init setup_peg2(struct pci printk ("RTAS supporting Pegasos OF not found, please upgrade" " your firmware\n"); } - ppc_pci_add_flags(PPC_PCI_REASSIGN_ALL_BUS); + pci_add_flags(PCI_REASSIGN_ALL_BUS); /* keep the reference to the root node */ } diff -uprN linux-2.6.32/arch/powerpc/platforms/fsl_uli1575.c linux-2.6.32.ovz/arch/powerpc/platforms/fsl_uli1575.c --- linux-2.6.32/arch/powerpc/platforms/fsl_uli1575.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/fsl_uli1575.c 2015-03-10 13:22:50.000000000 -0700 @@ -222,6 +222,7 @@ static void __devinit quirk_final_uli524 int i; u8 *dummy; struct pci_bus *bus = dev->bus; + struct resource *res; resource_size_t end = 0; for (i = PCI_BRIDGE_RESOURCES; i < PCI_BRIDGE_RESOURCES+3; i++) { @@ -230,13 +231,12 @@ static void __devinit quirk_final_uli524 end = pci_resource_end(dev, i); } - for (i = 0; i < PCI_BUS_NUM_RESOURCES; i++) { - if ((bus->resource[i]) && - (bus->resource[i]->flags & IORESOURCE_MEM)) { - if (bus->resource[i]->end == end) - dummy = ioremap(bus->resource[i]->start, 0x4); + pci_bus_for_each_resource(bus, res, i) { + if (res && res->flags & IORESOURCE_MEM) { + if (res->end == end) + dummy = ioremap(res->start, 0x4); else - dummy = ioremap(bus->resource[i]->end - 3, 0x4); + dummy = ioremap(res->end - 3, 0x4); if (dummy) { in_8(dummy); iounmap(dummy); diff -uprN linux-2.6.32/arch/powerpc/platforms/iseries/iommu.c linux-2.6.32.ovz/arch/powerpc/platforms/iseries/iommu.c --- linux-2.6.32/arch/powerpc/platforms/iseries/iommu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/iseries/iommu.c 2015-03-10 13:22:46.000000000 -0700 @@ -183,7 +183,7 @@ static void pci_dma_dev_setup_iseries(st BUG_ON(lsn == NULL); - tbl = kmalloc(sizeof(struct iommu_table), GFP_KERNEL); + tbl = kzalloc(sizeof(struct iommu_table), GFP_KERNEL); iommu_table_getparms_iSeries(pdn->busno, *lsn, 0, tbl); diff -uprN linux-2.6.32/arch/powerpc/platforms/iseries/pci.c linux-2.6.32.ovz/arch/powerpc/platforms/iseries/pci.c --- linux-2.6.32/arch/powerpc/platforms/iseries/pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/iseries/pci.c 2015-03-10 13:23:53.000000000 -0700 @@ -868,7 +868,7 @@ void __init iSeries_pcibios_init(void) /* Install IO hooks */ ppc_pci_io = iseries_pci_io; - pci_probe_only = 1; + pci_add_flags(PCI_PROBE_ONLY); /* iSeries has no IO space in the common sense, it needs to set * the IO base to 0 diff -uprN linux-2.6.32/arch/powerpc/platforms/maple/pci.c linux-2.6.32.ovz/arch/powerpc/platforms/maple/pci.c --- linux-2.6.32/arch/powerpc/platforms/maple/pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/maple/pci.c 2015-03-10 13:23:53.000000000 -0700 @@ -565,7 +565,7 @@ void __init maple_pci_init(void) } /* Tell pci.c to not change any resource allocations. */ - pci_probe_only = 1; + pci_add_flags(PCI_PROBE_ONLY); } int maple_pci_get_legacy_ide_irq(struct pci_dev *pdev, int channel) diff -uprN linux-2.6.32/arch/powerpc/platforms/pasemi/pci.c linux-2.6.32.ovz/arch/powerpc/platforms/pasemi/pci.c --- linux-2.6.32/arch/powerpc/platforms/pasemi/pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pasemi/pci.c 2015-03-10 13:23:53.000000000 -0700 @@ -231,7 +231,7 @@ void __init pas_pci_init(void) pci_devs_phb_init(); /* Use the common resource allocation mechanism */ - pci_probe_only = 1; + pci_add_flags(PCI_PROBE_ONLY); } void __iomem *pasemi_pci_getcfgaddr(struct pci_dev *dev, int offset) diff -uprN linux-2.6.32/arch/powerpc/platforms/powermac/pci.c linux-2.6.32.ovz/arch/powerpc/platforms/powermac/pci.c --- linux-2.6.32/arch/powerpc/platforms/powermac/pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/powermac/pci.c 2015-03-10 13:23:53.000000000 -0700 @@ -731,7 +731,7 @@ static void __init setup_bandit(struct p static int __init setup_uninorth(struct pci_controller *hose, struct resource *addr) { - ppc_pci_add_flags(PPC_PCI_REASSIGN_ALL_BUS); + pci_add_flags(PCI_REASSIGN_ALL_BUS); has_uninorth = 1; hose->ops = ¯isc_pci_ops; hose->cfg_addr = ioremap(addr->start + 0x800000, 0x1000); @@ -998,7 +998,7 @@ void __init pmac_pci_init(void) struct device_node *np, *root; struct device_node *ht = NULL; - ppc_pci_set_flags(PPC_PCI_CAN_SKIP_ISA_ALIGN); + pci_set_flags(PCI_CAN_SKIP_ISA_ALIGN); root = of_find_node_by_path("/"); if (root == NULL) { @@ -1045,9 +1045,6 @@ void __init pmac_pci_init(void) } /* pmac_check_ht_link(); */ - /* We can allocate missing resources if any */ - pci_probe_only = 0; - #else /* CONFIG_PPC64 */ init_p2pbridge(); init_second_ohare(); @@ -1057,7 +1054,7 @@ void __init pmac_pci_init(void) * some offset between bus number and domains for now when we * assign all busses should help for now */ - if (ppc_pci_has_flag(PPC_PCI_REASSIGN_ALL_BUS)) + if (pci_has_flag(PCI_REASSIGN_ALL_BUS)) pcibios_assign_bus_offset = 0x10; #endif } diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/cmm.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/cmm.c --- linux-2.6.32/arch/powerpc/platforms/pseries/cmm.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/cmm.c 2015-03-10 13:22:05.000000000 -0700 @@ -38,19 +38,28 @@ #include #include #include +#include #include "plpar_wrappers.h" #define CMM_DRIVER_VERSION "1.0.0" #define CMM_DEFAULT_DELAY 1 +#define CMM_HOTPLUG_DELAY 5 #define CMM_DEBUG 0 #define CMM_DISABLE 0 #define CMM_OOM_KB 1024 #define CMM_MIN_MEM_MB 256 #define KB2PAGES(_p) ((_p)>>(PAGE_SHIFT-10)) #define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) +/* + * The priority level tries to ensure that this notifier is called as + * late as possible to reduce thrashing in the shared memory pool. + */ +#define CMM_MEM_HOTPLUG_PRI 1 +#define CMM_MEM_ISOLATE_PRI 15 static unsigned int delay = CMM_DEFAULT_DELAY; +static unsigned int hotplug_delay = CMM_HOTPLUG_DELAY; static unsigned int oom_kb = CMM_OOM_KB; static unsigned int cmm_debug = CMM_DEBUG; static unsigned int cmm_disabled = CMM_DISABLE; @@ -65,6 +74,10 @@ MODULE_VERSION(CMM_DRIVER_VERSION); module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. " "[Default=" __stringify(CMM_DEFAULT_DELAY) "]"); +module_param_named(hotplug_delay, hotplug_delay, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(delay, "Delay (in seconds) after memory hotplug remove " + "before loaning resumes. " + "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]"); module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. " "[Default=" __stringify(CMM_OOM_KB) "]"); @@ -92,6 +105,9 @@ static unsigned long oom_freed_pages; static struct cmm_page_array *cmm_page_list; static DEFINE_SPINLOCK(cmm_lock); +static DEFINE_MUTEX(hotplug_mutex); +static int hotplug_occurred; /* protected by the hotplug mutex */ + static struct task_struct *cmm_thread_ptr; /** @@ -110,6 +126,17 @@ static long cmm_alloc_pages(long nr) cmm_dbg("Begin request for %ld pages\n", nr); while (nr) { + /* Exit if a hotplug operation is in progress or occurred */ + if (mutex_trylock(&hotplug_mutex)) { + if (hotplug_occurred) { + mutex_unlock(&hotplug_mutex); + break; + } + mutex_unlock(&hotplug_mutex); + } else { + break; + } + addr = __get_free_page(GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC); if (!addr) @@ -119,8 +146,9 @@ static long cmm_alloc_pages(long nr) if (!pa || pa->index >= CMM_NR_PAGES) { /* Need a new page for the page list. */ spin_unlock(&cmm_lock); - npa = (struct cmm_page_array *)__get_free_page(GFP_NOIO | __GFP_NOWARN | - __GFP_NORETRY | __GFP_NOMEMALLOC); + npa = (struct cmm_page_array *)__get_free_page( + GFP_NOIO | __GFP_NOWARN | + __GFP_NORETRY | __GFP_NOMEMALLOC); if (!npa) { pr_info("%s: Can not allocate new page list\n", __func__); free_page(addr); @@ -229,8 +257,9 @@ static void cmm_get_mpp(void) { int rc; struct hvcall_mpp_data mpp_data; - unsigned long active_pages_target; - signed long page_loan_request; + signed long active_pages_target, page_loan_request, target; + signed long total_pages = totalram_pages + loaned_pages; + signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE; rc = h_get_mpp(&mpp_data); @@ -238,17 +267,25 @@ static void cmm_get_mpp(void) return; page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE); - loaned_pages_target = page_loan_request + loaned_pages; - if (loaned_pages_target > oom_freed_pages) - loaned_pages_target -= oom_freed_pages; + target = page_loan_request + (signed long)loaned_pages; + + if (target < 0 || total_pages < min_mem_pages) + target = 0; + + if (target > oom_freed_pages) + target -= oom_freed_pages; else - loaned_pages_target = 0; + target = 0; + + active_pages_target = total_pages - target; - active_pages_target = totalram_pages + loaned_pages - loaned_pages_target; + if (min_mem_pages > active_pages_target) + target = total_pages - min_mem_pages; - if ((min_mem_mb * 1024 * 1024) > (active_pages_target * PAGE_SIZE)) - loaned_pages_target = totalram_pages + loaned_pages - - ((min_mem_mb * 1024 * 1024) / PAGE_SIZE); + if (target < 0) + target = 0; + + loaned_pages_target = target; cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n", page_loan_request, loaned_pages, loaned_pages_target, @@ -273,9 +310,28 @@ static int cmm_thread(void *dummy) while (1) { timeleft = msleep_interruptible(delay * 1000); - if (kthread_should_stop() || timeleft) { - loaned_pages_target = loaned_pages; + if (kthread_should_stop() || timeleft) break; + + if (mutex_trylock(&hotplug_mutex)) { + if (hotplug_occurred) { + hotplug_occurred = 0; + mutex_unlock(&hotplug_mutex); + cmm_dbg("Hotplug operation has occurred, " + "loaning activity suspended " + "for %d seconds.\n", + hotplug_delay); + timeleft = msleep_interruptible(hotplug_delay * + 1000); + if (kthread_should_stop() || timeleft) + break; + continue; + } + mutex_unlock(&hotplug_mutex); + } else { + cmm_dbg("Hotplug operation in progress, activity " + "suspended\n"); + continue; } cmm_get_mpp(); @@ -405,6 +461,193 @@ static struct notifier_block cmm_reboot_ }; /** + * cmm_count_pages - Count the number of pages loaned in a particular range. + * + * @arg: memory_isolate_notify structure with address range and count + * + * Return value: + * 0 on success + **/ +static unsigned long cmm_count_pages(void *arg) +{ + struct memory_isolate_notify *marg = arg; + struct cmm_page_array *pa; + unsigned long start = (unsigned long)pfn_to_kaddr(marg->start_pfn); + unsigned long end = start + (marg->nr_pages << PAGE_SHIFT); + unsigned long idx; + + spin_lock(&cmm_lock); + pa = cmm_page_list; + while (pa) { + if ((unsigned long)pa >= start && (unsigned long)pa < end) + marg->pages_found++; + for (idx = 0; idx < pa->index; idx++) + if (pa->page[idx] >= start && pa->page[idx] < end) + marg->pages_found++; + pa = pa->next; + } + spin_unlock(&cmm_lock); + return 0; +} + +/** + * cmm_memory_isolate_cb - Handle memory isolation notifier calls + * @self: notifier block struct + * @action: action to take + * @arg: struct memory_isolate_notify data for handler + * + * Return value: + * NOTIFY_OK or notifier error based on subfunction return value + **/ +static int cmm_memory_isolate_cb(struct notifier_block *self, + unsigned long action, void *arg) +{ + int ret = 0; + + if (action == MEM_ISOLATE_COUNT) + ret = cmm_count_pages(arg); + + if (ret) + ret = notifier_from_errno(ret); + else + ret = NOTIFY_OK; + + return ret; +} + +static struct notifier_block cmm_mem_isolate_nb = { + .notifier_call = cmm_memory_isolate_cb, + .priority = CMM_MEM_ISOLATE_PRI +}; + +/** + * cmm_mem_going_offline - Unloan pages where memory is to be removed + * @arg: memory_notify structure with page range to be offlined + * + * Return value: + * 0 on success + **/ +static int cmm_mem_going_offline(void *arg) +{ + struct memory_notify *marg = arg; + unsigned long start_page = (unsigned long)pfn_to_kaddr(marg->start_pfn); + unsigned long end_page = start_page + (marg->nr_pages << PAGE_SHIFT); + struct cmm_page_array *pa_curr, *pa_last, *npa; + unsigned long idx; + unsigned long freed = 0; + + cmm_dbg("Memory going offline, searching 0x%lx (%ld pages).\n", + start_page, marg->nr_pages); + spin_lock(&cmm_lock); + + /* Search the page list for pages in the range to be offlined */ + pa_last = pa_curr = cmm_page_list; + while (pa_curr) { + for (idx = (pa_curr->index - 1); (idx + 1) > 0; idx--) { + if ((pa_curr->page[idx] < start_page) || + (pa_curr->page[idx] >= end_page)) + continue; + + plpar_page_set_active(__pa(pa_curr->page[idx])); + free_page(pa_curr->page[idx]); + freed++; + loaned_pages--; + totalram_pages++; + pa_curr->page[idx] = pa_last->page[--pa_last->index]; + if (pa_last->index == 0) { + if (pa_curr == pa_last) + pa_curr = pa_last->next; + pa_last = pa_last->next; + free_page((unsigned long)cmm_page_list); + cmm_page_list = pa_last; + continue; + } + } + pa_curr = pa_curr->next; + } + + /* Search for page list structures in the range to be offlined */ + pa_last = NULL; + pa_curr = cmm_page_list; + while (pa_curr) { + if (((unsigned long)pa_curr >= start_page) && + ((unsigned long)pa_curr < end_page)) { + npa = (struct cmm_page_array *)__get_free_page( + GFP_NOIO | __GFP_NOWARN | + __GFP_NORETRY | __GFP_NOMEMALLOC); + if (!npa) { + spin_unlock(&cmm_lock); + cmm_dbg("Failed to allocate memory for list " + "management. Memory hotplug " + "failed.\n"); + return ENOMEM; + } + memcpy(npa, pa_curr, PAGE_SIZE); + if (pa_curr == cmm_page_list) + cmm_page_list = npa; + if (pa_last) + pa_last->next = npa; + free_page((unsigned long) pa_curr); + freed++; + pa_curr = npa; + } + + pa_last = pa_curr; + pa_curr = pa_curr->next; + } + + spin_unlock(&cmm_lock); + cmm_dbg("Released %ld pages in the search range.\n", freed); + + return 0; +} + +/** + * cmm_memory_cb - Handle memory hotplug notifier calls + * @self: notifier block struct + * @action: action to take + * @arg: struct memory_notify data for handler + * + * Return value: + * NOTIFY_OK or notifier error based on subfunction return value + * + **/ +static int cmm_memory_cb(struct notifier_block *self, + unsigned long action, void *arg) +{ + int ret = 0; + + switch (action) { + case MEM_GOING_OFFLINE: + mutex_lock(&hotplug_mutex); + hotplug_occurred = 1; + ret = cmm_mem_going_offline(arg); + break; + case MEM_OFFLINE: + case MEM_CANCEL_OFFLINE: + mutex_unlock(&hotplug_mutex); + cmm_dbg("Memory offline operation complete.\n"); + break; + case MEM_GOING_ONLINE: + case MEM_ONLINE: + case MEM_CANCEL_ONLINE: + break; + } + + if (ret) + ret = notifier_from_errno(ret); + else + ret = NOTIFY_OK; + + return ret; +} + +static struct notifier_block cmm_mem_nb = { + .notifier_call = cmm_memory_cb, + .priority = CMM_MEM_HOTPLUG_PRI +}; + +/** * cmm_init - Module initialization * * Return value: @@ -426,18 +669,24 @@ static int cmm_init(void) if ((rc = cmm_sysfs_register(&cmm_sysdev))) goto out_reboot_notifier; + if (register_memory_notifier(&cmm_mem_nb) || + register_memory_isolate_notifier(&cmm_mem_isolate_nb)) + goto out_unregister_notifier; + if (cmm_disabled) return rc; cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); if (IS_ERR(cmm_thread_ptr)) { rc = PTR_ERR(cmm_thread_ptr); - goto out_unregister_sysfs; + goto out_unregister_notifier; } return rc; -out_unregister_sysfs: +out_unregister_notifier: + unregister_memory_notifier(&cmm_mem_nb); + unregister_memory_isolate_notifier(&cmm_mem_isolate_nb); cmm_unregister_sysfs(&cmm_sysdev); out_reboot_notifier: unregister_reboot_notifier(&cmm_reboot_nb); @@ -458,6 +707,8 @@ static void cmm_exit(void) kthread_stop(cmm_thread_ptr); unregister_oom_notifier(&cmm_oom_nb); unregister_reboot_notifier(&cmm_reboot_nb); + unregister_memory_notifier(&cmm_mem_nb); + unregister_memory_isolate_notifier(&cmm_mem_isolate_nb); cmm_free_pages(loaned_pages); cmm_unregister_sysfs(&cmm_sysdev); } diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/dlpar.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/dlpar.c --- linux-2.6.32/arch/powerpc/platforms/pseries/dlpar.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/dlpar.c 2015-03-10 13:21:41.000000000 -0700 @@ -0,0 +1,563 @@ +/* + * Support for dynamic reconfiguration for PCI, Memory, and CPU + * Hotplug and Dynamic Logical Partitioning on RPA platforms. + * + * Copyright (C) 2009 Nathan Fontenot + * Copyright (C) 2009 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include "offline_states.h" + +#include +#include +#include +#include +#include + +struct cc_workarea { + u32 drc_index; + u32 zero; + u32 name_offset; + u32 prop_length; + u32 prop_offset; +}; + +static void dlpar_free_cc_property(struct property *prop) +{ + kfree(prop->name); + kfree(prop->value); + kfree(prop); +} + +static struct property *dlpar_parse_cc_property(struct cc_workarea *ccwa) +{ + struct property *prop; + char *name; + char *value; + + prop = kzalloc(sizeof(*prop), GFP_KERNEL); + if (!prop) + return NULL; + + name = (char *)ccwa + ccwa->name_offset; + prop->name = kstrdup(name, GFP_KERNEL); + if (!prop->name) { + dlpar_free_cc_property(prop); + return NULL; + } + + prop->length = ccwa->prop_length; + value = (char *)ccwa + ccwa->prop_offset; + prop->value = kzalloc(prop->length, GFP_KERNEL); + if (!prop->value) { + dlpar_free_cc_property(prop); + return NULL; + } + + memcpy(prop->value, value, prop->length); + return prop; +} + +static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa) +{ + struct device_node *dn; + char *name; + + dn = kzalloc(sizeof(*dn), GFP_KERNEL); + if (!dn) + return NULL; + + /* The configure connector reported name does not contain a + * preceeding '/', so we allocate a buffer large enough to + * prepend this to the full_name. + */ + name = (char *)ccwa + ccwa->name_offset; + dn->full_name = kmalloc(strlen(name) + 2, GFP_KERNEL); + if (!dn->full_name) { + kfree(dn); + return NULL; + } + + sprintf(dn->full_name, "/%s", name); + return dn; +} + +static void dlpar_free_one_cc_node(struct device_node *dn) +{ + struct property *prop; + + while (dn->properties) { + prop = dn->properties; + dn->properties = prop->next; + dlpar_free_cc_property(prop); + } + + kfree(dn->full_name); + kfree(dn); +} + +static void dlpar_free_cc_nodes(struct device_node *dn) +{ + if (dn->child) + dlpar_free_cc_nodes(dn->child); + + if (dn->sibling) + dlpar_free_cc_nodes(dn->sibling); + + dlpar_free_one_cc_node(dn); +} + +#define NEXT_SIBLING 1 +#define NEXT_CHILD 2 +#define NEXT_PROPERTY 3 +#define PREV_PARENT 4 +#define MORE_MEMORY 5 +#define CALL_AGAIN -2 +#define ERR_CFG_USE -9003 + +struct device_node *dlpar_configure_connector(u32 drc_index) +{ + struct device_node *dn; + struct device_node *first_dn = NULL; + struct device_node *last_dn = NULL; + struct property *property; + struct property *last_property = NULL; + struct cc_workarea *ccwa; + int cc_token; + int rc; + + cc_token = rtas_token("ibm,configure-connector"); + if (cc_token == RTAS_UNKNOWN_SERVICE) + return NULL; + + spin_lock(&rtas_data_buf_lock); + ccwa = (struct cc_workarea *)&rtas_data_buf[0]; + ccwa->drc_index = drc_index; + ccwa->zero = 0; + + rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL); + while (rc) { + switch (rc) { + case NEXT_SIBLING: + dn = dlpar_parse_cc_node(ccwa); + if (!dn) + goto cc_error; + + dn->parent = last_dn->parent; + last_dn->sibling = dn; + last_dn = dn; + break; + + case NEXT_CHILD: + dn = dlpar_parse_cc_node(ccwa); + if (!dn) + goto cc_error; + + if (!first_dn) + first_dn = dn; + else { + dn->parent = last_dn; + if (last_dn) + last_dn->child = dn; + } + + last_dn = dn; + break; + + case NEXT_PROPERTY: + property = dlpar_parse_cc_property(ccwa); + if (!property) + goto cc_error; + + if (!last_dn->properties) + last_dn->properties = property; + else + last_property->next = property; + + last_property = property; + break; + + case PREV_PARENT: + last_dn = last_dn->parent; + break; + + case CALL_AGAIN: + break; + + case MORE_MEMORY: + case ERR_CFG_USE: + default: + printk(KERN_ERR "Unexpected Error (%d) " + "returned from configure-connector\n", rc); + goto cc_error; + } + + rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL); + } + + spin_unlock(&rtas_data_buf_lock); + return first_dn; + +cc_error: + if (first_dn) + dlpar_free_cc_nodes(first_dn); + spin_unlock(&rtas_data_buf_lock); + return NULL; +} + +static struct device_node *derive_parent(const char *path) +{ + struct device_node *parent; + char *last_slash; + + last_slash = strrchr(path, '/'); + if (last_slash == path) { + parent = of_find_node_by_path("/"); + } else { + char *parent_path; + int parent_path_len = last_slash - path + 1; + parent_path = kmalloc(parent_path_len, GFP_KERNEL); + if (!parent_path) + return NULL; + + strlcpy(parent_path, path, parent_path_len); + parent = of_find_node_by_path(parent_path); + kfree(parent_path); + } + + return parent; +} + +int dlpar_attach_node(struct device_node *dn) +{ + struct proc_dir_entry *ent; + int rc; + + of_node_set_flag(dn, OF_DYNAMIC); + kref_init(&dn->kref); + dn->parent = derive_parent(dn->full_name); + if (!dn->parent) + return -ENOMEM; + + rc = blocking_notifier_call_chain(&pSeries_reconfig_chain, + PSERIES_RECONFIG_ADD, dn); + if (rc == NOTIFY_BAD) { + printk(KERN_ERR "Failed to add device node %s\n", + dn->full_name); + return -ENOMEM; /* For now, safe to assume kmalloc failure */ + } + + of_attach_node(dn); + +#ifdef CONFIG_PROC_DEVICETREE + ent = proc_mkdir(strrchr(dn->full_name, '/') + 1, dn->parent->pde); + if (ent) + proc_device_tree_add_node(dn, ent); +#endif + + of_node_put(dn->parent); + return 0; +} + +int dlpar_detach_node(struct device_node *dn) +{ + struct device_node *parent = dn->parent; + struct property *prop = dn->properties; + +#ifdef CONFIG_PROC_DEVICETREE + while (prop) { + remove_proc_entry(prop->name, dn->pde); + prop = prop->next; + } + + if (dn->pde) + remove_proc_entry(dn->pde->name, parent->pde); +#endif + + blocking_notifier_call_chain(&pSeries_reconfig_chain, + PSERIES_RECONFIG_REMOVE, dn); + of_detach_node(dn); + of_node_put(dn); /* Must decrement the refcount */ + + return 0; +} + +#define DR_ENTITY_SENSE 9003 +#define DR_ENTITY_PRESENT 1 +#define DR_ENTITY_UNUSABLE 2 +#define ALLOCATION_STATE 9003 +#define ALLOC_UNUSABLE 0 +#define ALLOC_USABLE 1 +#define ISOLATION_STATE 9001 +#define ISOLATE 0 +#define UNISOLATE 1 + +int dlpar_acquire_drc(u32 drc_index) +{ + int dr_status, rc; + + rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status, + DR_ENTITY_SENSE, drc_index); + if (rc || dr_status != DR_ENTITY_UNUSABLE) + return rc ? rc : -EIO; + + rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_USABLE); + if (rc) + return rc; + + rc = rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE); + if (rc) { + rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE); + return rc; + } + + return 0; +} + +int dlpar_release_drc(u32 drc_index) +{ + int dr_status, rc; + + rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status, + DR_ENTITY_SENSE, drc_index); + if (rc || dr_status != DR_ENTITY_PRESENT) + return rc ? rc : -EIO; + + rc = rtas_set_indicator(ISOLATION_STATE, drc_index, ISOLATE); + if (rc) + return rc; + + rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE); + if (rc) { + rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE); + return rc; + } + + return 0; +} + +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE + +static DEFINE_MUTEX(pseries_cpu_hotplug_mutex); + +void cpu_hotplug_driver_lock(void) +__acquires(pseries_cpu_hotplug_mutex) +{ + mutex_lock(&pseries_cpu_hotplug_mutex); +} + +void cpu_hotplug_driver_unlock(void) +__releases(pseries_cpu_hotplug_mutex) +{ + mutex_unlock(&pseries_cpu_hotplug_mutex); +} + +static int dlpar_online_cpu(struct device_node *dn) +{ + int rc = 0; + unsigned int cpu; + int len, nthreads, i; + const u32 *intserv; + + intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); + if (!intserv) + return -EINVAL; + + nthreads = len / sizeof(u32); + + cpu_maps_update_begin(); + for (i = 0; i < nthreads; i++) { + for_each_present_cpu(cpu) { + if (get_hard_smp_processor_id(cpu) != intserv[i]) + continue; + BUG_ON(get_cpu_current_state(cpu) + != CPU_STATE_OFFLINE); + cpu_maps_update_done(); + rc = cpu_up(cpu); + if (rc) + goto out; + cpu_maps_update_begin(); + + break; + } + if (cpu == num_possible_cpus()) + printk(KERN_WARNING "Could not find cpu to online " + "with physical id 0x%x\n", intserv[i]); + } + cpu_maps_update_done(); + +out: + return rc; + +} + +static ssize_t dlpar_cpu_probe(const char *buf, size_t count) +{ + struct device_node *dn; + unsigned long drc_index; + char *cpu_name; + int rc; + + cpu_hotplug_driver_lock(); + rc = strict_strtoul(buf, 0, &drc_index); + if (rc) { + rc = -EINVAL; + goto out; + } + + dn = dlpar_configure_connector(drc_index); + if (!dn) { + rc = -EINVAL; + goto out; + } + + /* configure-connector reports cpus as living in the base + * directory of the device tree. CPUs actually live in the + * cpus directory so we need to fixup the full_name. + */ + cpu_name = kasprintf(GFP_KERNEL, "/cpus%s", dn->full_name); + if (!cpu_name) { + dlpar_free_cc_nodes(dn); + rc = -ENOMEM; + goto out; + } + + kfree(dn->full_name); + dn->full_name = cpu_name; + + rc = dlpar_acquire_drc(drc_index); + if (rc) { + dlpar_free_cc_nodes(dn); + rc = -EINVAL; + goto out; + } + + rc = dlpar_attach_node(dn); + if (rc) { + dlpar_release_drc(drc_index); + dlpar_free_cc_nodes(dn); + } + + rc = dlpar_online_cpu(dn); +out: + cpu_hotplug_driver_unlock(); + + return rc ? rc : count; +} + +static int dlpar_offline_cpu(struct device_node *dn) +{ + int rc = 0; + unsigned int cpu; + int len, nthreads, i; + const u32 *intserv; + + intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); + if (!intserv) + return -EINVAL; + + nthreads = len / sizeof(u32); + + cpu_maps_update_begin(); + for (i = 0; i < nthreads; i++) { + for_each_present_cpu(cpu) { + if (get_hard_smp_processor_id(cpu) != intserv[i]) + continue; + + if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE) + break; + + if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) { + set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); + cpu_maps_update_done(); + rc = cpu_down(cpu); + if (rc) + goto out; + cpu_maps_update_begin(); + break; + + } + + /* + * The cpu is in CPU_STATE_INACTIVE. + * Upgrade it's state to CPU_STATE_OFFLINE. + */ + set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); + BUG_ON(plpar_hcall_norets(H_PROD, intserv[i]) + != H_SUCCESS); + __cpu_die(cpu); + break; + } + if (cpu == num_possible_cpus()) + printk(KERN_WARNING "Could not find cpu to offline " + "with physical id 0x%x\n", intserv[i]); + } + cpu_maps_update_done(); + +out: + return rc; + +} + +static ssize_t dlpar_cpu_release(const char *buf, size_t count) +{ + struct device_node *dn; + const u32 *drc_index; + int rc; + + dn = of_find_node_by_path(buf); + if (!dn) + return -EINVAL; + + drc_index = of_get_property(dn, "ibm,my-drc-index", NULL); + if (!drc_index) { + of_node_put(dn); + return -EINVAL; + } + + cpu_hotplug_driver_lock(); + rc = dlpar_offline_cpu(dn); + if (rc) { + of_node_put(dn); + rc = -EINVAL; + goto out; + } + + rc = dlpar_release_drc(*drc_index); + if (rc) { + of_node_put(dn); + goto out; + } + + rc = dlpar_detach_node(dn); + if (rc) { + dlpar_acquire_drc(*drc_index); + goto out; + } + + of_node_put(dn); +out: + cpu_hotplug_driver_unlock(); + return rc ? rc : count; +} + +static int __init pseries_dlpar_init(void) +{ + ppc_md.cpu_probe = dlpar_cpu_probe; + ppc_md.cpu_release = dlpar_cpu_release; + + return 0; +} +machine_device_initcall(pseries, pseries_dlpar_init); + +#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/dtl.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/dtl.c --- linux-2.6.32/arch/powerpc/platforms/pseries/dtl.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/dtl.c 2015-03-10 13:22:52.000000000 -0700 @@ -22,37 +22,22 @@ #include #include +#include #include #include #include #include +#include #include "plpar_wrappers.h" -/* - * Layout of entries in the hypervisor's DTL buffer. Although we don't - * actually access the internals of an entry (we only need to know the size), - * we might as well define it here for reference. - */ -struct dtl_entry { - u8 dispatch_reason; - u8 preempt_reason; - u16 processor_id; - u32 enqueue_to_dispatch_time; - u32 ready_to_enqueue_time; - u32 waiting_to_ready_time; - u64 timebase; - u64 fault_addr; - u64 srr0; - u64 srr1; -}; - struct dtl { struct dtl_entry *buf; struct dentry *file; int cpu; int buf_entries; u64 last_idx; + spinlock_t lock; }; static DEFINE_PER_CPU(struct dtl, dtl); @@ -71,25 +56,97 @@ static u8 dtl_event_mask = 0x7; static int dtl_buf_entries = (16 * 85); -static int dtl_enable(struct dtl *dtl) +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +struct dtl_ring { + u64 write_index; + struct dtl_entry *write_ptr; + struct dtl_entry *buf; + struct dtl_entry *buf_end; + u8 saved_dtl_mask; +}; + +static DEFINE_PER_CPU(struct dtl_ring, dtl_rings); + +static atomic_t dtl_count; + +/* + * The cpu accounting code controls the DTL ring buffer, and we get + * given entries as they are processed. + */ +static void consume_dtle(struct dtl_entry *dtle, u64 index) { - unsigned long addr; - int ret, hwcpu; + struct dtl_ring *dtlr = &__get_cpu_var(dtl_rings); + struct dtl_entry *wp = dtlr->write_ptr; + struct lppaca *vpa = local_paca->lppaca_ptr; + + if (!wp) + return; + + *wp = *dtle; + barrier(); + + /* check for hypervisor ring buffer overflow, ignore this entry if so */ + if (index + N_DISPATCH_LOG < vpa->dtl_idx) + return; + + ++wp; + if (wp == dtlr->buf_end) + wp = dtlr->buf; + dtlr->write_ptr = wp; - /* only allow one reader */ - if (dtl->buf) - return -EBUSY; + /* incrementing write_index makes the new entry visible */ + smp_wmb(); + ++dtlr->write_index; +} - /* we need to store the original allocation size for use during read */ - dtl->buf_entries = dtl_buf_entries; +static int dtl_start(struct dtl *dtl) +{ + struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu); - dtl->buf = kmalloc_node(dtl->buf_entries * sizeof(struct dtl_entry), - GFP_KERNEL, cpu_to_node(dtl->cpu)); - if (!dtl->buf) { - printk(KERN_WARNING "%s: buffer alloc failed for cpu %d\n", - __func__, dtl->cpu); - return -ENOMEM; - } + dtlr->buf = dtl->buf; + dtlr->buf_end = dtl->buf + dtl->buf_entries; + dtlr->write_index = 0; + + /* setting write_ptr enables logging into our buffer */ + smp_wmb(); + dtlr->write_ptr = dtl->buf; + + /* enable event logging */ + dtlr->saved_dtl_mask = lppaca[dtl->cpu].dtl_enable_mask; + lppaca[dtl->cpu].dtl_enable_mask |= dtl_event_mask; + + dtl_consumer = consume_dtle; + atomic_inc(&dtl_count); + return 0; +} + +static void dtl_stop(struct dtl *dtl) +{ + struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu); + + dtlr->write_ptr = NULL; + smp_wmb(); + + dtlr->buf = NULL; + + /* restore dtl_enable_mask */ + lppaca[dtl->cpu].dtl_enable_mask = dtlr->saved_dtl_mask; + + if (atomic_dec_and_test(&dtl_count)) + dtl_consumer = NULL; +} + +static u64 dtl_current_index(struct dtl *dtl) +{ + return per_cpu(dtl_rings, dtl->cpu).write_index; +} + +#else /* CONFIG_VIRT_CPU_ACCOUNTING */ + +static int dtl_start(struct dtl *dtl) +{ + unsigned long addr; + int ret, hwcpu; /* Register our dtl buffer with the hypervisor. The HV expects the * buffer size to be passed in the second word of the buffer */ @@ -101,12 +158,11 @@ static int dtl_enable(struct dtl *dtl) if (ret) { printk(KERN_WARNING "%s: DTL registration for cpu %d (hw %d) " "failed with %d\n", __func__, dtl->cpu, hwcpu, ret); - kfree(dtl->buf); return -EIO; } /* set our initial buffer indices */ - dtl->last_idx = lppaca[dtl->cpu].dtl_idx = 0; + lppaca[dtl->cpu].dtl_idx = 0; /* ensure that our updates to the lppaca fields have occurred before * we actually enable the logging */ @@ -118,17 +174,67 @@ static int dtl_enable(struct dtl *dtl) return 0; } -static void dtl_disable(struct dtl *dtl) +static void dtl_stop(struct dtl *dtl) { int hwcpu = get_hard_smp_processor_id(dtl->cpu); lppaca[dtl->cpu].dtl_enable_mask = 0x0; - unregister_dtl(hwcpu, __pa(dtl->buf)); + unregister_dtl(hwcpu); +} + +static u64 dtl_current_index(struct dtl *dtl) +{ + return lppaca[dtl->cpu].dtl_idx; +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + +static int dtl_enable(struct dtl *dtl) +{ + long int n_entries; + long int rc; + struct dtl_entry *buf = NULL; + + /* only allow one reader */ + if (dtl->buf) + return -EBUSY; + + n_entries = dtl_buf_entries; + buf = kmalloc_node(n_entries * sizeof(struct dtl_entry), + GFP_KERNEL, cpu_to_node(dtl->cpu)); + if (!buf) { + printk(KERN_WARNING "%s: buffer alloc failed for cpu %d\n", + __func__, dtl->cpu); + return -ENOMEM; + } + + spin_lock(&dtl->lock); + rc = -EBUSY; + if (!dtl->buf) { + /* store the original allocation size for use during read */ + dtl->buf_entries = n_entries; + dtl->buf = buf; + dtl->last_idx = 0; + rc = dtl_start(dtl); + if (rc) + dtl->buf = NULL; + } + spin_unlock(&dtl->lock); + + if (rc) + kfree(buf); + return rc; +} + +static void dtl_disable(struct dtl *dtl) +{ + spin_lock(&dtl->lock); + dtl_stop(dtl); kfree(dtl->buf); dtl->buf = NULL; dtl->buf_entries = 0; + spin_unlock(&dtl->lock); } /* file interface */ @@ -156,8 +262,9 @@ static int dtl_file_release(struct inode static ssize_t dtl_file_read(struct file *filp, char __user *buf, size_t len, loff_t *pos) { - int rc, cur_idx, last_idx, n_read, n_req, read_size; + long int rc, n_read, n_req, read_size; struct dtl *dtl; + u64 cur_idx, last_idx, i; if ((len % sizeof(struct dtl_entry)) != 0) return -EINVAL; @@ -170,41 +277,48 @@ static ssize_t dtl_file_read(struct file /* actual number of entries read */ n_read = 0; - cur_idx = lppaca[dtl->cpu].dtl_idx; + spin_lock(&dtl->lock); + + cur_idx = dtl_current_index(dtl); last_idx = dtl->last_idx; - if (cur_idx - last_idx > dtl->buf_entries) { - pr_debug("%s: hv buffer overflow for cpu %d, samples lost\n", - __func__, dtl->cpu); - } + if (last_idx + dtl->buf_entries <= cur_idx) + last_idx = cur_idx - dtl->buf_entries + 1; - cur_idx %= dtl->buf_entries; - last_idx %= dtl->buf_entries; + if (last_idx + n_req > cur_idx) + n_req = cur_idx - last_idx; + + if (n_req > 0) + dtl->last_idx = last_idx + n_req; + + spin_unlock(&dtl->lock); + + if (n_req <= 0) + return 0; + + i = last_idx % dtl->buf_entries; /* read the tail of the buffer if we've wrapped */ - if (last_idx > cur_idx) { - read_size = min(n_req, dtl->buf_entries - last_idx); + if (i + n_req > dtl->buf_entries) { + read_size = dtl->buf_entries - i; - rc = copy_to_user(buf, &dtl->buf[last_idx], + rc = copy_to_user(buf, &dtl->buf[i], read_size * sizeof(struct dtl_entry)); if (rc) return -EFAULT; - last_idx = 0; + i = 0; n_req -= read_size; n_read += read_size; buf += read_size * sizeof(struct dtl_entry); } /* .. and now the head */ - read_size = min(n_req, cur_idx - last_idx); - rc = copy_to_user(buf, &dtl->buf[last_idx], - read_size * sizeof(struct dtl_entry)); + rc = copy_to_user(buf, &dtl->buf[i], n_req * sizeof(struct dtl_entry)); if (rc) return -EFAULT; - n_read += read_size; - dtl->last_idx += n_read; + n_read += n_req; return n_read * sizeof(struct dtl_entry); } @@ -236,9 +350,6 @@ static int dtl_init(void) struct dentry *event_mask_file, *buf_entries_file; int rc, i; - if (!firmware_has_feature(FW_FEATURE_SPLPAR)) - return -ENODEV; - /* set up common debugfs structure */ rc = -ENOMEM; @@ -262,6 +373,7 @@ static int dtl_init(void) /* set up the per-cpu log structures */ for_each_possible_cpu(i) { struct dtl *dtl = &per_cpu(dtl, i); + spin_lock_init(&dtl->lock); dtl->cpu = i; rc = dtl_setup_file(dtl); diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/eeh.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/eeh.c --- linux-2.6.32/arch/powerpc/platforms/pseries/eeh.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/eeh.c 2015-03-10 13:23:53.000000000 -0700 @@ -95,6 +95,7 @@ static int ibm_slot_error_detail; static int ibm_get_config_addr_info; static int ibm_get_config_addr_info2; static int ibm_configure_bridge; +static int ibm_configure_pe; int eeh_subsystem_enabled; EXPORT_SYMBOL(eeh_subsystem_enabled); @@ -263,6 +264,8 @@ void eeh_slot_error_detail(struct pci_dn pci_regs_buf[0] = 0; rtas_pci_enable(pdn, EEH_THAW_MMIO); + rtas_configure_bridge(pdn); + eeh_restore_bars(pdn); loglen = gather_pci_data(pdn, pci_regs_buf, EEH_PCI_REGS_LOG_LEN); rtas_slot_error_detail(pdn, severity, pci_regs_buf, loglen); @@ -450,6 +453,39 @@ void eeh_clear_slot (struct device_node spin_unlock_irqrestore(&confirm_error_lock, flags); } +void __eeh_set_pe_freset(struct device_node *parent, unsigned int *freset) +{ + struct device_node *dn; + + for_each_child_of_node(parent, dn) { + if (PCI_DN(dn)) { + + struct pci_dev *dev = PCI_DN(dn)->pcidev; + + if (dev && dev->driver) + *freset |= dev->needs_freset; + + __eeh_set_pe_freset(dn, freset); + } + } +} + +void eeh_set_pe_freset(struct device_node *dn, unsigned int *freset) +{ + struct pci_dev *dev; + dn = find_device_pe(dn); + + /* Back up one, since config addrs might be shared */ + if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent)) + dn = dn->parent; + + dev = PCI_DN(dn)->pcidev; + if (dev) + *freset |= dev->needs_freset; + + __eeh_set_pe_freset(dn, freset); +} + /** * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze * @dn device node @@ -491,7 +527,7 @@ int eeh_dn_check_failure(struct device_n pdn->eeh_mode & EEH_MODE_NOCHECK) { ignored_check++; pr_debug("EEH: Ignored check (%x) for %s %s\n", - pdn->eeh_mode, pci_name (dev), dn->full_name); + pdn->eeh_mode, eeh_pci_name(dev), dn->full_name); return 0; } @@ -515,9 +551,9 @@ int eeh_dn_check_failure(struct device_n printk (KERN_ERR "EEH: %d reads ignored for recovering device at " "location=%s driver=%s pci addr=%s\n", pdn->eeh_check_count, location, - dev->driver->name, pci_name(dev)); + eeh_driver_name(dev), eeh_pci_name(dev)); printk (KERN_ERR "EEH: Might be infinite loop in %s driver\n", - dev->driver->name); + eeh_driver_name(dev)); dump_stack(); } goto dn_unlock; @@ -694,15 +730,24 @@ rtas_pci_slot_reset(struct pci_dn *pdn, if (pdn->eeh_pe_config_addr) config_addr = pdn->eeh_pe_config_addr; - rc = rtas_call(ibm_set_slot_reset,4,1, NULL, + rc = rtas_call(ibm_set_slot_reset, 4, 1, NULL, config_addr, BUID_HI(pdn->phb->buid), BUID_LO(pdn->phb->buid), state); - if (rc) - printk (KERN_WARNING "EEH: Unable to reset the failed slot," - " (%d) #RST=%d dn=%s\n", - rc, state, pdn->node->full_name); + + /* Fundamental-reset not supported on this PE, try hot-reset */ + if (rc == -8 && state == 3) { + rc = rtas_call(ibm_set_slot_reset, 4, 1, NULL, + config_addr, + BUID_HI(pdn->phb->buid), + BUID_LO(pdn->phb->buid), 1); + if (rc) + printk(KERN_WARNING + "EEH: Unable to reset the failed slot," + " #RST=%d dn=%s\n", + rc, pdn->node->full_name); + } } /** @@ -738,18 +783,21 @@ int pcibios_set_pcie_reset_state(struct /** * rtas_set_slot_reset -- assert the pci #RST line for 1/4 second * @pdn: pci device node to be reset. - * - * Return 0 if success, else a non-zero value. */ static void __rtas_set_slot_reset(struct pci_dn *pdn) { - struct pci_dev *dev = pdn->pcidev; + unsigned int freset = 0; - /* Determine type of EEH reset required by device, - * default hot reset or fundamental reset + /* Determine type of EEH reset required for + * Partitionable Endpoint, a hot-reset (1) + * or a fundamental reset (3). + * A fundamental reset required by any device under + * Partitionable Endpoint trumps hot-reset. */ - if (dev->needs_freset) + eeh_set_pe_freset(pdn->node, &freset); + + if (freset) rtas_pci_slot_reset(pdn, 3); else rtas_pci_slot_reset(pdn, 1); @@ -897,13 +945,20 @@ rtas_configure_bridge(struct pci_dn *pdn { int config_addr; int rc; + int token; /* Use PE configuration address, if present */ config_addr = pdn->eeh_config_addr; if (pdn->eeh_pe_config_addr) config_addr = pdn->eeh_pe_config_addr; - rc = rtas_call(ibm_configure_bridge,3,1, NULL, + /* Use new configure-pe function, if supported */ + if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE) + token = ibm_configure_pe; + else + token = ibm_configure_bridge; + + rc = rtas_call(token, 3, 1, NULL, config_addr, BUID_HI(pdn->phb->buid), BUID_LO(pdn->phb->buid)); @@ -1079,6 +1134,7 @@ void __init eeh_init(void) ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info"); ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2"); ibm_configure_bridge = rtas_token ("ibm,configure-bridge"); + ibm_configure_pe = rtas_token("ibm,configure-pe"); if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE) return; diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/eeh_driver.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/eeh_driver.c --- linux-2.6.32/arch/powerpc/platforms/pseries/eeh_driver.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/eeh_driver.c 2015-03-10 13:22:46.000000000 -0700 @@ -344,7 +344,7 @@ struct pci_dn * handle_eeh_events (struc struct pci_bus *frozen_bus; int rc = 0; enum pci_ers_result result = PCI_ERS_RESULT_NONE; - const char *location, *pci_str, *drv_str; + const char *location, *pci_str, *drv_str, *bus_pci_str, *bus_drv_str; frozen_dn = find_device_pe(event->dn); if (!frozen_dn) { @@ -353,7 +353,7 @@ struct pci_dn * handle_eeh_events (struc location = location ? location : "unknown"; printk(KERN_ERR "EEH: Error: Cannot find partition endpoint " "for location=%s pci addr=%s\n", - location, pci_name(event->dev)); + location, eeh_pci_name(event->dev)); return NULL; } @@ -380,22 +380,26 @@ struct pci_dn * handle_eeh_events (struc frozen_pdn = PCI_DN(frozen_dn); frozen_pdn->eeh_freeze_count++; - if (frozen_pdn->pcidev) { - pci_str = pci_name (frozen_pdn->pcidev); - drv_str = pcid_name (frozen_pdn->pcidev); - } else { - pci_str = pci_name (event->dev); - drv_str = pcid_name (event->dev); - } - + pci_str = eeh_pci_name(event->dev); + drv_str = pcid_name(event->dev); + if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES) goto excess_failures; printk(KERN_WARNING "EEH: This PCI device has failed %d times in the last hour:\n", frozen_pdn->eeh_freeze_count); + + if (frozen_pdn->pcidev) { + bus_pci_str = pci_name(frozen_pdn->pcidev); + bus_drv_str = pcid_name(frozen_pdn->pcidev); + printk(KERN_WARNING + "EEH: Bus location=%s driver=%s pci addr=%s\n", + location, bus_drv_str, bus_pci_str); + } + printk(KERN_WARNING - "EEH: location=%s driver=%s pci addr=%s\n", + "EEH: Device location=%s driver=%s pci addr=%s\n", location, drv_str, pci_str); /* Walk the various device drivers attached to this slot through diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/eeh_event.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/eeh_event.c --- linux-2.6.32/arch/powerpc/platforms/pseries/eeh_event.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/eeh_event.c 2015-03-10 13:21:27.000000000 -0700 @@ -80,7 +80,7 @@ static int eeh_event_handler(void * dumm eeh_mark_slot(event->dn, EEH_MODE_RECOVERING); printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", - pci_name(event->dev)); + eeh_pci_name(event->dev)); pdn = handle_eeh_events(event); diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/firmware.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/firmware.c --- linux-2.6.32/arch/powerpc/platforms/pseries/firmware.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/firmware.c 2015-03-10 13:22:11.000000000 -0700 @@ -55,6 +55,7 @@ firmware_features_table[FIRMWARE_MAX_FEA {FW_FEATURE_XDABR, "hcall-xdabr"}, {FW_FEATURE_MULTITCE, "hcall-multi-tce"}, {FW_FEATURE_SPLPAR, "hcall-splpar"}, + {FW_FEATURE_VPHN, "hcall-vphn"}, }; /* Build up the firmware features bitmask using the contents of diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/hotplug-cpu.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/hotplug-cpu.c --- linux-2.6.32/arch/powerpc/platforms/pseries/hotplug-cpu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/hotplug-cpu.c 2015-03-10 13:21:56.000000000 -0700 @@ -30,6 +30,7 @@ #include #include "xics.h" #include "plpar_wrappers.h" +#include "offline_states.h" /* This version can't take the spinlock, because it never returns */ static struct rtas_args rtas_stop_self_args = { @@ -39,6 +40,55 @@ static struct rtas_args rtas_stop_self_a .rets = &rtas_stop_self_args.args[0], }; +static DEFINE_PER_CPU(enum cpu_state_vals, preferred_offline_state) = + CPU_STATE_OFFLINE; +static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE; + +static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE; + +static int cede_offline_enabled __read_mostly = 1; + +/* + * Enable/disable cede_offline when available. + */ +static int __init setup_cede_offline(char *str) +{ + if (!strcmp(str, "off")) + cede_offline_enabled = 0; + else if (!strcmp(str, "on")) + cede_offline_enabled = 1; + else + return 0; + return 1; +} + +__setup("cede_offline=", setup_cede_offline); + +enum cpu_state_vals get_cpu_current_state(int cpu) +{ + return per_cpu(current_state, cpu); +} + +void set_cpu_current_state(int cpu, enum cpu_state_vals state) +{ + per_cpu(current_state, cpu) = state; +} + +enum cpu_state_vals get_preferred_offline_state(int cpu) +{ + return per_cpu(preferred_offline_state, cpu); +} + +void set_preferred_offline_state(int cpu, enum cpu_state_vals state) +{ + per_cpu(preferred_offline_state, cpu) = state; +} + +void set_default_offline_state(int cpu) +{ + per_cpu(preferred_offline_state, cpu) = default_offline_state; +} + static void rtas_stop_self(void) { struct rtas_args *args = &rtas_stop_self_args; @@ -56,38 +106,54 @@ static void rtas_stop_self(void) static void pseries_mach_cpu_die(void) { + unsigned int cpu = smp_processor_id(); + unsigned int hwcpu = hard_smp_processor_id(); + u8 cede_latency_hint = 0; + local_irq_disable(); idle_task_exit(); xics_teardown_cpu(); - unregister_slb_shadow(hard_smp_processor_id(), __pa(get_slb_shadow())); - rtas_stop_self(); - /* Should never get here... */ - BUG(); - for(;;); -} -static int qcss_tok; /* query-cpu-stopped-state token */ + if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { + set_cpu_current_state(cpu, CPU_STATE_INACTIVE); + pseries_suspend_cpu(); + + cede_latency_hint = 2; + + get_lppaca()->idle = 1; + if (!get_lppaca()->shared_proc) + get_lppaca()->donate_dedicated_cpu = 1; -/* Get state of physical CPU. - * Return codes: - * 0 - The processor is in the RTAS stopped state - * 1 - stop-self is in progress - * 2 - The processor is not in the RTAS stopped state - * -1 - Hardware Error - * -2 - Hardware Busy, Try again later. - */ -static int query_cpu_stopped(unsigned int pcpu) -{ - int cpu_status, status; + while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { + extended_cede_processor(cede_latency_hint); + } - status = rtas_call(qcss_tok, 1, 2, &cpu_status, pcpu); - if (status != 0) { - printk(KERN_ERR - "RTAS query-cpu-stopped-state failed: %i\n", status); - return status; + if (!get_lppaca()->shared_proc) + get_lppaca()->donate_dedicated_cpu = 0; + get_lppaca()->idle = 0; + + if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) { + unregister_slb_shadow(hwcpu, __pa(get_slb_shadow())); + + /* + * Call to start_secondary_resume() will not return. + * Kernel stack will be reset and start_secondary() + * will be called to continue the online operation. + */ + start_secondary_resume(); + } } - return cpu_status; + /* Requested state is CPU_STATE_OFFLINE at this point */ + WARN_ON(get_preferred_offline_state(cpu) != CPU_STATE_OFFLINE); + + set_cpu_current_state(cpu, CPU_STATE_OFFLINE); + unregister_slb_shadow(hwcpu, __pa(get_slb_shadow())); + rtas_stop_self(); + + /* Should never get here... */ + BUG(); + for(;;); } static int pseries_cpu_disable(void) @@ -106,18 +172,44 @@ static int pseries_cpu_disable(void) return 0; } +/* + * pseries_cpu_die: Wait for the cpu to die. + * @cpu: logical processor id of the CPU whose death we're awaiting. + * + * This function is called from the context of the thread which is performing + * the cpu-offline. Here we wait for long enough to allow the cpu in question + * to self-destroy so that the cpu-offline thread can send the CPU_DEAD + * notifications. + * + * OTOH, pseries_mach_cpu_die() is called by the @cpu when it wants to + * self-destruct. + */ static void pseries_cpu_die(unsigned int cpu) { int tries; - int cpu_status; + int cpu_status = 1; unsigned int pcpu = get_hard_smp_processor_id(cpu); - for (tries = 0; tries < 25; tries++) { - cpu_status = query_cpu_stopped(pcpu); - if (cpu_status == 0 || cpu_status == -1) - break; - cpu_relax(); + if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { + cpu_status = 1; + for (tries = 0; tries < 5000; tries++) { + if (get_cpu_current_state(cpu) == CPU_STATE_INACTIVE) { + cpu_status = 0; + break; + } + msleep(1); + } + } else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) { + + for (tries = 0; tries < 25; tries++) { + cpu_status = smp_query_cpu_stopped(pcpu); + if (cpu_status == QCSS_STOPPED || + cpu_status == QCSS_HARDWARE_ERROR) + break; + cpu_relax(); + } } + if (cpu_status != 0) { printk("Querying DEAD? cpu %i (%i) shows %i\n", cpu, pcpu, cpu_status); @@ -252,10 +344,30 @@ static struct notifier_block pseries_smp .notifier_call = pseries_smp_notifier, }; +#define MAX_CEDE_LATENCY_LEVELS 4 +#define CEDE_LATENCY_PARAM_LENGTH 10 +#define CEDE_LATENCY_PARAM_MAX_LENGTH \ + (MAX_CEDE_LATENCY_LEVELS * CEDE_LATENCY_PARAM_LENGTH * sizeof(char)) +#define CEDE_LATENCY_TOKEN 45 + +static char cede_parameters[CEDE_LATENCY_PARAM_MAX_LENGTH]; + +static int parse_cede_parameters(void) +{ + memset(cede_parameters, 0, CEDE_LATENCY_PARAM_MAX_LENGTH); + return rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, + NULL, + CEDE_LATENCY_TOKEN, + __pa(cede_parameters), + CEDE_LATENCY_PARAM_MAX_LENGTH); +} + static int __init pseries_cpu_hotplug_init(void) { struct device_node *np; const char *typep; + int cpu; + int qcss_tok; for_each_node_by_name(np, "interrupt-controller") { typep = of_get_property(np, "compatible", NULL); @@ -283,8 +395,16 @@ static int __init pseries_cpu_hotplug_in smp_ops->cpu_die = pseries_cpu_die; /* Processors can be added/removed only on LPAR */ - if (firmware_has_feature(FW_FEATURE_LPAR)) + if (firmware_has_feature(FW_FEATURE_LPAR)) { pSeries_reconfig_notifier_register(&pseries_smp_nb); + cpu_maps_update_begin(); + if (cede_offline_enabled && parse_cede_parameters() == 0) { + default_offline_state = CPU_STATE_INACTIVE; + for_each_online_cpu(cpu) + set_default_offline_state(cpu); + } + cpu_maps_update_done(); + } return 0; } diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/hotplug-memory.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/hotplug-memory.c --- linux-2.6.32/arch/powerpc/platforms/pseries/hotplug-memory.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/hotplug-memory.c 2015-03-10 13:21:42.000000000 -0700 @@ -11,11 +11,55 @@ #include #include +#include #include #include #include #include +u32 memory_block_size_bytes(void) +{ + struct device_node *np; + unsigned int memblock_size = 0; + + np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (np) { + const unsigned long *size; + + size = of_get_property(np, "ibm,lmb-size", NULL); + memblock_size = size ? *size : 0; + + of_node_put(np); + } else { + unsigned int memzero_size = 0; + const unsigned int *regs; + + np = of_find_node_by_path("/memory@0"); + if (np) { + regs = of_get_property(np, "reg", NULL); + memzero_size = regs ? regs[3] : 0; + of_node_put(np); + } + + if (memzero_size) { + /* We now know the size of memory@0, use this to find + * the first memoryblock and get its size. + */ + char buf[64]; + + sprintf(buf, "/memory@%x", memzero_size); + np = of_find_node_by_path(buf); + if (np) { + regs = of_get_property(np, "reg", NULL); + memblock_size = regs ? regs[3] : 0; + of_node_put(np); + } + } + } + + return memblock_size; +} + static int pseries_remove_lmb(unsigned long base, unsigned int lmb_size) { unsigned long start, start_pfn; @@ -54,6 +98,12 @@ static int pseries_remove_lmb(unsigned l */ start = (unsigned long)__va(base); ret = remove_section_mapping(start, start + lmb_size); + + /* Ensure all vmalloc mappings are flushed in case they also + * hit that section of memory + */ + vm_unmap_aliases(); + return ret; } @@ -120,30 +170,22 @@ static int pseries_add_memory(struct dev static int pseries_drconf_memory(unsigned long *base, unsigned int action) { - struct device_node *np; - const unsigned long *lmb_size; + unsigned long memblock_size; int rc; - np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); - if (!np) - return -EINVAL; - - lmb_size = of_get_property(np, "ibm,lmb-size", NULL); - if (!lmb_size) { - of_node_put(np); - return -EINVAL; - } + memblock_size = memory_block_size_bytes(); + if (!memblock_size) + return -EINVAL; if (action == PSERIES_DRCONF_MEM_ADD) { - rc = lmb_add(*base, *lmb_size); + rc = lmb_add(*base, memblock_size); rc = (rc < 0) ? -EINVAL : 0; } else if (action == PSERIES_DRCONF_MEM_REMOVE) { - rc = pseries_remove_lmb(*base, *lmb_size); + rc = pseries_remove_lmb(*base, memblock_size); } else { rc = -EINVAL; } - of_node_put(np); return rc; } diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/hvconsole.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/hvconsole.c --- linux-2.6.32/arch/powerpc/platforms/pseries/hvconsole.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/hvconsole.c 2015-03-10 13:22:52.000000000 -0700 @@ -73,7 +73,7 @@ int hvc_put_chars(uint32_t vtermno, cons if (ret == H_SUCCESS) return count; if (ret == H_BUSY) - return 0; + return -EAGAIN; return -EIO; } diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/iommu.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/iommu.c --- linux-2.6.32/arch/powerpc/platforms/pseries/iommu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/iommu.c 2015-03-10 13:23:58.000000000 -0700 @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -45,6 +46,7 @@ #include #include #include +#include #include "plpar_wrappers.h" @@ -154,12 +156,15 @@ static int tce_buildmulti_pSeriesLP(stru long l, limit; long tcenum_start = tcenum, npages_start = npages; int ret = 0; + unsigned long flags; if (npages == 1) { return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction, attrs); } + local_irq_save(flags); /* to protect tcep and the page behind it */ + tcep = __get_cpu_var(tce_page); /* This is safe to do since interrupts are off when we're called @@ -169,6 +174,7 @@ static int tce_buildmulti_pSeriesLP(stru tcep = (u64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { + local_irq_restore(flags); return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction, attrs); } @@ -202,6 +208,8 @@ static int tce_buildmulti_pSeriesLP(stru tcenum += limit; } while (npages > 0 && !rc); + local_irq_restore(flags); + if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; tce_freemulti_pSeriesLP(tbl, tcenum_start, @@ -270,6 +278,152 @@ static unsigned long tce_get_pSeriesLP(s return tce_ret; } +/* this is compatable with cells for the device tree property */ +struct dynamic_dma_window_prop { + __be32 liobn; /* tce table number */ + __be64 dma_base; /* address hi,lo */ + __be32 tce_shift; /* ilog2(tce_page_size) */ + __be32 window_shift; /* ilog2(tce_window_size) */ +}; + +struct direct_window { + struct device_node *device; + const struct dynamic_dma_window_prop *prop; + struct list_head list; +}; + +/* Dynamic DMA Window support */ +struct ddw_query_response { + u32 windows_available; + u32 largest_available_block; + u32 page_size; + u32 migration_capable; +}; + +struct ddw_create_response { + u32 liobn; + u32 addr_hi; + u32 addr_lo; +}; + +static LIST_HEAD(direct_window_list); +/* prevents races between memory on/offline and window creation */ +static DEFINE_SPINLOCK(direct_window_list_lock); +/* protects initializing window twice for same device */ +static DEFINE_MUTEX(direct_window_init_mutex); +#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info" + +static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn, + unsigned long num_pfn, const void *arg) +{ + const struct dynamic_dma_window_prop *maprange = arg; + int rc; + u64 tce_size, num_tce, dma_offset, next; + u32 tce_shift; + long limit; + + tce_shift = be32_to_cpu(maprange->tce_shift); + tce_size = 1ULL << tce_shift; + next = start_pfn << PAGE_SHIFT; + num_tce = num_pfn << PAGE_SHIFT; + + /* round back to the beginning of the tce page size */ + num_tce += next & (tce_size - 1); + next &= ~(tce_size - 1); + + /* covert to number of tces */ + num_tce |= tce_size - 1; + num_tce >>= tce_shift; + + do { + /* + * Set up the page with TCE data, looping through and setting + * the values. + */ + limit = min_t(long, num_tce, 512); + dma_offset = next + be64_to_cpu(maprange->dma_base); + + rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn), + dma_offset, + 0, limit); + num_tce -= limit; + } while (num_tce > 0 && !rc); + + return rc; +} + +static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, + unsigned long num_pfn, const void *arg) +{ + const struct dynamic_dma_window_prop *maprange = arg; + u64 *tcep, tce_size, num_tce, dma_offset, next, proto_tce, liobn; + u32 tce_shift; + u64 rc = 0; + long l, limit; + + local_irq_disable(); /* to protect tcep and the page behind it */ + tcep = __get_cpu_var(tce_page); + + if (!tcep) { + tcep = (u64 *)__get_free_page(GFP_ATOMIC); + if (!tcep) { + local_irq_enable(); + return -ENOMEM; + } + __get_cpu_var(tce_page) = tcep; + } + + proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; + + liobn = (u64)be32_to_cpu(maprange->liobn); + tce_shift = be32_to_cpu(maprange->tce_shift); + tce_size = 1ULL << tce_shift; + next = start_pfn << PAGE_SHIFT; + num_tce = num_pfn << PAGE_SHIFT; + + /* round back to the beginning of the tce page size */ + num_tce += next & (tce_size - 1); + next &= ~(tce_size - 1); + + /* covert to number of tces */ + num_tce |= tce_size - 1; + num_tce >>= tce_shift; + + /* We can map max one pageful of TCEs at a time */ + do { + /* + * Set up the page with TCE data, looping through and setting + * the values. + */ + limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE); + dma_offset = next + be64_to_cpu(maprange->dma_base); + + for (l = 0; l < limit; l++) { + tcep[l] = proto_tce | next; + next += tce_size; + } + + rc = plpar_tce_put_indirect(liobn, + dma_offset, + (u64)virt_to_abs(tcep), + limit); + + num_tce -= limit; + } while (num_tce > 0 && !rc); + + /* error cleanup: caller will clear whole range */ + + local_irq_enable(); + return rc; +} + +static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn, + unsigned long num_pfn, void *arg) +{ + return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg); +} + + #ifdef CONFIG_PCI static void iommu_table_setparms(struct pci_controller *phb, struct device_node *dn, @@ -403,7 +557,7 @@ static void pci_dma_bus_setup_pSeries(st pci->phb->dma_window_size = 0x8000000ul; pci->phb->dma_window_base_cur = 0x8000000ul; - tbl = kmalloc_node(sizeof(struct iommu_table), GFP_KERNEL, + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, pci->phb->node); iommu_table_setparms(pci->phb, dn, tbl); @@ -448,16 +602,13 @@ static void pci_dma_bus_setup_pSeriesLP( pdn->full_name, ppci->iommu_table); if (!ppci->iommu_table) { - tbl = kmalloc_node(sizeof(struct iommu_table), GFP_KERNEL, + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, ppci->phb->node); iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window, bus->number); ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node); pr_debug(" created table: %p\n", ppci->iommu_table); } - - if (pdn != dn) - PCI_DN(dn)->iommu_table = ppci->iommu_table; } @@ -478,7 +629,7 @@ static void pci_dma_dev_setup_pSeries(st struct pci_controller *phb = PCI_DN(dn)->phb; pr_debug(" --> first child, no bridge. Allocating iommu table.\n"); - tbl = kmalloc_node(sizeof(struct iommu_table), GFP_KERNEL, + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, phb->node); iommu_table_setparms(phb, dn, tbl); PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node); @@ -500,6 +651,334 @@ static void pci_dma_dev_setup_pSeries(st pci_name(dev)); } +static int __read_mostly disable_ddw; + +static int __init disable_ddw_setup(char *str) +{ + disable_ddw = 1; + printk(KERN_INFO "ppc iommu: disabling ddw.\n"); + + return 0; +} + +early_param("disable_ddw", disable_ddw_setup); + +static void remove_ddw(struct device_node *np) +{ + struct dynamic_dma_window_prop *dwp; + struct property *win64; + const u32 *ddr_avail; + u64 liobn; + int len, ret; + + ddr_avail = of_get_property(np, "ibm,ddw-applicable", &len); + win64 = of_find_property(np, DIRECT64_PROPNAME, NULL); + if (!win64 || !ddr_avail || len < 3 * sizeof(u32)) + return; + + dwp = win64->value; + liobn = (u64)be32_to_cpu(dwp->liobn); + + /* clear the whole window, note the arg is in kernel pages */ + ret = tce_clearrange_multi_pSeriesLP(0, + 1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp); + if (ret) + pr_warning("%s failed to clear tces in window.\n", + np->full_name); + else + pr_debug("%s successfully cleared tces in window.\n", + np->full_name); + + ret = rtas_call(ddr_avail[2], 1, 1, NULL, liobn); + if (ret) + pr_warning("%s: failed to remove direct window: rtas returned " + "%d to ibm,remove-pe-dma-window(%x) %llx\n", + np->full_name, ret, ddr_avail[2], liobn); + else + pr_debug("%s: successfully removed direct window: rtas returned " + "%d to ibm,remove-pe-dma-window(%x) %llx\n", + np->full_name, ret, ddr_avail[2], liobn); +} + + +static u64 dupe_ddw_if_already_created(struct pci_dev *dev, struct device_node *pdn) +{ + struct device_node *dn; + struct pci_dn *pcidn; + struct direct_window *window; + const struct dynamic_dma_window_prop *direct64; + u64 dma_addr = 0; + + dn = pci_device_to_OF_node(dev); + pcidn = PCI_DN(dn); + spin_lock(&direct_window_list_lock); + /* check if we already created a window and dupe that config if so */ + list_for_each_entry(window, &direct_window_list, list) { + if (window->device == pdn) { + direct64 = window->prop; + dma_addr = direct64->dma_base; + break; + } + } + spin_unlock(&direct_window_list_lock); + + return dma_addr; +} + +static u64 dupe_ddw_if_kexec(struct pci_dev *dev, struct device_node *pdn) +{ + struct device_node *dn; + struct pci_dn *pcidn; + int len; + struct direct_window *window; + const struct dynamic_dma_window_prop *direct64; + u64 dma_addr = 0; + + dn = pci_device_to_OF_node(dev); + pcidn = PCI_DN(dn); + direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len); + if (direct64) { + if (len < sizeof(struct dynamic_dma_window_prop)) { + remove_ddw(pdn); + } else { + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window) { + remove_ddw(pdn); + } else { + window->device = pdn; + window->prop = direct64; + spin_lock(&direct_window_list_lock); + list_add(&window->list, &direct_window_list); + spin_unlock(&direct_window_list_lock); + dma_addr = direct64->dma_base; + } + } + } + + return dma_addr; +} + +static int query_ddw(struct pci_dev *dev, const u32 *ddr_avail, + struct ddw_query_response *query) +{ + struct device_node *dn; + struct pci_dn *pcidn; + u32 cfg_addr; + u64 buid; + int ret; + + /* + * Get the config address and phb buid of the PE window. + * Rely on eeh to retrieve this for us. + * Retrieve them from the pci device, not the node with the + * dma-window property + */ + dn = pci_device_to_OF_node(dev); + pcidn = PCI_DN(dn); + cfg_addr = pcidn->eeh_config_addr; + if (pcidn->eeh_pe_config_addr) + cfg_addr = pcidn->eeh_pe_config_addr; + buid = pcidn->phb->buid; + ret = rtas_call(ddr_avail[0], 3, 5, (u32 *)query, + cfg_addr, BUID_HI(buid), BUID_LO(buid)); + dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x" + " returned %d\n", ddr_avail[0], cfg_addr, (u32)BUID_HI(buid), + (u32)BUID_LO(buid), ret); + return ret; +} + +static int create_ddw(struct pci_dev *dev, const u32 *ddr_avail, + struct ddw_create_response *create, int page_shift, + int window_shift) +{ + struct device_node *dn; + struct pci_dn *pcidn; + u32 cfg_addr; + u64 buid; + int ret; + + /* + * Get the config address and phb buid of the PE window. + * Rely on eeh to retrieve this for us. + * Retrieve them from the pci device, not the node with the + * dma-window property + */ + dn = pci_device_to_OF_node(dev); + pcidn = PCI_DN(dn); + cfg_addr = pcidn->eeh_config_addr; + if (pcidn->eeh_pe_config_addr) + cfg_addr = pcidn->eeh_pe_config_addr; + buid = pcidn->phb->buid; + + do { + /* extra outputs are LIOBN and dma-addr (hi, lo) */ + ret = rtas_call(ddr_avail[1], 5, 4, (u32 *)create, cfg_addr, + BUID_HI(buid), BUID_LO(buid), page_shift, window_shift); + } while (rtas_busy_delay(ret)); + dev_info(&dev->dev, + "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d " + "(liobn = 0x%x starting addr = %x %x)\n", ddr_avail[1], + cfg_addr, (u32)BUID_HI(buid), (u32)BUID_LO(buid), page_shift, + window_shift, ret, create->liobn, create->addr_hi, create->addr_lo); + + return ret; +} + +/* + * If the PE supports dynamic dma windows, and there is space for a table + * that can map all pages in a linear offset, then setup such a table, + * and record the dma-offset in the struct device. + * + * dev: the pci device we are checking + * pdn: the parent pe node with the ibm,dma_window property + * Future: also check if we can remap the base window for our base page size + * + * returns the dma offset for use by dma_set_mask + */ +static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) +{ + int len, ret; + struct ddw_query_response query; + struct ddw_create_response create; + int page_shift; + u64 dma_addr, max_addr; + struct device_node *dn; + const u32 *uninitialized_var(ddr_avail); + struct direct_window *window; + struct property *win64; + struct dynamic_dma_window_prop *ddwprop; + + mutex_lock(&direct_window_init_mutex); + + dma_addr = dupe_ddw_if_already_created(dev, pdn); + if (dma_addr != 0) + goto out_unlock; + + dma_addr = dupe_ddw_if_kexec(dev, pdn); + if (dma_addr != 0) + goto out_unlock; + + /* + * the ibm,ddw-applicable property holds the tokens for: + * ibm,query-pe-dma-window + * ibm,create-pe-dma-window + * ibm,remove-pe-dma-window + * for the given node in that order. + * the property is actually in the parent, not the PE + */ + ddr_avail = of_get_property(pdn, "ibm,ddw-applicable", &len); + if (!ddr_avail || len < 3 * sizeof(u32)) + goto out_unlock; + + /* + * Query if there is a second window of size to map the + * whole partition. Query returns number of windows, largest + * block assigned to PE (partition endpoint), and two bitmasks + * of page sizes: supported and supported for migrate-dma. + */ + dn = pci_device_to_OF_node(dev); + ret = query_ddw(dev, ddr_avail, &query); + if (ret != 0) + goto out_unlock; + + if (query.windows_available == 0) { + /* + * no additional windows are available for this device. + * We might be able to reallocate the existing window, + * trading in for a larger page size. + */ + dev_dbg(&dev->dev, "no free dynamic windows"); + goto out_unlock; + } + if (query.page_size & 4) { + page_shift = 24; /* 16MB */ + } else if (query.page_size & 2) { + page_shift = 16; /* 64kB */ + } else if (query.page_size & 1) { + page_shift = 12; /* 4kB */ + } else { + dev_dbg(&dev->dev, "no supported direct page size in mask %x", + query.page_size); + goto out_unlock; + } + /* verify the window * number of ptes will map the partition */ + /* check largest block * page size > max memory hotplug addr */ + max_addr = memory_hotplug_max(); + if (query.largest_available_block < (max_addr >> page_shift)) { + dev_dbg(&dev->dev, "can't map partiton max 0x%llx with %u " + "%llu-sized pages\n", max_addr, query.largest_available_block, + 1ULL << page_shift); + goto out_unlock; + } + len = order_base_2(max_addr); + win64 = kzalloc(sizeof(struct property), GFP_KERNEL); + if (!win64) { + dev_info(&dev->dev, + "couldn't allocate property for 64bit dma window\n"); + goto out_unlock; + } + win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL); + win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL); + win64->length = sizeof(*ddwprop); + if (!win64->name || !win64->value) { + dev_info(&dev->dev, + "couldn't allocate property name and value\n"); + goto out_free_prop; + } + + ret = create_ddw(dev, ddr_avail, &create, page_shift, len); + if (ret != 0) + goto out_free_prop; + + ddwprop->liobn = cpu_to_be32(create.liobn); + ddwprop->dma_base = cpu_to_be64(of_read_number(&create.addr_hi, 2)); + ddwprop->tce_shift = cpu_to_be32(page_shift); + ddwprop->window_shift = cpu_to_be32(len); + + dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %s\n", + create.liobn, dn->full_name); + + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window) + goto out_clear_window; + + ret = walk_system_ram_range(0, lmb_end_of_DRAM() >> PAGE_SHIFT, + win64->value, tce_setrange_multi_pSeriesLP_walk); + if (ret) { + dev_info(&dev->dev, "failed to map direct window for %s: %d\n", + dn->full_name, ret); + goto out_clear_window; + } + + ret = prom_add_property(pdn, win64); + if (ret) { + dev_err(&dev->dev, "unable to add dma window property for %s: %d", + pdn->full_name, ret); + goto out_clear_window; + } + + window->device = pdn; + window->prop = ddwprop; + spin_lock(&direct_window_list_lock); + list_add(&window->list, &direct_window_list); + spin_unlock(&direct_window_list_lock); + + dma_addr = of_read_number(&create.addr_hi, 2); + goto out_unlock; + +out_clear_window: + remove_ddw(pdn); + +out_free_prop: + kfree(win64->name); + kfree(win64->value); + kfree(win64); + +out_unlock: + mutex_unlock(&direct_window_init_mutex); + return dma_addr; +} + static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) { struct device_node *pdn, *dn; @@ -544,7 +1023,7 @@ static void pci_dma_dev_setup_pSeriesLP( pci = PCI_DN(pdn); if (!pci->iommu_table) { - tbl = kmalloc_node(sizeof(struct iommu_table), GFP_KERNEL, + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, pci->phb->node); iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window, pci->phb->bus->number); @@ -556,24 +1035,145 @@ static void pci_dma_dev_setup_pSeriesLP( set_iommu_table_base(&dev->dev, pci->iommu_table); } + +static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) +{ + bool ddw_enabled = false; + struct device_node *pdn, *dn; + struct pci_dev *pdev; + const void *dma_window = NULL; + u64 dma_offset; + + if (!dev->dma_mask) + return -EIO; + + if (!dev_is_pci(dev)) + goto check_mask; + + pdev = to_pci_dev(dev); + + /* only attempt to use a new window if 64-bit DMA is requested */ + if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) { + dn = pci_device_to_OF_node(pdev); + dev_dbg(dev, "node is %s\n", dn->full_name); + + /* + * the device tree might contain the dma-window properties + * per-device and not neccesarily for the bus. So we need to + * search upwards in the tree until we either hit a dma-window + * property, OR find a parent with a table already allocated. + */ + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table; + pdn = pdn->parent) { + dma_window = of_get_property(pdn, "ibm,dma-window", NULL); + if (dma_window) + break; + } + if (pdn && PCI_DN(pdn)) { + dma_offset = enable_ddw(pdev, pdn); + if (dma_offset != 0) { + dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset); + set_dma_offset(dev, dma_offset); + set_dma_ops(dev, &dma_direct_ops); + ddw_enabled = true; + } + } + } + + /* fall back on iommu ops, restore table pointer with ops */ + if (!ddw_enabled && get_dma_ops(dev) != &dma_iommu_ops) { + dev_info(dev, "Restoring 32-bit DMA via iommu\n"); + set_dma_ops(dev, &dma_iommu_ops); + pci_dma_dev_setup_pSeriesLP(pdev); + } + +check_mask: + if (!dma_supported(dev, dma_mask)) + return -EIO; + + *dev->dma_mask = dma_mask; + return 0; +} + #else /* CONFIG_PCI */ #define pci_dma_bus_setup_pSeries NULL #define pci_dma_dev_setup_pSeries NULL #define pci_dma_bus_setup_pSeriesLP NULL #define pci_dma_dev_setup_pSeriesLP NULL +#define dma_set_mask_pSeriesLP NULL #endif /* !CONFIG_PCI */ +static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct direct_window *window; + struct memory_notify *arg = data; + int ret = 0; + + switch (action) { + case MEM_GOING_ONLINE: + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn, + arg->nr_pages, window->prop); + /* XXX log error */ + } + spin_unlock(&direct_window_list_lock); + break; + case MEM_CANCEL_ONLINE: + case MEM_OFFLINE: + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn, + arg->nr_pages, window->prop); + /* XXX log error */ + } + spin_unlock(&direct_window_list_lock); + break; + default: + break; + } + if (ret && action != MEM_CANCEL_ONLINE) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +static struct notifier_block iommu_mem_nb = { + .notifier_call = iommu_mem_notifier, +}; + static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node) { int err = NOTIFY_OK; struct device_node *np = node; struct pci_dn *pci = PCI_DN(np); + struct direct_window *window; switch (action) { case PSERIES_RECONFIG_REMOVE: - if (pci && pci->iommu_table && - of_get_property(np, "ibm,dma-window", NULL)) + if (pci && pci->iommu_table) iommu_free_table(pci->iommu_table, np->full_name); + + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + if (window->device == np) { + list_del(&window->list); + kfree(window); + break; + } + } + spin_unlock(&direct_window_list_lock); + + /* + * Because the notifier runs after isolation of the + * slot, we are guaranteed any DMA window has already + * been revoked and the TCEs have been marked invalid, + * so we don't need a call to remove_ddw(np). However, + * if an additional notifier action is added before the + * isolate call, we should update this code for + * completeness with such a call. + */ break; default: err = NOTIFY_DONE; @@ -608,6 +1208,7 @@ void iommu_init_early_pSeries(void) ppc_md.tce_get = tce_get_pSeriesLP; ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeriesLP; ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeriesLP; + ppc_md.dma_set_mask = dma_set_mask_pSeriesLP; } else { ppc_md.tce_build = tce_build_pSeries; ppc_md.tce_free = tce_free_pSeries; @@ -618,6 +1219,7 @@ void iommu_init_early_pSeries(void) pSeries_reconfig_notifier_register(&iommu_reconfig_nb); + register_memory_notifier(&iommu_mem_nb); set_pci_dma_ops(&dma_iommu_ops); } diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/Kconfig linux-2.6.32.ovz/arch/powerpc/platforms/pseries/Kconfig --- linux-2.6.32/arch/powerpc/platforms/pseries/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/Kconfig 2015-03-10 13:22:01.000000000 -0700 @@ -30,6 +30,16 @@ config PSERIES_MSI depends on PCI_MSI && EEH default y +config PSERIES_ENERGY + tristate "pSeries energy management capabilities driver" + depends on PPC_PSERIES + default m + help + Provides interface to platform energy management capabilities + on supported PSERIES platforms. + Provides: /sys/devices/system/cpu/pseries_(de)activation_hint_list + and /sys/devices/system/cpu/cpuN/pseries_(de)activation_hint + config SCANLOG tristate "Scanlog dump interface" depends on RTAS_PROC && PPC_PSERIES @@ -59,7 +69,7 @@ config PPC_SMLPAR config CMM tristate "Collaborative memory management" - depends on PPC_SMLPAR && !CRASH_DUMP + depends on PPC_SMLPAR default y help Select this option, if you want to enable the kernel interface diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/kexec.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/kexec.c --- linux-2.6.32/arch/powerpc/platforms/pseries/kexec.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/kexec.c 2015-03-10 13:22:52.000000000 -0700 @@ -23,6 +23,17 @@ static void pseries_kexec_cpu_down(int c /* Don't risk a hypervisor call if we're crashing */ if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) { unsigned long addr; + int ret; + + if (get_lppaca()->dtl_enable_mask) { + ret = unregister_dtl(hard_smp_processor_id()); + if (ret) { + pr_err("WARNING: DTL deregistration for cpu " + "%d (hw %d) failed with %d\n", + smp_processor_id(), + hard_smp_processor_id(), ret); + } + } addr = __pa(get_slb_shadow()); if (unregister_slb_shadow(hard_smp_processor_id(), addr)) diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/lpar.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/lpar.c --- linux-2.6.32/arch/powerpc/platforms/pseries/lpar.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/lpar.c 2015-03-10 13:23:53.000000000 -0700 @@ -247,6 +247,8 @@ void vpa_init(int cpu) int hwcpu = get_hard_smp_processor_id(cpu); unsigned long addr; long ret; + struct paca_struct *pp; + struct dtl_entry *dtl; if (cpu_has_feature(CPU_FTR_ALTIVEC)) lppaca[cpu].vmxregs_in_use = 1; @@ -273,6 +275,25 @@ void vpa_init(int cpu) "registration for cpu %d (hw %d) of area %lx " "returns %ld\n", cpu, hwcpu, addr, ret); } + + /* + * Register dispatch trace log, if one has been allocated. + */ + pp = &paca[cpu]; + dtl = pp->dispatch_log; + if (dtl) { + pp->dtl_ridx = 0; + pp->dtl_curr = dtl; + lppaca[cpu].dtl_idx = 0; + + /* hypervisor reads buffer length from this field */ + dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES; + ret = register_dtl(hwcpu, __pa(dtl)); + if (ret) + pr_warning("DTL registration failed for cpu %d (%ld)\n", + cpu, ret); + lppaca[cpu].dtl_enable_mask = 2; + } } static long pSeries_lpar_hpte_insert(unsigned long hpte_group, @@ -307,6 +328,8 @@ static long pSeries_lpar_hpte_insert(uns /* Make pHyp happy */ if ((rflags & _PAGE_NO_CACHE) & !(rflags & _PAGE_WRITETHRU)) hpte_r &= ~_PAGE_COHERENT; + if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N)) + flags |= H_COALESCE_CAND; lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot); if (unlikely(lpar_rc == H_PTEG_FULL)) { @@ -353,7 +376,13 @@ static long pSeries_lpar_hpte_remove(uns (0x1UL << 4), &dummy1, &dummy2); if (lpar_rc == H_SUCCESS) return i; - BUG_ON(lpar_rc != H_NOT_FOUND); + + /* + * The test for adjunct partition is performed before the + * ANDCOND test. H_RESOURCE may be returned, so we need to + * check for that as well. + */ + BUG_ON(lpar_rc != H_NOT_FOUND && lpar_rc != H_RESOURCE); slot_offset++; slot_offset &= 0x7; @@ -368,7 +397,7 @@ static void pSeries_lpar_hptab_clear(voi unsigned long hpte_count = size_bytes >> 4; unsigned long dummy1, dummy2, dword0; long lpar_rc; - int i; + unsigned long i; /* TODO: Use bulk call */ for (i = 0; i < hpte_count; i++) { @@ -661,3 +690,47 @@ void arch_free_page(struct page *page, i EXPORT_SYMBOL(arch_free_page); #endif + +/** + * h_get_mpp + * H_GET_MPP hcall returns info in 7 parms + */ +int h_get_mpp(struct hvcall_mpp_data *mpp_data) +{ + int rc; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + + rc = plpar_hcall9(H_GET_MPP, retbuf); + + mpp_data->entitled_mem = retbuf[0]; + mpp_data->mapped_mem = retbuf[1]; + + mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff; + mpp_data->pool_num = retbuf[2] & 0xffff; + + mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff; + mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff; + mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff; + + mpp_data->pool_size = retbuf[4]; + mpp_data->loan_request = retbuf[5]; + mpp_data->backing_mem = retbuf[6]; + + return rc; +} +EXPORT_SYMBOL(h_get_mpp); + +int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data) +{ + int rc; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 }; + + rc = plpar_hcall9(H_GET_MPP_X, retbuf); + + mpp_x_data->coalesced_bytes = retbuf[0]; + mpp_x_data->pool_coalesced_bytes = retbuf[1]; + mpp_x_data->pool_purr_cycles = retbuf[2]; + mpp_x_data->pool_spurr_cycles = retbuf[3]; + + return rc; +} diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/Makefile linux-2.6.32.ovz/arch/powerpc/platforms/pseries/Makefile --- linux-2.6.32/arch/powerpc/platforms/pseries/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/Makefile 2015-03-10 13:23:44.000000000 -0700 @@ -8,7 +8,7 @@ endif obj-y := lpar.o hvCall.o nvram.o reconfig.o \ setup.o iommu.o ras.o rtasd.o \ - firmware.o power.o + firmware.o power.o dlpar.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_XICS) += xics.o obj-$(CONFIG_SCANLOG) += scanlog.o @@ -16,6 +16,7 @@ obj-$(CONFIG_EEH) += eeh.o eeh_cache.o e obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_PCI) += pci.o pci_dlpar.o obj-$(CONFIG_PSERIES_MSI) += msi.o +obj-$(CONFIG_PSERIES_ENERGY) += pseries_energy.o obj-$(CONFIG_HOTPLUG_CPU) += hotplug-cpu.o obj-$(CONFIG_MEMORY_HOTPLUG) += hotplug-memory.o @@ -23,6 +24,9 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += hotplug- obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o obj-$(CONFIG_HVCS) += hvcserver.o obj-$(CONFIG_HCALL_STATS) += hvCall_inst.o -obj-$(CONFIG_PHYP_DUMP) += phyp_dump.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_DTL) += dtl.o + +ifeq ($(CONFIG_PPC_PSERIES),y) +obj-$(CONFIG_SUSPEND) += suspend.o +endif diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/msi.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/msi.c --- linux-2.6.32/arch/powerpc/platforms/pseries/msi.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/msi.c 2015-03-10 13:22:16.000000000 -0700 @@ -93,8 +93,18 @@ static void rtas_disable_msi(struct pci_ if (!pdn) return; - if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0) - pr_debug("rtas_msi: Setting MSIs to 0 failed!\n"); + /* + * disabling MSI with the explicit interface also disables MSI-X + */ + if (rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, 0) != 0) { + /* + * may have failed because explicit interface is not + * present + */ + if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0) { + pr_debug("rtas_msi: Setting MSIs to 0 failed!\n"); + } + } } static int rtas_query_irq_number(struct pci_dn *pdn, int offset) diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/nvram.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/nvram.c --- linux-2.6.32/arch/powerpc/platforms/pseries/nvram.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/nvram.c 2015-03-10 13:24:13.000000000 -0700 @@ -17,17 +17,110 @@ #include #include #include +#include +#include +#include #include #include #include #include #include +/* + * Set oops header version to distingush between old and new format header. + * lnx,oops-log partition max size is 4000, header version > 4000 will + * help in identifying new header. + */ +#define OOPS_HDR_VERSION 5000 + static unsigned int nvram_size; static int nvram_fetch, nvram_store; static char nvram_buf[NVRW_CNT]; /* assume this is in the first 4GB */ static DEFINE_SPINLOCK(nvram_lock); +struct err_log_info { + int error_type; + unsigned int seq_num; +}; + +struct nvram_os_partition { + const char *name; + int req_size; /* desired size, in bytes */ + int min_size; /* minimum acceptable size (0 means req_size) */ + long size; /* size of data portion (excluding err_log_info) */ + long index; /* offset of data portion of partition */ + bool os_partition; /* partition initialized by OS, not FW */ +}; + +static struct nvram_os_partition rtas_log_partition = { + .name = "ibm,rtas-log", + .req_size = 2079, + .min_size = 1055, + .index = -1, + .os_partition = true +}; + +static struct nvram_os_partition oops_log_partition = { + .name = "lnx,oops-log", + .req_size = 4000, + .min_size = 2000, + .index = -1, + .os_partition = true +}; + +static const char *const pseries_nvram_os_partitions[] = { + "ibm,rtas-log", + "lnx,oops-log", + NULL +}; + +struct oops_log_info { + u16 version; + u16 report_length; + u64 timestamp; +} __attribute__((packed)); + +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason, + const char *old_msgs, unsigned long old_len, + const char *new_msgs, unsigned long new_len); + +static struct kmsg_dumper nvram_kmsg_dumper = { + .dump = oops_to_nvram +}; + +/* See clobbering_unread_rtas_event() */ +#define NVRAM_RTAS_READ_TIMEOUT 5 /* seconds */ +static unsigned long last_unread_rtas_event; /* timestamp */ + +/* We preallocate oops_buf during init to avoid kmalloc during oops/panic. */ +static char *oops_data; +static size_t oops_data_sz; +static char *oops_buf; + +#ifdef CONFIG_PSTORE +static struct nvram_os_partition of_config_partition = { + .name = "of-config", + .index = -1, + .os_partition = false +}; + +static struct nvram_os_partition common_partition = { + .name = "common", + .index = -1, + .os_partition = false +}; + +static enum pstore_type_id nvram_type_ids[] = { + PSTORE_TYPE_DMESG, + PSTORE_TYPE_PPC_RTAS, + PSTORE_TYPE_PPC_OF, + PSTORE_TYPE_PPC_COMMON, + -1 +}; +static int read_type; +static unsigned long last_rtas_event; +#endif static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index) { @@ -120,6 +213,476 @@ static ssize_t pSeries_nvram_get_size(vo return nvram_size ? nvram_size : -ENODEV; } + +/* nvram_write_os_partition, nvram_write_error_log + * + * We need to buffer the error logs into nvram to ensure that we have + * the failure information to decode. If we have a severe error there + * is no way to guarantee that the OS or the machine is in a state to + * get back to user land and write the error to disk. For example if + * the SCSI device driver causes a Machine Check by writing to a bad + * IO address, there is no way of guaranteeing that the device driver + * is in any state that is would also be able to write the error data + * captured to disk, thus we buffer it in NVRAM for analysis on the + * next boot. + * + * In NVRAM the partition containing the error log buffer will looks like: + * Header (in bytes): + * +-----------+----------+--------+------------+------------------+ + * | signature | checksum | length | name | data | + * |0 |1 |2 3|4 15|16 length-1| + * +-----------+----------+--------+------------+------------------+ + * + * The 'data' section would look like (in bytes): + * +--------------+------------+-----------------------------------+ + * | event_logged | sequence # | error log | + * |0 3|4 7|8 error_log_size-1| + * +--------------+------------+-----------------------------------+ + * + * event_logged: 0 if event has not been logged to syslog, 1 if it has + * sequence #: The unique sequence # for each event. (until it wraps) + * error log: The error log from event_scan + */ +int nvram_write_os_partition(struct nvram_os_partition *part, char * buff, + int length, unsigned int err_type, unsigned int error_log_cnt) +{ + int rc; + loff_t tmp_index; + struct err_log_info info; + + if (part->index < 0) { + return -ESPIPE; + } + + if (length > part->size) { + length = part->size; + } + + info.error_type = err_type; + info.seq_num = error_log_cnt; + + tmp_index = part->index; + + rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index); + if (rc <= 0) { + pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc); + return rc; + } + + rc = ppc_md.nvram_write(buff, length, &tmp_index); + if (rc <= 0) { + pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc); + return rc; + } + + return 0; +} + +int nvram_write_error_log(char * buff, int length, + unsigned int err_type, unsigned int error_log_cnt) +{ + int rc = nvram_write_os_partition(&rtas_log_partition, buff, length, + err_type, error_log_cnt); + if (!rc) { + last_unread_rtas_event = get_seconds(); +#ifdef CONFIG_PSTORE + last_rtas_event = get_seconds(); +#endif + } + + return rc; +} + +/* nvram_read_partition + * + * Reads nvram partition for at most 'length' + */ +int nvram_read_partition(struct nvram_os_partition *part, char *buff, + int length, unsigned int *err_type, + unsigned int *error_log_cnt) +{ + int rc; + loff_t tmp_index; + struct err_log_info info; + + if (part->index == -1) + return -1; + + if (length > part->size) + length = part->size; + + tmp_index = part->index; + + if (part->os_partition) { + rc = ppc_md.nvram_read((char *)&info, + sizeof(struct err_log_info), + &tmp_index); + if (rc <= 0) { + pr_err("%s: Failed nvram_read (%d)\n", __FUNCTION__, + rc); + return rc; + } + } + + rc = ppc_md.nvram_read(buff, length, &tmp_index); + if (rc <= 0) { + pr_err("%s: Failed nvram_read (%d)\n", __FUNCTION__, rc); + return rc; + } + + if (part->os_partition) { + *error_log_cnt = info.seq_num; + *err_type = info.error_type; + } + + return 0; +} + +/* nvram_read_error_log + * + * Reads nvram for error log for at most 'length' + */ +int nvram_read_error_log(char *buff, int length, + unsigned int *err_type, unsigned int *error_log_cnt) +{ + return nvram_read_partition(&rtas_log_partition, buff, length, + err_type, error_log_cnt); +} + + +/* This doesn't actually zero anything, but it sets the event_logged + * word to tell that this event is safely in syslog. + */ +int nvram_clear_error_log(void) +{ + loff_t tmp_index; + int clear_word = ERR_FLAG_ALREADY_LOGGED; + int rc; + + if (rtas_log_partition.index < 0) + return -1; + + tmp_index = rtas_log_partition.index; + + rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index); + if (rc <= 0) { + printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc); + return rc; + } + last_unread_rtas_event = 0; + + return 0; +} + +/* pseries_nvram_init_os_partition + * + * This sets up a partition with an "OS" signature. + * + * The general strategy is the following: + * 1.) If a partition with the indicated name already exists... + * - If it's large enough, use it. + * - Otherwise, recycle it and keep going. + * 2.) Search for a free partition that is large enough. + * 3.) If there's not a free partition large enough, recycle any obsolete + * OS partitions and try again. + * 4.) Will first try getting a chunk that will satisfy the requested size. + * 5.) If a chunk of the requested size cannot be allocated, then try finding + * a chunk that will satisfy the minimum needed. + * + * Returns 0 on success, else -1. + */ +static int __init pseries_nvram_init_os_partition(struct nvram_os_partition + *part) +{ + loff_t p; + int size; + + /* Scan nvram for partitions */ + nvram_scan_partitions(); + + /* Look for ours */ + p = nvram_find_partition2(part->name, NVRAM_SIG_OS, &size); + + /* Found one but too small, remove it */ + if (p && size < part->min_size) { + pr_info("nvram: Found too small %s partition," + " removing it...\n", part->name); + nvram_remove_partition(part->name, NVRAM_SIG_OS, NULL); + p = 0; + } + + /* Create one if we didn't find */ + if (!p) { + p = nvram_create_partition(part->name, NVRAM_SIG_OS, + part->req_size, part->min_size); + if (p == -ENOSPC) { + pr_info("nvram: No room to create %s partition, " + "deleting any obsolete OS partitions...\n", + part->name); + nvram_remove_partition(NULL, NVRAM_SIG_OS, + pseries_nvram_os_partitions); + p = nvram_create_partition(part->name, NVRAM_SIG_OS, + part->req_size, part->min_size); + } + } + + if (p <= 0) { + pr_err("nvram: Failed to find or create %s" + " partition, err %d\n", part->name, (int)p); + return -1; + } + + part->index = p; + part->size = nvram_get_partition_size(p) - sizeof(struct err_log_info); + + return 0; +} + +/* + * Are we using the ibm,rtas-log for oops/panic reports? And if so, + * would logging this oops/panic overwrite an RTAS event that rtas_errd + * hasn't had a chance to read and process? Return 1 if so, else 0. + * + * We assume that if rtas_errd hasn't read the RTAS event in + * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to. + */ +static int clobbering_unread_rtas_event(void) +{ + return (oops_log_partition.index == rtas_log_partition.index + && last_unread_rtas_event + && get_seconds() - last_unread_rtas_event <= + NVRAM_RTAS_READ_TIMEOUT); +} + +#ifdef CONFIG_PSTORE +static int nvram_pstore_open(struct pstore_info *psi) +{ + /* Reset the iterator to start reading partitions again */ + read_type = -1; + return 0; +} + +/** + * nvram_pstore_write - pstore write callback for nvram + * @type: Type of message logged + * @reason: reason behind dump (oops/panic) + * @id: identifier to indicate the write performed + * @part: pstore writes data to registered buffer in parts, + * part number will indicate the same. + * @count: Indicates oops count + * @size: number of bytes written to the registered buffer + * @psi: registered pstore_info structure + * + * Called by pstore_dump() when an oops or panic report is logged in the + * printk buffer. + * Returns 0 on successful write. + */ +static int nvram_pstore_write(enum pstore_type_id type, + enum kmsg_dump_reason reason, + u64 *id, unsigned int part, int count, + size_t size, struct pstore_info *psi) +{ + int rc; + struct oops_log_info *oops_hdr = (struct oops_log_info *) oops_buf; + + /* part 1 has the recent messages from printk buffer */ + if (part > 1 || type != PSTORE_TYPE_DMESG || + clobbering_unread_rtas_event()) + return -1; + + oops_hdr->version = OOPS_HDR_VERSION; + oops_hdr->report_length = (u16) size; + oops_hdr->timestamp = get_seconds(); + rc = nvram_write_os_partition(&oops_log_partition, oops_buf, + (int) (sizeof(*oops_hdr) + size), ERR_TYPE_KERNEL_PANIC, + count); + + if (rc != 0) + return rc; + + *id = part; + return 0; +} + +/* + * Reads the oops/panic report, rtas, of-config and common partition. + * Returns the length of the data we read from each partition. + * Returns 0 if we've been called before. + */ +static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type, + int *count, struct timespec *time, char **buf, + struct pstore_info *psi) +{ + struct oops_log_info *oops_hdr; + unsigned int err_type, id_no, size = 0; + struct nvram_os_partition *part = NULL; + char *buff = NULL; + int sig = 0; + loff_t p; + + read_type++; + + switch (nvram_type_ids[read_type]) { + case PSTORE_TYPE_DMESG: + part = &oops_log_partition; + *type = PSTORE_TYPE_DMESG; + break; + case PSTORE_TYPE_PPC_RTAS: + part = &rtas_log_partition; + *type = PSTORE_TYPE_PPC_RTAS; + time->tv_sec = last_rtas_event; + time->tv_nsec = 0; + break; + case PSTORE_TYPE_PPC_OF: + sig = NVRAM_SIG_OF; + part = &of_config_partition; + *type = PSTORE_TYPE_PPC_OF; + *id = PSTORE_TYPE_PPC_OF; + time->tv_sec = 0; + time->tv_nsec = 0; + break; + case PSTORE_TYPE_PPC_COMMON: + sig = NVRAM_SIG_SYS; + part = &common_partition; + *type = PSTORE_TYPE_PPC_COMMON; + *id = PSTORE_TYPE_PPC_COMMON; + time->tv_sec = 0; + time->tv_nsec = 0; + break; + default: + return 0; + } + + if (!part->os_partition) { + p = nvram_find_partition2(part->name, sig, &size); + if (p <= 0) { + pr_err("nvram: Failed to find partition %s, " + "err %d\n", part->name, (int)p); + return 0; + } + part->index = p; + part->size = size; + } + + buff = kmalloc(part->size, GFP_KERNEL); + + if (!buff) + return -ENOMEM; + + if (nvram_read_partition(part, buff, part->size, &err_type, &id_no)) { + kfree(buff); + return 0; + } + + *count = 0; + + if (part->os_partition) + *id = id_no; + + if (nvram_type_ids[read_type] == PSTORE_TYPE_DMESG) { + int length; + size_t hdr_size; + + oops_hdr = (struct oops_log_info *)buff; + if (oops_hdr->version < OOPS_HDR_VERSION) { + hdr_size = sizeof(u16); + length = oops_hdr->version; + time->tv_sec = 0; + time->tv_nsec = 0; + } else { + hdr_size = sizeof(*oops_hdr); + length = oops_hdr->report_length; + time->tv_sec = oops_hdr->timestamp; + time->tv_nsec = 0; + } + + *buf = kmalloc(length, GFP_KERNEL); + if (*buf == NULL) + return -ENOMEM; + memcpy(*buf, buff + hdr_size, length); + kfree(buff); + return length; + } + + *buf = buff; + return part->size; +} + +static struct pstore_info nvram_pstore_info = { + .owner = THIS_MODULE, + .name = "nvram", + .open = nvram_pstore_open, + .read = nvram_pstore_read, + .write = nvram_pstore_write, +}; + +static int nvram_pstore_init(void) +{ + int rc = 0; + + nvram_pstore_info.buf = oops_data; + nvram_pstore_info.bufsize = oops_data_sz; + + rc = pstore_register(&nvram_pstore_info); + if (rc != 0) + pr_err("nvram: pstore_register() failed, defaults to " + "kmsg_dump; returned %d\n", rc); + + return rc; +} +#else +static int nvram_pstore_init(void) +{ + return -1; +} +#endif + +static void __init nvram_init_oops_partition(int rtas_partition_exists) +{ + int rc; + + rc = pseries_nvram_init_os_partition(&oops_log_partition); + if (rc != 0) { + if (!rtas_partition_exists) + return; + pr_notice("nvram: Using %s partition to log both" + " RTAS errors and oops/panic reports\n", + rtas_log_partition.name); + memcpy(&oops_log_partition, &rtas_log_partition, + sizeof(rtas_log_partition)); + } + oops_buf = kmalloc(oops_log_partition.size, GFP_KERNEL); + if (!oops_buf) { + pr_err("nvram: No memory for %s partition\n", + oops_log_partition.name); + return; + } + oops_data = oops_buf + sizeof(struct oops_log_info); + oops_data_sz = oops_log_partition.size - sizeof(struct oops_log_info); + + rc = nvram_pstore_init(); + + if (!rc) + return; + + rc = kmsg_dump_register(&nvram_kmsg_dumper); + if (rc != 0) { + pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc); + kfree(oops_buf); + return; + } +} + +static int __init pseries_nvram_init_log_partitions(void) +{ + int rc; + + rc = pseries_nvram_init_os_partition(&rtas_log_partition); + nvram_init_oops_partition(rc == 0); + return 0; +} +machine_arch_initcall(pseries, pseries_nvram_init_log_partitions); + int __init pSeries_nvram_init(void) { struct device_node *nvram; @@ -149,3 +712,50 @@ int __init pSeries_nvram_init(void) return 0; } + +/* + * Try to capture the last capture_len bytes of the printk buffer. Return + * the amount actually captured. + */ +static size_t capture_last_msgs(const char *old_msgs, size_t old_len, + const char *new_msgs, size_t new_len, + char *captured, size_t capture_len) +{ + if (new_len >= capture_len) { + memcpy(captured, new_msgs + (new_len - capture_len), + capture_len); + return capture_len; + } else { + /* Grab the end of old_msgs. */ + size_t old_tail_len = min(old_len, capture_len - new_len); + memcpy(captured, old_msgs + (old_len - old_tail_len), + old_tail_len); + memcpy(captured + old_tail_len, new_msgs, new_len); + return old_tail_len + new_len; + } +} + + +/* our kmsg_dump callback */ +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason, + const char *old_msgs, unsigned long old_len, + const char *new_msgs, unsigned long new_len) +{ + struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf; + static unsigned int oops_count = 0; + size_t text_len; + + if (clobbering_unread_rtas_event()) + return; + + text_len = capture_last_msgs(old_msgs, old_len, new_msgs, new_len, + oops_data, oops_data_sz); + oops_hdr->version = OOPS_HDR_VERSION; + oops_hdr->report_length = (u16) text_len; + oops_hdr->timestamp = get_seconds(); + + nvram_write_os_partition(&oops_log_partition, oops_buf, + (int) (sizeof(*oops_hdr) + text_len), + ERR_TYPE_KERNEL_PANIC, ++oops_count); +} diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/offline_states.h linux-2.6.32.ovz/arch/powerpc/platforms/pseries/offline_states.h --- linux-2.6.32/arch/powerpc/platforms/pseries/offline_states.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/offline_states.h 2015-03-10 13:21:24.000000000 -0700 @@ -0,0 +1,19 @@ +#ifndef _OFFLINE_STATES_H_ +#define _OFFLINE_STATES_H_ + +/* Cpu offline states go here */ +enum cpu_state_vals { + CPU_STATE_OFFLINE, + CPU_STATE_INACTIVE, + CPU_STATE_ONLINE, + CPU_MAX_OFFLINE_STATES +}; + +extern enum cpu_state_vals get_cpu_current_state(int cpu); +extern void set_cpu_current_state(int cpu, enum cpu_state_vals state); +extern enum cpu_state_vals get_preferred_offline_state(int cpu); +extern void set_preferred_offline_state(int cpu, enum cpu_state_vals state); +extern void set_default_offline_state(int cpu); +extern int start_secondary(void); +extern void start_secondary_resume(void); +#endif diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/phyp_dump.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/phyp_dump.c --- linux-2.6.32/arch/powerpc/platforms/pseries/phyp_dump.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/phyp_dump.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,512 +0,0 @@ -/* - * Hypervisor-assisted dump - * - * Linas Vepstas, Manish Ahuja 2008 - * Copyright 2008 IBM Corp. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -/* Variables, used to communicate data between early boot and late boot */ -static struct phyp_dump phyp_dump_vars; -struct phyp_dump *phyp_dump_info = &phyp_dump_vars; - -static int ibm_configure_kernel_dump; -/* ------------------------------------------------- */ -/* RTAS interfaces to declare the dump regions */ - -struct dump_section { - u32 dump_flags; - u16 source_type; - u16 error_flags; - u64 source_address; - u64 source_length; - u64 length_copied; - u64 destination_address; -}; - -struct phyp_dump_header { - u32 version; - u16 num_of_sections; - u16 status; - - u32 first_offset_section; - u32 dump_disk_section; - u64 block_num_dd; - u64 num_of_blocks_dd; - u32 offset_dd; - u32 maxtime_to_auto; - /* No dump disk path string used */ - - struct dump_section cpu_data; - struct dump_section hpte_data; - struct dump_section kernel_data; -}; - -/* The dump header *must be* in low memory, so .bss it */ -static struct phyp_dump_header phdr; - -#define NUM_DUMP_SECTIONS 3 -#define DUMP_HEADER_VERSION 0x1 -#define DUMP_REQUEST_FLAG 0x1 -#define DUMP_SOURCE_CPU 0x0001 -#define DUMP_SOURCE_HPTE 0x0002 -#define DUMP_SOURCE_RMO 0x0011 -#define DUMP_ERROR_FLAG 0x2000 -#define DUMP_TRIGGERED 0x4000 -#define DUMP_PERFORMED 0x8000 - - -/** - * init_dump_header() - initialize the header declaring a dump - * Returns: length of dump save area. - * - * When the hypervisor saves crashed state, it needs to put - * it somewhere. The dump header tells the hypervisor where - * the data can be saved. - */ -static unsigned long init_dump_header(struct phyp_dump_header *ph) -{ - unsigned long addr_offset = 0; - - /* Set up the dump header */ - ph->version = DUMP_HEADER_VERSION; - ph->num_of_sections = NUM_DUMP_SECTIONS; - ph->status = 0; - - ph->first_offset_section = - (u32)offsetof(struct phyp_dump_header, cpu_data); - ph->dump_disk_section = 0; - ph->block_num_dd = 0; - ph->num_of_blocks_dd = 0; - ph->offset_dd = 0; - - ph->maxtime_to_auto = 0; /* disabled */ - - /* The first two sections are mandatory */ - ph->cpu_data.dump_flags = DUMP_REQUEST_FLAG; - ph->cpu_data.source_type = DUMP_SOURCE_CPU; - ph->cpu_data.source_address = 0; - ph->cpu_data.source_length = phyp_dump_info->cpu_state_size; - ph->cpu_data.destination_address = addr_offset; - addr_offset += phyp_dump_info->cpu_state_size; - - ph->hpte_data.dump_flags = DUMP_REQUEST_FLAG; - ph->hpte_data.source_type = DUMP_SOURCE_HPTE; - ph->hpte_data.source_address = 0; - ph->hpte_data.source_length = phyp_dump_info->hpte_region_size; - ph->hpte_data.destination_address = addr_offset; - addr_offset += phyp_dump_info->hpte_region_size; - - /* This section describes the low kernel region */ - ph->kernel_data.dump_flags = DUMP_REQUEST_FLAG; - ph->kernel_data.source_type = DUMP_SOURCE_RMO; - ph->kernel_data.source_address = PHYP_DUMP_RMR_START; - ph->kernel_data.source_length = PHYP_DUMP_RMR_END; - ph->kernel_data.destination_address = addr_offset; - addr_offset += ph->kernel_data.source_length; - - return addr_offset; -} - -static void print_dump_header(const struct phyp_dump_header *ph) -{ -#ifdef DEBUG - if (ph == NULL) - return; - - printk(KERN_INFO "dump header:\n"); - /* setup some ph->sections required */ - printk(KERN_INFO "version = %d\n", ph->version); - printk(KERN_INFO "Sections = %d\n", ph->num_of_sections); - printk(KERN_INFO "Status = 0x%x\n", ph->status); - - /* No ph->disk, so all should be set to 0 */ - printk(KERN_INFO "Offset to first section 0x%x\n", - ph->first_offset_section); - printk(KERN_INFO "dump disk sections should be zero\n"); - printk(KERN_INFO "dump disk section = %d\n", ph->dump_disk_section); - printk(KERN_INFO "block num = %lld\n", ph->block_num_dd); - printk(KERN_INFO "number of blocks = %lld\n", ph->num_of_blocks_dd); - printk(KERN_INFO "dump disk offset = %d\n", ph->offset_dd); - printk(KERN_INFO "Max auto time= %d\n", ph->maxtime_to_auto); - - /*set cpu state and hpte states as well scratch pad area */ - printk(KERN_INFO " CPU AREA \n"); - printk(KERN_INFO "cpu dump_flags =%d\n", ph->cpu_data.dump_flags); - printk(KERN_INFO "cpu source_type =%d\n", ph->cpu_data.source_type); - printk(KERN_INFO "cpu error_flags =%d\n", ph->cpu_data.error_flags); - printk(KERN_INFO "cpu source_address =%llx\n", - ph->cpu_data.source_address); - printk(KERN_INFO "cpu source_length =%llx\n", - ph->cpu_data.source_length); - printk(KERN_INFO "cpu length_copied =%llx\n", - ph->cpu_data.length_copied); - - printk(KERN_INFO " HPTE AREA \n"); - printk(KERN_INFO "HPTE dump_flags =%d\n", ph->hpte_data.dump_flags); - printk(KERN_INFO "HPTE source_type =%d\n", ph->hpte_data.source_type); - printk(KERN_INFO "HPTE error_flags =%d\n", ph->hpte_data.error_flags); - printk(KERN_INFO "HPTE source_address =%llx\n", - ph->hpte_data.source_address); - printk(KERN_INFO "HPTE source_length =%llx\n", - ph->hpte_data.source_length); - printk(KERN_INFO "HPTE length_copied =%llx\n", - ph->hpte_data.length_copied); - - printk(KERN_INFO " SRSD AREA \n"); - printk(KERN_INFO "SRSD dump_flags =%d\n", ph->kernel_data.dump_flags); - printk(KERN_INFO "SRSD source_type =%d\n", ph->kernel_data.source_type); - printk(KERN_INFO "SRSD error_flags =%d\n", ph->kernel_data.error_flags); - printk(KERN_INFO "SRSD source_address =%llx\n", - ph->kernel_data.source_address); - printk(KERN_INFO "SRSD source_length =%llx\n", - ph->kernel_data.source_length); - printk(KERN_INFO "SRSD length_copied =%llx\n", - ph->kernel_data.length_copied); -#endif -} - -static ssize_t show_phyp_dump_active(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - - /* create filesystem entry so kdump is phyp-dump aware */ - return sprintf(buf, "%lx\n", phyp_dump_info->phyp_dump_at_boot); -} - -static struct kobj_attribute pdl = __ATTR(phyp_dump_active, 0600, - show_phyp_dump_active, - NULL); - -static void register_dump_area(struct phyp_dump_header *ph, unsigned long addr) -{ - int rc; - - /* Add addr value if not initialized before */ - if (ph->cpu_data.destination_address == 0) { - ph->cpu_data.destination_address += addr; - ph->hpte_data.destination_address += addr; - ph->kernel_data.destination_address += addr; - } - - /* ToDo Invalidate kdump and free memory range. */ - - do { - rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL, - 1, ph, sizeof(struct phyp_dump_header)); - } while (rtas_busy_delay(rc)); - - if (rc) { - printk(KERN_ERR "phyp-dump: unexpected error (%d) on " - "register\n", rc); - print_dump_header(ph); - return; - } - - rc = sysfs_create_file(kernel_kobj, &pdl.attr); - if (rc) - printk(KERN_ERR "phyp-dump: unable to create sysfs" - " file (%d)\n", rc); -} - -static -void invalidate_last_dump(struct phyp_dump_header *ph, unsigned long addr) -{ - int rc; - - /* Add addr value if not initialized before */ - if (ph->cpu_data.destination_address == 0) { - ph->cpu_data.destination_address += addr; - ph->hpte_data.destination_address += addr; - ph->kernel_data.destination_address += addr; - } - - do { - rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL, - 2, ph, sizeof(struct phyp_dump_header)); - } while (rtas_busy_delay(rc)); - - if (rc) { - printk(KERN_ERR "phyp-dump: unexpected error (%d) " - "on invalidate\n", rc); - print_dump_header(ph); - } -} - -/* ------------------------------------------------- */ -/** - * release_memory_range -- release memory previously lmb_reserved - * @start_pfn: starting physical frame number - * @nr_pages: number of pages to free. - * - * This routine will release memory that had been previously - * lmb_reserved in early boot. The released memory becomes - * available for genreal use. - */ -static void release_memory_range(unsigned long start_pfn, - unsigned long nr_pages) -{ - struct page *rpage; - unsigned long end_pfn; - long i; - - end_pfn = start_pfn + nr_pages; - - for (i = start_pfn; i <= end_pfn; i++) { - rpage = pfn_to_page(i); - if (PageReserved(rpage)) { - ClearPageReserved(rpage); - init_page_count(rpage); - __free_page(rpage); - totalram_pages++; - } - } -} - -/** - * track_freed_range -- Counts the range being freed. - * Once the counter goes to zero, it re-registers dump for - * future use. - */ -static void -track_freed_range(unsigned long addr, unsigned long length) -{ - static unsigned long scratch_area_size, reserved_area_size; - - if (addr < phyp_dump_info->init_reserve_start) - return; - - if ((addr >= phyp_dump_info->init_reserve_start) && - (addr <= phyp_dump_info->init_reserve_start + - phyp_dump_info->init_reserve_size)) - reserved_area_size += length; - - if ((addr >= phyp_dump_info->reserved_scratch_addr) && - (addr <= phyp_dump_info->reserved_scratch_addr + - phyp_dump_info->reserved_scratch_size)) - scratch_area_size += length; - - if ((reserved_area_size == phyp_dump_info->init_reserve_size) && - (scratch_area_size == phyp_dump_info->reserved_scratch_size)) { - - invalidate_last_dump(&phdr, - phyp_dump_info->reserved_scratch_addr); - register_dump_area(&phdr, - phyp_dump_info->reserved_scratch_addr); - } -} - -/* ------------------------------------------------- */ -/** - * sysfs_release_region -- sysfs interface to release memory range. - * - * Usage: - * "echo > /sys/kernel/release_region" - * - * Example: - * "echo 0x40000000 0x10000000 > /sys/kernel/release_region" - * - * will release 256MB starting at 1GB. - */ -static ssize_t store_release_region(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t count) -{ - unsigned long start_addr, length, end_addr; - unsigned long start_pfn, nr_pages; - ssize_t ret; - - ret = sscanf(buf, "%lx %lx", &start_addr, &length); - if (ret != 2) - return -EINVAL; - - track_freed_range(start_addr, length); - - /* Range-check - don't free any reserved memory that - * wasn't reserved for phyp-dump */ - if (start_addr < phyp_dump_info->init_reserve_start) - start_addr = phyp_dump_info->init_reserve_start; - - end_addr = phyp_dump_info->init_reserve_start + - phyp_dump_info->init_reserve_size; - if (start_addr+length > end_addr) - length = end_addr - start_addr; - - /* Release the region of memory assed in by user */ - start_pfn = PFN_DOWN(start_addr); - nr_pages = PFN_DOWN(length); - release_memory_range(start_pfn, nr_pages); - - return count; -} - -static ssize_t show_release_region(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) -{ - u64 second_addr_range; - - /* total reserved size - start of scratch area */ - second_addr_range = phyp_dump_info->init_reserve_size - - phyp_dump_info->reserved_scratch_size; - return sprintf(buf, "CPU:0x%llx-0x%llx: HPTE:0x%llx-0x%llx:" - " DUMP:0x%llx-0x%llx, 0x%lx-0x%llx:\n", - phdr.cpu_data.destination_address, - phdr.cpu_data.length_copied, - phdr.hpte_data.destination_address, - phdr.hpte_data.length_copied, - phdr.kernel_data.destination_address, - phdr.kernel_data.length_copied, - phyp_dump_info->init_reserve_start, - second_addr_range); -} - -static struct kobj_attribute rr = __ATTR(release_region, 0600, - show_release_region, - store_release_region); - -static int __init phyp_dump_setup(void) -{ - struct device_node *rtas; - const struct phyp_dump_header *dump_header = NULL; - unsigned long dump_area_start; - unsigned long dump_area_length; - int header_len = 0; - int rc; - - /* If no memory was reserved in early boot, there is nothing to do */ - if (phyp_dump_info->init_reserve_size == 0) - return 0; - - /* Return if phyp dump not supported */ - if (!phyp_dump_info->phyp_dump_configured) - return -ENOSYS; - - /* Is there dump data waiting for us? If there isn't, - * then register a new dump area, and release all of - * the rest of the reserved ram. - * - * The /rtas/ibm,kernel-dump rtas node is present only - * if there is dump data waiting for us. - */ - rtas = of_find_node_by_path("/rtas"); - if (rtas) { - dump_header = of_get_property(rtas, "ibm,kernel-dump", - &header_len); - of_node_put(rtas); - } - - ibm_configure_kernel_dump = rtas_token("ibm,configure-kernel-dump"); - - print_dump_header(dump_header); - dump_area_length = init_dump_header(&phdr); - /* align down */ - dump_area_start = phyp_dump_info->init_reserve_start & PAGE_MASK; - - if (dump_header == NULL) { - register_dump_area(&phdr, dump_area_start); - return 0; - } - - /* re-register the dump area, if old dump was invalid */ - if ((dump_header) && (dump_header->status & DUMP_ERROR_FLAG)) { - invalidate_last_dump(&phdr, dump_area_start); - register_dump_area(&phdr, dump_area_start); - return 0; - } - - if (dump_header) { - phyp_dump_info->reserved_scratch_addr = - dump_header->cpu_data.destination_address; - phyp_dump_info->reserved_scratch_size = - dump_header->cpu_data.source_length + - dump_header->hpte_data.source_length + - dump_header->kernel_data.source_length; - } - - /* Should we create a dump_subsys, analogous to s390/ipl.c ? */ - rc = sysfs_create_file(kernel_kobj, &rr.attr); - if (rc) - printk(KERN_ERR "phyp-dump: unable to create sysfs file (%d)\n", - rc); - - /* ToDo: re-register the dump area, for next time. */ - return 0; -} -machine_subsys_initcall(pseries, phyp_dump_setup); - -int __init early_init_dt_scan_phyp_dump(unsigned long node, - const char *uname, int depth, void *data) -{ - const unsigned int *sizes; - - phyp_dump_info->phyp_dump_configured = 0; - phyp_dump_info->phyp_dump_is_active = 0; - - if (depth != 1 || strcmp(uname, "rtas") != 0) - return 0; - - if (of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL)) - phyp_dump_info->phyp_dump_configured++; - - if (of_get_flat_dt_prop(node, "ibm,dump-kernel", NULL)) - phyp_dump_info->phyp_dump_is_active++; - - sizes = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes", - NULL); - if (!sizes) - return 0; - - if (sizes[0] == 1) - phyp_dump_info->cpu_state_size = *((unsigned long *)&sizes[1]); - - if (sizes[3] == 2) - phyp_dump_info->hpte_region_size = - *((unsigned long *)&sizes[4]); - return 1; -} - -/* Look for phyp_dump= cmdline option */ -static int __init early_phyp_dump_enabled(char *p) -{ - phyp_dump_info->phyp_dump_at_boot = 1; - - if (!p) - return 0; - - if (strncmp(p, "1", 1) == 0) - phyp_dump_info->phyp_dump_at_boot = 1; - else if (strncmp(p, "0", 1) == 0) - phyp_dump_info->phyp_dump_at_boot = 0; - - return 0; -} -early_param("phyp_dump", early_phyp_dump_enabled); - -/* Look for phyp_dump_reserve_size= cmdline option */ -static int __init early_phyp_dump_reserve_size(char *p) -{ - if (p) - phyp_dump_info->reserve_bootvar = memparse(p, &p); - - return 0; -} -early_param("phyp_dump_reserve_size", early_phyp_dump_reserve_size); diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/plpar_wrappers.h linux-2.6.32.ovz/arch/powerpc/platforms/pseries/plpar_wrappers.h --- linux-2.6.32/arch/powerpc/platforms/pseries/plpar_wrappers.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/plpar_wrappers.h 2015-03-10 13:22:52.000000000 -0700 @@ -4,16 +4,46 @@ #include #include +/* Get state of physical CPU from query_cpu_stopped */ +int smp_query_cpu_stopped(unsigned int pcpu); +#define QCSS_STOPPED 0 +#define QCSS_STOPPING 1 +#define QCSS_NOT_STOPPED 2 +#define QCSS_HARDWARE_ERROR -1 +#define QCSS_HARDWARE_BUSY -2 + static inline long poll_pending(void) { return plpar_hcall_norets(H_POLL_PENDING); } +static inline u8 get_cede_latency_hint(void) +{ + return get_lppaca()->gpr5_dword.fields.cede_latency_hint; +} + +static inline void set_cede_latency_hint(u8 latency_hint) +{ + get_lppaca()->gpr5_dword.fields.cede_latency_hint = latency_hint; +} + static inline long cede_processor(void) { return plpar_hcall_norets(H_CEDE); } +static inline long extended_cede_processor(unsigned long latency_hint) +{ + long rc; + u8 old_latency_hint = get_cede_latency_hint(); + + set_cede_latency_hint(latency_hint); + rc = cede_processor(); + set_cede_latency_hint(old_latency_hint); + + return rc; +} + static inline long vpa_call(unsigned long flags, unsigned long cpu, unsigned long vpa) { @@ -43,9 +73,9 @@ static inline long register_slb_shadow(u return vpa_call(0x3, cpu, vpa); } -static inline long unregister_dtl(unsigned long cpu, unsigned long vpa) +static inline long unregister_dtl(unsigned long cpu) { - return vpa_call(0x6, cpu, vpa); + return vpa_call(0x6, cpu, 0); } static inline long register_dtl(unsigned long cpu, unsigned long vpa) @@ -249,4 +279,10 @@ static inline long plpar_xirr(unsigned l return rc; } +#ifdef CONFIG_SUSPEND +int pseries_suspend_cpu(void); +#else +static inline int pseries_suspend_cpu(void) {return 0;} +#endif + #endif /* _PSERIES_PLPAR_WRAPPERS_H */ diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/pseries_energy.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/pseries_energy.c --- linux-2.6.32/arch/powerpc/platforms/pseries/pseries_energy.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/pseries_energy.c 2015-03-10 13:22:01.000000000 -0700 @@ -0,0 +1,358 @@ +/* + * POWER platform energy management driver + * Copyright (C) 2010 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This pseries platform device driver provides access to + * platform energy management capabilities. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define MODULE_VERS "1.0" +#define MODULE_NAME "pseries_energy" + +/* New hcall number */ + +#define H_BEST_ENERGY 0x2F4 + +/* Driver flags */ + +static int sysfs_entries; + +static int nr_threads_per_core; + +/* Helper routines */ + +/* CPU to core index mapping */ + +#define cpu_core_index_of_thread(cpu) ((cpu) / nr_threads_per_core) +#define cpu_first_thread_of_core(core) ((core) * nr_threads_per_core) + +/* Get the threads-per-core from device tree */ + +static void get_threads_per_core(void) +{ + struct device_node *dn = NULL; + int len; + const int *intserv; + + nr_threads_per_core = 1; /* default */ + + dn = of_find_node_by_type(NULL, "cpu"); + intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); + if (intserv) + nr_threads_per_core = len / sizeof(int); + + of_node_put(dn); +} + +/* + * Routine to detect firmware support for hcall + * return 1 if H_BEST_ENERGY is supported + * else return 0 + */ + +static int check_for_h_best_energy(void) +{ + struct device_node *rtas = NULL; + const char *hypertas, *s; + int length; + int rc = 0; + + rtas = of_find_node_by_path("/rtas"); + if (!rtas) + return 0; + + hypertas = of_get_property(rtas, "ibm,hypertas-functions", &length); + if (!hypertas) { + of_node_put(rtas); + return 0; + } + + /* hypertas will have list of strings with hcall names */ + for (s = hypertas; s < hypertas + length; s += strlen(s) + 1) { + if (!strncmp("hcall-best-energy-1", s, 19)) { + rc = 1; /* Found the string */ + break; + } + } + of_node_put(rtas); + return rc; +} + +/* Helper Routines to convert between drc_index to cpu numbers */ + +static u32 cpu_to_drc_index(int cpu) +{ + struct device_node *dn = NULL; + const int *indexes; + int i; + int rc = 1; + u32 ret = 0; + + dn = of_find_node_by_path("/cpus"); + if (dn == NULL) + goto err; + indexes = of_get_property(dn, "ibm,drc-indexes", NULL); + if (indexes == NULL) + goto err_of_node_put; + /* Convert logical cpu number to core number */ + i = cpu_core_index_of_thread(cpu); + /* + * The first element indexes[0] is the number of drc_indexes + * returned in the list. Hence i+1 will get the drc_index + * corresponding to core number i. + */ + WARN_ON(i > indexes[0]); + ret = indexes[i + 1]; + rc = 0; + +err_of_node_put: + of_node_put(dn); +err: + if (rc) + printk(KERN_WARNING "cpu_to_drc_index(%d) failed", cpu); + return ret; +} + +static int drc_index_to_cpu(u32 drc_index) +{ + struct device_node *dn = NULL; + const int *indexes; + int i, cpu = 0; + int rc = 1; + + dn = of_find_node_by_path("/cpus"); + if (dn == NULL) + goto err; + indexes = of_get_property(dn, "ibm,drc-indexes", NULL); + if (indexes == NULL) + goto err_of_node_put; + /* + * First element in the array is the number of drc_indexes + * returned. Search through the list to find the matching + * drc_index and get the core number + */ + for (i = 0; i < indexes[0]; i++) { + if (indexes[i + 1] == drc_index) + break; + } + /* Convert core number to logical cpu number */ + cpu = cpu_first_thread_of_core(i); + rc = 0; + +err_of_node_put: + of_node_put(dn); +err: + if (rc) + printk(KERN_WARNING "drc_index_to_cpu(%d) failed", drc_index); + return cpu; +} + +/* + * pseries hypervisor call H_BEST_ENERGY provides hints to OS on + * preferred logical cpus to activate or deactivate for optimized + * energy consumption. + */ + +#define FLAGS_MODE1 0x004E200000080E01 +#define FLAGS_MODE2 0x004E200000080401 +#define FLAGS_ACTIVATE 0x100 + +static ssize_t get_best_energy_list(char *page, int activate) +{ + int rc, cnt, i, cpu; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + unsigned long flags = 0; + u32 *buf_page; + char *s = page; + + buf_page = (u32 *) get_zeroed_page(GFP_KERNEL); + if (!buf_page) + return -ENOMEM; + + flags = FLAGS_MODE1; + if (activate) + flags |= FLAGS_ACTIVATE; + + rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags, 0, __pa(buf_page), + 0, 0, 0, 0, 0, 0); + if (rc != H_SUCCESS) { + free_page((unsigned long) buf_page); + return -EINVAL; + } + + cnt = retbuf[0]; + for (i = 0; i < cnt; i++) { + cpu = drc_index_to_cpu(buf_page[2*i+1]); + if ((cpu_online(cpu) && !activate) || + (!cpu_online(cpu) && activate)) + s += sprintf(s, "%d,", cpu); + } + if (s > page) { /* Something to show */ + s--; /* Suppress last comma */ + s += sprintf(s, "\n"); + } + + free_page((unsigned long) buf_page); + return s-page; +} + +static ssize_t get_best_energy_data(struct sys_device *dev, + char *page, int activate) +{ + int rc; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + unsigned long flags = 0; + + flags = FLAGS_MODE2; + if (activate) + flags |= FLAGS_ACTIVATE; + + rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags, + cpu_to_drc_index(dev->id), + 0, 0, 0, 0, 0, 0, 0); + + if (rc != H_SUCCESS) + return -EINVAL; + + return sprintf(page, "%lu\n", retbuf[1] >> 32); +} + +/* Wrapper functions */ + +static ssize_t cpu_activate_hint_list_show(struct sysdev_class *class, + char *page) +{ + return get_best_energy_list(page, 1); +} + +static ssize_t cpu_deactivate_hint_list_show(struct sysdev_class *class, + char *page) +{ + return get_best_energy_list(page, 0); +} + +static ssize_t percpu_activate_hint_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *page) +{ + return get_best_energy_data(dev, page, 1); +} + +static ssize_t percpu_deactivate_hint_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *page) +{ + return get_best_energy_data(dev, page, 0); +} + +/* + * Create sysfs interface: + * /sys/devices/system/cpu/pseries_activate_hint_list + * /sys/devices/system/cpu/pseries_deactivate_hint_list + * Comma separated list of cpus to activate or deactivate + * /sys/devices/system/cpu/cpuN/pseries_activate_hint + * /sys/devices/system/cpu/cpuN/pseries_deactivate_hint + * Per-cpu value of the hint + */ + +struct sysdev_class_attribute attr_cpu_activate_hint_list = + _SYSDEV_CLASS_ATTR(pseries_activate_hint_list, 0444, + cpu_activate_hint_list_show, NULL); + +struct sysdev_class_attribute attr_cpu_deactivate_hint_list = + _SYSDEV_CLASS_ATTR(pseries_deactivate_hint_list, 0444, + cpu_deactivate_hint_list_show, NULL); + +struct sysdev_attribute attr_percpu_activate_hint = + _SYSDEV_ATTR(pseries_activate_hint, 0444, + percpu_activate_hint_show, NULL); + +struct sysdev_attribute attr_percpu_deactivate_hint = + _SYSDEV_ATTR(pseries_deactivate_hint, 0444, + percpu_deactivate_hint_show, NULL); + +static int __init pseries_energy_init(void) +{ + int cpu, err; + struct sys_device *cpu_sys_dev; + + if (!check_for_h_best_energy()) { + printk(KERN_INFO "Hypercall H_BEST_ENERGY not supported\n"); + return 0; + } + /* Create the sysfs files */ + err = sysfs_create_file(&cpu_sysdev_class.kset.kobj, + &attr_cpu_activate_hint_list.attr); + if (!err) + err = sysfs_create_file(&cpu_sysdev_class.kset.kobj, + &attr_cpu_deactivate_hint_list.attr); + + if (err) + return err; + + get_threads_per_core(); + + for_each_possible_cpu(cpu) { + cpu_sys_dev = get_cpu_sysdev(cpu); + err = sysfs_create_file(&cpu_sys_dev->kobj, + &attr_percpu_activate_hint.attr); + if (err) + break; + err = sysfs_create_file(&cpu_sys_dev->kobj, + &attr_percpu_deactivate_hint.attr); + if (err) + break; + } + + if (err) + return err; + + sysfs_entries = 1; /* Removed entries on cleanup */ + return 0; + +} + +static void __exit pseries_energy_cleanup(void) +{ + int cpu; + struct sys_device *cpu_sys_dev; + + if (!sysfs_entries) + return; + + /* Remove the sysfs files */ + sysfs_remove_file(&cpu_sysdev_class.kset.kobj, + &attr_cpu_activate_hint_list.attr); + + sysfs_remove_file(&cpu_sysdev_class.kset.kobj, + &attr_cpu_deactivate_hint_list.attr); + + for_each_possible_cpu(cpu) { + cpu_sys_dev = get_cpu_sysdev(cpu); + sysfs_remove_file(&cpu_sys_dev->kobj, + &attr_percpu_activate_hint.attr); + sysfs_remove_file(&cpu_sys_dev->kobj, + &attr_percpu_deactivate_hint.attr); + } +} + +module_init(pseries_energy_init); +module_exit(pseries_energy_cleanup); +MODULE_DESCRIPTION("Driver for pSeries platform energy management"); +MODULE_AUTHOR("Vaidyanathan Srinivasan"); +MODULE_LICENSE("GPL"); diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/ras.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/ras.c --- linux-2.6.32/arch/powerpc/platforms/pseries/ras.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/ras.c 2015-03-10 13:24:05.000000000 -0700 @@ -36,6 +36,8 @@ #include #include #include +#include +#include #include #include @@ -152,26 +154,127 @@ static int __init init_ras_IRQ(void) return 0; } -__initcall(init_ras_IRQ); +subsys_initcall(init_ras_IRQ); -/* - * Handle power subsystem events (EPOW). - * - * Presently we just log the event has occurred. This should be fixed - * to examine the type of power failure and take appropriate action where - * the time horizon permits something useful to be done. - */ +#define EPOW_SHUTDOWN_NORMAL 1 +#define EPOW_SHUTDOWN_ON_UPS 2 +#define EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS 3 +#define EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH 4 + +static void handle_system_shutdown(char event_modifier) +{ + switch (event_modifier) { + case EPOW_SHUTDOWN_NORMAL: + pr_emerg("Firmware initiated power off"); + orderly_poweroff(1); + break; + + case EPOW_SHUTDOWN_ON_UPS: + pr_emerg("Loss of power reported by firmware, system is " + "running on UPS/battery"); + break; + + case EPOW_SHUTDOWN_LOSS_OF_CRITICAL_FUNCTIONS: + pr_emerg("Loss of system critical functions reported by " + "firmware"); + pr_emerg("Check RTAS error log for details"); + orderly_poweroff(1); + break; + + case EPOW_SHUTDOWN_AMBIENT_TEMPERATURE_TOO_HIGH: + pr_emerg("Ambient temperature too high reported by firmware"); + pr_emerg("Check RTAS error log for details"); + orderly_poweroff(1); + break; + + default: + pr_err("Unknown power/cooling shutdown event (modifier %d)", + event_modifier); + } +} + +struct epow_errorlog { + unsigned char sensor_value; + unsigned char event_modifier; + unsigned char extended_modifier; + unsigned char reserved; + unsigned char platform_reason; +}; + +#define EPOW_RESET 0 +#define EPOW_WARN_COOLING 1 +#define EPOW_WARN_POWER 2 +#define EPOW_SYSTEM_SHUTDOWN 3 +#define EPOW_SYSTEM_HALT 4 +#define EPOW_MAIN_ENCLOSURE 5 +#define EPOW_POWER_OFF 7 + +void rtas_parse_epow_errlog(struct rtas_error_log *log) +{ + struct pseries_errorlog *pseries_log; + struct epow_errorlog *epow_log; + char action_code; + char modifier; + + pseries_log = get_pseries_errorlog(log, PSERIES_ELOG_SECT_ID_EPOW); + if (pseries_log == NULL) + return; + + epow_log = (struct epow_errorlog *)pseries_log->data; + action_code = epow_log->sensor_value & 0xF; /* bottom 4 bits */ + modifier = epow_log->event_modifier & 0xF; /* bottom 4 bits */ + + switch (action_code) { + case EPOW_RESET: + pr_err("Non critical power or cooling issue cleared"); + break; + + case EPOW_WARN_COOLING: + pr_err("Non critical cooling issue reported by firmware"); + pr_err("Check RTAS error log for details"); + break; + + case EPOW_WARN_POWER: + pr_err("Non critical power issue reported by firmware"); + pr_err("Check RTAS error log for details"); + break; + + case EPOW_SYSTEM_SHUTDOWN: + handle_system_shutdown(epow_log->event_modifier); + break; + + case EPOW_SYSTEM_HALT: + pr_emerg("Firmware initiated power off"); + orderly_poweroff(1); + break; + + case EPOW_MAIN_ENCLOSURE: + case EPOW_POWER_OFF: + pr_emerg("Critical power/cooling issue reported by firmware"); + pr_emerg("Check RTAS error log for details"); + pr_emerg("Immediate power off"); + emergency_sync(); + kernel_power_off(); + break; + + default: + pr_err("Unknown power/cooling event (action code %d)", + action_code); + } +} + +/* Handle environmental and power warning (EPOW) interrupts. */ static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) { - int status = 0xdeadbeef; - int state = 0; + int status; + int state; int critical; status = rtas_call(ras_get_sensor_state_token, 2, 2, &state, EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX); if (state > 3) - critical = 1; /* Time Critical */ + critical = 1; /* Time Critical */ else critical = 0; @@ -184,14 +287,10 @@ static irqreturn_t ras_epow_interrupt(in critical, __pa(&ras_log_buf), rtas_get_error_log_max()); - udbg_printf("EPOW <0x%lx 0x%x 0x%x>\n", - *((unsigned long *)&ras_log_buf), status, state); - printk(KERN_WARNING "EPOW <0x%lx 0x%x 0x%x>\n", - *((unsigned long *)&ras_log_buf), status, state); - - /* format and print the extended information */ log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); + rtas_parse_epow_errlog((struct rtas_error_log *)ras_log_buf); + spin_unlock(&ras_log_buf_lock); return IRQ_HANDLED; } diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/reconfig.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/reconfig.c --- linux-2.6.32/arch/powerpc/platforms/pseries/reconfig.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/reconfig.c 2015-03-10 13:24:13.000000000 -0700 @@ -22,7 +22,7 @@ #include #include - +#include "../../kernel/cacheinfo.h" /* * Routines for "runtime" addition and removal of device tree nodes. @@ -96,17 +96,19 @@ static struct device_node *derive_parent return parent; } -static BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain); +BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain); int pSeries_reconfig_notifier_register(struct notifier_block *nb) { return blocking_notifier_chain_register(&pSeries_reconfig_chain, nb); } +EXPORT_SYMBOL_GPL(pSeries_reconfig_notifier_register); void pSeries_reconfig_notifier_unregister(struct notifier_block *nb) { blocking_notifier_chain_unregister(&pSeries_reconfig_chain, nb); } +EXPORT_SYMBOL_GPL(pSeries_reconfig_notifier_unregister); static int pSeries_reconfig_add_node(const char *path, struct property *proplist) { @@ -422,6 +424,7 @@ static int do_remove_property(char *buf, static int do_update_property(char *buf, size_t bufsize) { struct device_node *np; + struct pSeries_reconfig_prop_update upd_value; unsigned char *value; char *name, *end, *next_prop; int rc, length; @@ -450,6 +453,11 @@ static int do_update_property(char *buf, return -ENODEV; } + upd_value.node = np; + upd_value.property = newprop; + blocking_notifier_call_chain(&pSeries_reconfig_chain, + PSERIES_UPDATE_PROPERTY, &upd_value); + rc = prom_update_property(np, newprop, oldprop); if (rc) return rc; @@ -484,6 +492,28 @@ static int do_update_property(char *buf, return 0; } +static int pSeries_reconfig_add_cache_list(void) +{ + int cpu; + + for_each_online_cpu(cpu) + cacheinfo_cpu_online(cpu); + + return 0; +} + +static int pSeries_reconfig_delete_cache_list(void) +{ + int cpu; + + for_each_online_cpu(cpu) + cacheinfo_cpu_offline(cpu); + + return 0; + +} + + /** * ofdt_write - perform operations on the Open Firmware device tree * @@ -532,6 +562,10 @@ static ssize_t ofdt_write(struct file *f rv = do_remove_property(tmp, count - (tmp - kbuf)); else if (!strcmp(kbuf, "update_property")) rv = do_update_property(tmp, count - (tmp - kbuf)); + else if (!strcmp(kbuf, "start_update")) + rv = pSeries_reconfig_delete_cache_list(); + else if (!strcmp(kbuf, "end_update")) + rv = pSeries_reconfig_add_cache_list(); else rv = -EINVAL; out: diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/rtasd.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/rtasd.c --- linux-2.6.32/arch/powerpc/platforms/pseries/rtasd.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/rtasd.c 2015-03-10 13:22:59.000000000 -0700 @@ -453,6 +453,13 @@ static void start_event_scan(void) event_scan_delay); } +/* Cancel the rtas event scan work */ +void rtas_cancel_event_scan(void) +{ + cancel_delayed_work_sync(&event_scan_work); +} +EXPORT_SYMBOL_GPL(rtas_cancel_event_scan); + static int __init rtas_init(void) { struct proc_dir_entry *entry; diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/setup.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/setup.c --- linux-2.6.32/arch/powerpc/platforms/pseries/setup.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/setup.c 2015-03-10 13:23:53.000000000 -0700 @@ -274,8 +274,71 @@ static struct notifier_block pci_dn_reco .notifier_call = pci_dn_reconfig_notifier, }; +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +/* + * Allocate space for the dispatch trace log for all possible cpus + * and register the buffers with the hypervisor. This is used for + * computing time stolen by the hypervisor. + */ +static int alloc_dispatch_logs(void) +{ + int cpu, ret; + struct paca_struct *pp; + struct dtl_entry *dtl; + struct kmem_cache *dtl_cache; + + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) + return 0; + + dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES, + DISPATCH_LOG_BYTES, 0, NULL); + if (!dtl_cache) { + pr_warn("Failed to create dispatch trace log buffer cache\n"); + pr_warn("Stolen time statistics will be unreliable\n"); + return 0; + } + + for_each_possible_cpu(cpu) { + pp = &paca[cpu]; + dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL); + if (!dtl) { + pr_warning("Failed to allocate dispatch trace log for cpu %d\n", + cpu); + pr_warning("Stolen time statistics will be unreliable\n"); + break; + } + + pp->dtl_ridx = 0; + pp->dispatch_log = dtl; + pp->dispatch_log_end = dtl + N_DISPATCH_LOG; + pp->dtl_curr = dtl; + } + + /* Register the DTL for the current (boot) cpu */ + dtl = get_paca()->dispatch_log; + get_paca()->dtl_ridx = 0; + get_paca()->dtl_curr = dtl; + get_paca()->lppaca_ptr->dtl_idx = 0; + + /* hypervisor reads buffer length from this field */ + dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES; + ret = register_dtl(hard_smp_processor_id(), __pa(dtl)); + if (ret) + pr_warning("DTL registration failed for boot cpu %d (%d)\n", + smp_processor_id(), ret); + get_paca()->lppaca_ptr->dtl_enable_mask = 2; + + return 0; +} + +early_initcall(alloc_dispatch_logs); +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + static void __init pSeries_setup_arch(void) { + if (!strncmp(cur_cpu_spec->platform, "power5", 6)) + mark_hardware_unsupported("Power5 Processor"); + /* Discover PIC type and setup ppc_md accordingly */ pseries_discover_pic(); @@ -288,6 +351,9 @@ static void __init pSeries_setup_arch(vo fwnmi_init(); + /* By default, only probe PCI (can be overriden by rtas_pci) */ + pci_add_flags(PCI_PROBE_ONLY); + /* Find and initialize PCI host bridges */ init_pci_config_tokens(); find_and_init_phbs(); @@ -341,6 +407,16 @@ static int pseries_set_xdabr(unsigned lo #define CMO_CHARACTERISTICS_TOKEN 44 #define CMO_MAXLENGTH 1026 +void pSeries_coalesce_init(void) +{ + struct hvcall_mpp_x_data mpp_x_data; + + if (firmware_has_feature(FW_FEATURE_CMO) && !h_get_mpp_x(&mpp_x_data)) + powerpc_firmware_features |= FW_FEATURE_XCMO; + else + powerpc_firmware_features &= ~FW_FEATURE_XCMO; +} + /** * fw_cmo_feature_init - FW_FEATURE_CMO is not stored in ibm,hypertas-functions, * handle that here. (Stolen from parse_system_parameter_string) @@ -410,6 +486,7 @@ void pSeries_cmo_feature_init(void) pr_debug("CMO enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP, CMO_SecPSP); powerpc_firmware_features |= FW_FEATURE_CMO; + pSeries_coalesce_init(); } else pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP, CMO_SecPSP); diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/smp.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/smp.c --- linux-2.6.32/arch/powerpc/platforms/pseries/smp.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/smp.c 2015-03-10 13:21:50.000000000 -0700 @@ -48,6 +48,7 @@ #include "plpar_wrappers.h" #include "pseries.h" #include "xics.h" +#include "offline_states.h" /* @@ -56,6 +57,28 @@ */ static cpumask_t of_spin_map; +/* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */ +int smp_query_cpu_stopped(unsigned int pcpu) +{ + int cpu_status, status; + int qcss_tok = rtas_token("query-cpu-stopped-state"); + + if (qcss_tok == RTAS_UNKNOWN_SERVICE) { + printk(KERN_INFO "Firmware doesn't support " + "query-cpu-stopped-state\n"); + return QCSS_HARDWARE_ERROR; + } + + status = rtas_call(qcss_tok, 1, 2, &cpu_status, pcpu); + if (status != 0) { + printk(KERN_ERR + "RTAS query-cpu-stopped-state failed: %i\n", status); + return status; + } + + return cpu_status; +} + /** * smp_startup_cpu() - start the given cpu * @@ -81,9 +104,18 @@ static inline int __devinit smp_startup_ pcpu = get_hard_smp_processor_id(lcpu); + /* Check to see if the CPU out of FW already for kexec */ + if (smp_query_cpu_stopped(pcpu) == QCSS_NOT_STOPPED){ + cpu_set(lcpu, of_spin_map); + return 1; + } + /* Fixup atomic count: it exited inside IRQ handler. */ task_thread_info(paca[lcpu].__current)->preempt_count = 0; + if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE) + goto out; + /* * If the RTAS start-cpu token does not exist then presume the * cpu is already spinning. @@ -98,6 +130,7 @@ static inline int __devinit smp_startup_ return 0; } +out: return 1; } @@ -111,12 +144,16 @@ static void __devinit smp_xics_setup_cpu vpa_init(cpu); cpu_clear(cpu, of_spin_map); + set_cpu_current_state(cpu, CPU_STATE_ONLINE); + set_default_offline_state(cpu); } #endif /* CONFIG_XICS */ static void __devinit smp_pSeries_kick_cpu(int nr) { + long rc; + unsigned long hcpuid; BUG_ON(nr < 0 || nr >= NR_CPUS); if (!smp_startup_cpu(nr)) @@ -128,6 +165,16 @@ static void __devinit smp_pSeries_kick_c * the processor will continue on to secondary_start */ paca[nr].cpu_start = 1; + + set_preferred_offline_state(nr, CPU_STATE_ONLINE); + + if (get_cpu_current_state(nr) == CPU_STATE_INACTIVE) { + hcpuid = get_hard_smp_processor_id(nr); + rc = plpar_hcall_norets(H_PROD, hcpuid); + if (rc != H_SUCCESS) + printk(KERN_ERR "Error: Prod to wake up processor %d\ + Ret= %ld\n", nr, rc); + } } static int smp_pSeries_cpu_bootable(unsigned int nr) @@ -135,10 +182,13 @@ static int smp_pSeries_cpu_bootable(unsi /* Special case - we inhibit secondary thread startup * during boot if the user requests it. */ - if (system_state < SYSTEM_RUNNING && - cpu_has_feature(CPU_FTR_SMT) && - !smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) - return 0; + if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) { + if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) + return 0; + if (smt_enabled_at_boot + && cpu_thread_in_core(nr) >= smt_enabled_at_boot) + return 0; + } return 1; } diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/suspend.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/suspend.c --- linux-2.6.32/arch/powerpc/platforms/pseries/suspend.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/suspend.c 2015-03-10 13:24:04.000000000 -0700 @@ -0,0 +1,246 @@ +/* + * Copyright (C) 2010 Brian King IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static u64 stream_id; +static struct sys_device suspend_sysdev; +static DECLARE_COMPLETION(suspend_work); +static struct rtas_suspend_me_data suspend_data; +static int suspending; + +/** + * pseries_suspend_begin - First phase of hibernation + * + * Check to ensure we are in a valid state to hibernate + * + * Return value: + * 0 on success / other on failure + **/ +static int pseries_suspend_begin(suspend_state_t state) +{ + long vasi_state, rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + /* Make sure the state is valid */ + rc = plpar_hcall(H_VASI_STATE, retbuf, stream_id); + + vasi_state = retbuf[0]; + + if (rc) { + pr_err("pseries_suspend_begin: vasi_state returned %ld\n",rc); + return rc; + } else if (vasi_state == H_VASI_ENABLED) { + return -EAGAIN; + } else if (vasi_state != H_VASI_SUSPENDING) { + pr_err("pseries_suspend_begin: vasi_state returned state %ld\n", + vasi_state); + return -EIO; + } + + return 0; +} + +/** + * pseries_suspend_cpu - Suspend a single CPU + * + * Makes the H_JOIN call to suspend the CPU + * + **/ +int pseries_suspend_cpu(void) +{ + if (suspending) + return rtas_suspend_cpu(&suspend_data); + return 0; +} + +/** + * pseries_suspend_enter - Final phase of hibernation + * + * Return value: + * 0 on success / other on failure + **/ +static int pseries_suspend_enter(suspend_state_t state) +{ + int rc = rtas_suspend_last_cpu(&suspend_data); + + smp_wmb(); /* Ensure done is ordered wrt suspend_data.error */ + suspending = 0; + suspend_data.done = 1; + + /* Ensure suspending and suspend_data.done is seen on all CPUs, + since they will be waking up shortly */ + mb(); + return rc; +} + +/** + * pseries_prepare_late - Prepare to suspend all other CPUs + * + * Return value: + * 0 on success / other on failure + **/ +static int pseries_prepare_late(void) +{ + suspending = 1; + atomic_set(&suspend_data.working, 0); + suspend_data.done = 0; + suspend_data.error = 0; + suspend_data.complete = &suspend_work; + INIT_COMPLETION(suspend_work); + + /* Ensure suspending and suspend_data.done is seen on all CPUs */ + mb(); + return 0; +} + +/** + * store_hibernate - Initiate partition hibernation + * @classdev: sysdev class struct + * @attr: class device attribute struct + * @buf: buffer + * @count: buffer size + * + * Write the stream ID received from the HMC to this file + * to trigger hibernating the partition + * + * Return value: + * number of bytes printed to buffer / other on failure + **/ +static ssize_t store_hibernate(struct sysdev_class *classdev, + const char *buf, size_t count) +{ + cpumask_var_t offline_mask; + int rc; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!alloc_cpumask_var(&offline_mask, GFP_TEMPORARY)) + return -ENOMEM; + + stream_id = simple_strtoul(buf, NULL, 16); + + do { + rc = pseries_suspend_begin(PM_SUSPEND_MEM); + if (rc == -EAGAIN) + ssleep(1); + } while (rc == -EAGAIN); + + if (!rc) { + /* All present CPUs must be online */ + cpumask_andnot(offline_mask, cpu_present_mask, + cpu_online_mask); + rc = rtas_online_cpus_mask(offline_mask); + if (rc) { + pr_err("%s: Could not bring present CPUs online.\n", + __func__); + goto out; + } + + stop_topology_update(); + rc = pm_suspend(PM_SUSPEND_MEM); + start_topology_update(); + + /* Take down CPUs not online prior to suspend */ + if (!rtas_offline_cpus_mask(offline_mask)) + pr_warn("%s: Could not restore CPUs to offline " + "state.\n", __func__); + } + + stream_id = 0; + + if (!rc) + rc = count; +out: + free_cpumask_var(offline_mask); + return rc; +} + +static SYSDEV_CLASS_ATTR(hibernate, S_IWUSR, NULL, store_hibernate); + +static struct sysdev_class suspend_sysdev_class = { + .name = "power", +}; + +static struct platform_suspend_ops pseries_suspend_ops = { + .valid = suspend_valid_only_mem, + .begin = pseries_suspend_begin, + .prepare_late = pseries_prepare_late, + .enter = pseries_suspend_enter, +}; + +/** + * pseries_suspend_sysfs_register - Register with sysfs + * + * Return value: + * 0 on success / other on failure + **/ +static int pseries_suspend_sysfs_register(struct sys_device *sysdev) +{ + int rc; + + if ((rc = sysdev_class_register(&suspend_sysdev_class))) + return rc; + + sysdev->id = 0; + sysdev->cls = &suspend_sysdev_class; + + if ((rc = sysdev_class_create_file(&suspend_sysdev_class, &attr_hibernate))) + goto class_unregister; + + return 0; + +class_unregister: + sysdev_class_unregister(&suspend_sysdev_class); + return rc; +} + +/** + * pseries_suspend_init - initcall for pSeries suspend + * + * Return value: + * 0 on success / other on failure + **/ +static int __init pseries_suspend_init(void) +{ + int rc; + + if (!machine_is(pseries) || !firmware_has_feature(FW_FEATURE_LPAR)) + return 0; + + suspend_data.token = rtas_token("ibm,suspend-me"); + if (suspend_data.token == RTAS_UNKNOWN_SERVICE) + return 0; + + if ((rc = pseries_suspend_sysfs_register(&suspend_sysdev))) + return rc; + + suspend_set_ops(&pseries_suspend_ops); + return 0; +} + +__initcall(pseries_suspend_init); diff -uprN linux-2.6.32/arch/powerpc/platforms/pseries/xics.c linux-2.6.32.ovz/arch/powerpc/platforms/pseries/xics.c --- linux-2.6.32/arch/powerpc/platforms/pseries/xics.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/platforms/pseries/xics.c 2015-03-10 13:24:14.000000000 -0700 @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -46,6 +47,12 @@ static struct irq_host *xics_host; */ #define IPI_PRIORITY 4 +/* The least favored priority */ +#define LOWEST_PRIORITY 0xFF + +/* The number of priorities defined above */ +#define MAX_NUM_PRIORITIES 3 + static unsigned int default_server = 0xFF; static unsigned int default_distrib_server = 0; static unsigned int interrupt_server_size = 8; @@ -56,6 +63,12 @@ static int ibm_set_xive; static int ibm_int_on; static int ibm_int_off; +struct xics_cppr { + unsigned char stack[MAX_NUM_PRIORITIES]; + int index; +}; + +static DEFINE_PER_CPU(struct xics_cppr, xics_cppr); /* Direct hardware low level accessors */ @@ -150,18 +163,17 @@ static inline void lpar_qirr_info(int n_ /* Interface to generic irq subsystem */ #ifdef CONFIG_SMP -static int get_irq_server(unsigned int virq, unsigned int strict_check) +static int get_irq_server(unsigned int virq, cpumask_t cpumask, + unsigned int strict_check) { int server; /* For the moment only implement delivery to all cpus or one cpu */ - cpumask_t cpumask; cpumask_t tmp = CPU_MASK_NONE; - cpumask_copy(&cpumask, irq_desc[virq].affinity); if (!distribute_irqs) return default_server; - if (!cpus_equal(cpumask, CPU_MASK_ALL)) { + if (!cpus_subset(cpu_possible_map, cpumask)) { cpus_and(tmp, cpu_online_map, cpumask); server = first_cpu(tmp); @@ -179,7 +191,8 @@ static int get_irq_server(unsigned int v return default_server; } #else -static int get_irq_server(unsigned int virq, unsigned int strict_check) +static int get_irq_server(unsigned int virq, cpumask_t cpumask, + unsigned int strict_check) { return default_server; } @@ -198,7 +211,7 @@ static void xics_unmask_irq(unsigned int if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS) return; - server = get_irq_server(virq, 0); + server = get_irq_server(virq, *(irq_to_desc(virq)->affinity), 0); call_status = rtas_call(ibm_set_xive, 3, 1, NULL, irq, server, DEFAULT_PRIORITY); @@ -284,6 +297,19 @@ static inline unsigned int xics_xirr_vec return xirr & 0x00ffffff; } +static void push_cppr(unsigned int vec) +{ + struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr); + + if (WARN_ON(os_cppr->index >= MAX_NUM_PRIORITIES - 1)) + return; + + if (vec == XICS_IPI) + os_cppr->stack[++os_cppr->index] = IPI_PRIORITY; + else + os_cppr->stack[++os_cppr->index] = DEFAULT_PRIORITY; +} + static unsigned int xics_get_irq_direct(void) { unsigned int xirr = direct_xirr_info_get(); @@ -294,8 +320,10 @@ static unsigned int xics_get_irq_direct( return NO_IRQ; irq = irq_radix_revmap_lookup(xics_host, vec); - if (likely(irq != NO_IRQ)) + if (likely(irq != NO_IRQ)) { + push_cppr(vec); return irq; + } /* We don't have a linux mapping, so have rtas mask it. */ xics_mask_unknown_vec(vec); @@ -315,8 +343,10 @@ static unsigned int xics_get_irq_lpar(vo return NO_IRQ; irq = irq_radix_revmap_lookup(xics_host, vec); - if (likely(irq != NO_IRQ)) + if (likely(irq != NO_IRQ)) { + push_cppr(vec); return irq; + } /* We don't have a linux mapping, so have RTAS mask it. */ xics_mask_unknown_vec(vec); @@ -326,12 +356,22 @@ static unsigned int xics_get_irq_lpar(vo return NO_IRQ; } +static unsigned char pop_cppr(void) +{ + struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr); + + if (WARN_ON(os_cppr->index < 1)) + return LOWEST_PRIORITY; + + return os_cppr->stack[--os_cppr->index]; +} + static void xics_eoi_direct(unsigned int virq) { unsigned int irq = (unsigned int)irq_map[virq].hwirq; iosync(); - direct_xirr_info_set((0xff << 24) | irq); + direct_xirr_info_set((pop_cppr() << 24) | irq); } static void xics_eoi_lpar(unsigned int virq) @@ -339,7 +379,7 @@ static void xics_eoi_lpar(unsigned int v unsigned int irq = (unsigned int)irq_map[virq].hwirq; iosync(); - lpar_xirr_info_set((0xff << 24) | irq); + lpar_xirr_info_set((pop_cppr() << 24) | irq); } static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask) @@ -365,7 +405,7 @@ static int xics_set_affinity(unsigned in * For the moment only implement delivery to all cpus or one cpu. * Get current irq_server for the given irq */ - irq_server = get_irq_server(virq, 1); + irq_server = get_irq_server(virq, *cpumask, 1); if (irq_server == -1) { char cpulist[128]; cpumask_scnprintf(cpulist, sizeof(cpulist), cpumask); @@ -508,8 +548,6 @@ void smp_xics_message_pass(int target, i static irqreturn_t xics_ipi_dispatch(int cpu) { - WARN_ON(cpu_is_offline(cpu)); - mb(); /* order mmio clearing qirr */ while (xics_ipi_message[cpu].value) { if (test_and_clear_bit(PPC_MSG_CALL_FUNCTION, @@ -746,6 +784,16 @@ void __init xics_init_IRQ(void) static void xics_set_cpu_priority(unsigned char cppr) { + struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr); + + /* + * we only really want to set the priority when there's + * just one cppr value on the stack + */ + WARN_ON(os_cppr->index != 0); + + os_cppr->stack[0] = cppr; + if (firmware_has_feature(FW_FEATURE_LPAR)) lpar_cppr_info(cppr); else @@ -772,15 +820,21 @@ static void xics_set_cpu_giq(unsigned in void xics_setup_cpu(void) { - xics_set_cpu_priority(0xff); + xics_set_cpu_priority(LOWEST_PRIORITY); xics_set_cpu_giq(default_distrib_server, 1); } void xics_teardown_cpu(void) { + struct xics_cppr *os_cppr = &__get_cpu_var(xics_cppr); int cpu = smp_processor_id(); + /* + * we have to reset the cppr index to 0 because we're + * not going to return from the IPI + */ + os_cppr->index = 0; xics_set_cpu_priority(0); /* Clear any pending IPI request */ diff -uprN linux-2.6.32/arch/powerpc/sysdev/dart_iommu.c linux-2.6.32.ovz/arch/powerpc/sysdev/dart_iommu.c --- linux-2.6.32/arch/powerpc/sysdev/dart_iommu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/sysdev/dart_iommu.c 2015-03-10 13:23:58.000000000 -0700 @@ -72,11 +72,16 @@ static int dart_is_u4; #define DBG(...) +static DEFINE_SPINLOCK(invalidate_lock); + static inline void dart_tlb_invalidate_all(void) { unsigned long l = 0; unsigned int reg, inv_bit; unsigned long limit; + unsigned long flags; + + spin_lock_irqsave(&invalidate_lock, flags); DBG("dart: flush\n"); @@ -109,12 +114,17 @@ retry: panic("DART: TLB did not flush after waiting a long " "time. Buggy U3 ?"); } + + spin_unlock_irqrestore(&invalidate_lock, flags); } static inline void dart_tlb_invalidate_one(unsigned long bus_rpn) { unsigned int reg; unsigned int l, limit; + unsigned long flags; + + spin_lock_irqsave(&invalidate_lock, flags); reg = DART_CNTL_U4_ENABLE | DART_CNTL_U4_IONE | (bus_rpn & DART_CNTL_U4_IONE_MASK); @@ -136,6 +146,8 @@ wait_more: panic("DART: TLB did not flush after waiting a long " "time. Buggy U4 ?"); } + + spin_unlock_irqrestore(&invalidate_lock, flags); } static void dart_flush(struct iommu_table *tbl) diff -uprN linux-2.6.32/arch/powerpc/sysdev/fsl_pci.c linux-2.6.32.ovz/arch/powerpc/sysdev/fsl_pci.c --- linux-2.6.32/arch/powerpc/sysdev/fsl_pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/sysdev/fsl_pci.c 2015-03-10 13:23:53.000000000 -0700 @@ -330,7 +330,7 @@ int __init fsl_add_bridge(struct device_ printk(KERN_WARNING "Can't get bus-range for %s, assume" " bus 0\n", dev->full_name); - ppc_pci_add_flags(PPC_PCI_REASSIGN_ALL_BUS); + pci_add_flags(PCI_REASSIGN_ALL_BUS); hose = pcibios_alloc_controller(dev); if (!hose) return -ENOMEM; @@ -392,8 +392,22 @@ DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEV DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8641, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8641D, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_MPC8610, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1011E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1011, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1013E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1013, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1020E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1020, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1022E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P1022, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P2010E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P2010, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P2020E, quirk_fsl_pcie_header); DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P2020, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P4040E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P4040, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P4080E, quirk_fsl_pcie_header); +DECLARE_PCI_FIXUP_HEADER(0x1957, PCI_DEVICE_ID_P4080, quirk_fsl_pcie_header); #endif /* CONFIG_PPC_85xx || CONFIG_PPC_86xx */ #if defined(CONFIG_PPC_83xx) || defined(CONFIG_PPC_MPC512x) @@ -626,7 +640,7 @@ int __init mpc83xx_add_bridge(struct dev " bus 0\n", dev->full_name); } - ppc_pci_add_flags(PPC_PCI_REASSIGN_ALL_BUS); + pci_add_flags(PCI_REASSIGN_ALL_BUS); hose = pcibios_alloc_controller(dev); if (!hose) return -ENOMEM; diff -uprN linux-2.6.32/arch/powerpc/sysdev/grackle.c linux-2.6.32.ovz/arch/powerpc/sysdev/grackle.c --- linux-2.6.32/arch/powerpc/sysdev/grackle.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/sysdev/grackle.c 2015-03-10 13:23:53.000000000 -0700 @@ -57,7 +57,7 @@ void __init setup_grackle(struct pci_con { setup_indirect_pci(hose, 0xfec00000, 0xfee00000, 0); if (machine_is_compatible("PowerMac1,1")) - ppc_pci_add_flags(PPC_PCI_REASSIGN_ALL_BUS); + pci_add_flags(PCI_REASSIGN_ALL_BUS); if (machine_is_compatible("AAPL,PowerBook1998")) grackle_set_loop_snoop(hose, 1); #if 0 /* Disabled for now, HW problems ??? */ diff -uprN linux-2.6.32/arch/powerpc/sysdev/mpic.c linux-2.6.32.ovz/arch/powerpc/sysdev/mpic.c --- linux-2.6.32/arch/powerpc/sysdev/mpic.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/sysdev/mpic.c 2015-03-10 13:23:03.000000000 -0700 @@ -567,13 +567,11 @@ static void __init mpic_scan_ht_pics(str #endif /* CONFIG_MPIC_U3_HT_IRQS */ #ifdef CONFIG_SMP -static int irq_choose_cpu(unsigned int virt_irq) +static int irq_choose_cpu(const cpumask_t *mask) { - cpumask_t mask; int cpuid; - cpumask_copy(&mask, irq_desc[virt_irq].affinity); - if (cpus_equal(mask, CPU_MASK_ALL)) { + if (cpumask_equal(mask, cpu_all_mask)) { static int irq_rover; static DEFINE_SPINLOCK(irq_rover_lock); unsigned long flags; @@ -594,20 +592,15 @@ static int irq_choose_cpu(unsigned int v spin_unlock_irqrestore(&irq_rover_lock, flags); } else { - cpumask_t tmp; - - cpus_and(tmp, cpu_online_map, mask); - - if (cpus_empty(tmp)) + cpuid = cpumask_first_and(mask, cpu_online_mask); + if (cpuid >= nr_cpu_ids) goto do_round_robin; - - cpuid = first_cpu(tmp); } return get_hard_smp_processor_id(cpuid); } #else -static int irq_choose_cpu(unsigned int virt_irq) +static int irq_choose_cpu(const cpumask_t *mask) { return hard_smp_processor_id(); } @@ -816,7 +809,7 @@ int mpic_set_affinity(unsigned int irq, unsigned int src = mpic_irq_to_hw(irq); if (mpic->flags & MPIC_SINGLE_DEST_CPU) { - int cpuid = irq_choose_cpu(irq); + int cpuid = irq_choose_cpu(cpumask); mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION), 1 << cpuid); } else { diff -uprN linux-2.6.32/arch/powerpc/sysdev/mv64x60_pci.c linux-2.6.32.ovz/arch/powerpc/sysdev/mv64x60_pci.c --- linux-2.6.32/arch/powerpc/sysdev/mv64x60_pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/sysdev/mv64x60_pci.c 2015-03-10 13:21:32.000000000 -0700 @@ -24,7 +24,7 @@ #define MV64X60_VAL_LEN_MAX 11 #define MV64X60_PCICFG_CPCI_HOTSWAP 0x68 -static ssize_t mv64x60_hs_reg_read(struct kobject *kobj, +static ssize_t mv64x60_hs_reg_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { @@ -45,7 +45,7 @@ static ssize_t mv64x60_hs_reg_read(struc return sprintf(buf, "0x%08x\n", v); } -static ssize_t mv64x60_hs_reg_write(struct kobject *kobj, +static ssize_t mv64x60_hs_reg_write(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { diff -uprN linux-2.6.32/arch/powerpc/sysdev/ppc4xx_pci.c linux-2.6.32.ovz/arch/powerpc/sysdev/ppc4xx_pci.c --- linux-2.6.32/arch/powerpc/sysdev/ppc4xx_pci.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/sysdev/ppc4xx_pci.c 2015-03-10 13:23:53.000000000 -0700 @@ -1839,7 +1839,7 @@ static int __init ppc4xx_pci_find_bridge { struct device_node *np; - ppc_pci_flags |= PPC_PCI_ENABLE_PROC_DOMAINS | PPC_PCI_COMPAT_DOMAIN_0; + pci_add_flags(PCI_ENABLE_PROC_DOMAINS | PCI_COMPAT_DOMAIN_0); #ifdef CONFIG_PPC4xx_PCI_EXPRESS for_each_compatible_node(np, NULL, "ibm,plb-pciex") diff -uprN linux-2.6.32/arch/powerpc/xmon/xmon.c linux-2.6.32.ovz/arch/powerpc/xmon/xmon.c --- linux-2.6.32/arch/powerpc/xmon/xmon.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/powerpc/xmon/xmon.c 2015-03-10 13:22:29.000000000 -0700 @@ -818,7 +818,7 @@ cmds(struct pt_regs *excp) memzcan(); break; case 'i': - show_mem(); + show_mem(0); break; default: termch = cmd; @@ -1641,7 +1641,8 @@ static void super_regs(void) ptrLpPaca->saved_srr0, ptrLpPaca->saved_srr1); printf(" Saved Gpr3=%.16lx Saved Gpr4=%.16lx \n", ptrLpPaca->saved_gpr3, ptrLpPaca->saved_gpr4); - printf(" Saved Gpr5=%.16lx \n", ptrLpPaca->saved_gpr5); + printf(" Saved Gpr5=%.16lx \n", + ptrLpPaca->gpr5_dword.saved_gpr5); } #endif diff -uprN linux-2.6.32/arch/s390/boot/zfcpdump.c linux-2.6.32.ovz/arch/s390/boot/zfcpdump.c --- linux-2.6.32/arch/s390/boot/zfcpdump.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/boot/zfcpdump.c 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,1141 @@ +/* + * zfcpdump userspace tool + * + * This tool should be used in an intitramfs together with a kernel with + * enabled CONFIG_ZFCPDUMP kernel build option. The tool is able to write + * standalone system dumps on SCSI disks. + * + * See Documentation/s390/zfcpdump.txt for more information! + * + * Copyright IBM Corp. 2003, 2007. + * Author(s): Michael Holzheu + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zfcpdump.h" +#ifdef GZIP_SUPPORT +#include +#endif + +static struct globals g; +static char *module_list[] = {"zfcp", "sd_mod", "ext2", "ext3", "zcore_mod", + NULL}; + +/* + * parse one kernel parameter in the form keyword=value + */ +static int parse_parameter(char *parameter) +{ + char *token; + char *end_ptr; + + token = strtok(parameter, "="); + if (token == NULL) + return 0; + + if (strcmp(token, PARM_DIR) == 0) { + /* Dump Dir */ + g.parm_dir = strtok(NULL, "="); + if (g.parm_dir == NULL) { + PRINT_WARN("No value for '%s' parameter specified\n", + PARM_DIR); + PRINT_WARN("Using default: %s\n", PARM_DIR_DFLT); + g.parm_dir = PARM_DIR_DFLT; + } + } else if (strcmp(token, PARM_PART) == 0) { + /* Dump Partition */ + g.parm_part = strtok(NULL, "="); + if (g.parm_part == NULL) { + PRINT_ERR("No value for '%s' parameter " + "specified\n", PARM_PART); + return -1; + } + } else if (strcmp(token, PARM_MEM) == 0) { + /* Dump mem */ + char *mem_str = strtok(NULL, "="); + if (mem_str == NULL) { + PRINT_ERR("No value for '%s' parameter " + "specified\n", PARM_MEM); + return -1; + } + g.parm_mem = strtoll(mem_str, &end_ptr, 0); + if (*end_ptr != 0) { + PRINT_ERR("Invalid value for '%s' parameter " + "specified\n", PARM_MEM); + return -1; + } + } else if (strcmp(token, PARM_COMP) == 0) { + /* Dump Compression */ + g.parm_compress = strtok(NULL, "="); + if (g.parm_compress == NULL) { + PRINT_WARN("No value for '%s' parameter " + "specified\n", PARM_COMP); + PRINT_WARN("Using default: %s\n", PARM_COMP_DFLT); + g.parm_compress = PARM_COMP_DFLT; + } else if ((strcmp(g.parm_compress, PARM_COMP_GZIP) != 0) && + (strcmp(g.parm_compress, PARM_COMP_NONE) != 0)) { + PRINT_WARN("Unknown dump compression '%s' " + "specified!\n", g.parm_compress); + PRINT_WARN("Using default: %s\n", PARM_COMP_DFLT); + g.parm_compress = PARM_COMP_DFLT; + } + } else if (strcmp(token, PARM_DEBUG) == 0) { + /* Dump Debug */ + char *s = strtok(NULL, "="); + if (s == NULL) { + PRINT_WARN("No value for '%s' parameter " + "specified\n", PARM_DEBUG); + PRINT_WARN("Using default: %d\n", PARM_DEBUG_DFLT); + } else { + g.parm_debug = atoi(s); + if ((g.parm_debug < PARM_DEBUG_MIN) || + (g.parm_debug > PARM_DEBUG_MAX)) { + PRINT_WARN("Invalid value (%i) for %s " + "parameter specified (allowed range is " + "%i - %i)\n", g.parm_debug, PARM_DEBUG, + PARM_DEBUG_MIN, PARM_DEBUG_MAX); + PRINT_WARN("Using default: %i\n", + PARM_DEBUG_DFLT); + g.parm_debug = PARM_DEBUG_DFLT; + } + } + } else if (strcmp(token, PARM_MODE) == 0) { + /* Dump Mode */ + char *s = strtok(NULL, "="); + if (s == NULL) { + PRINT_WARN("No value for '%s' parameter " + "specified\n", PARM_MODE); + PRINT_WARN("Using default: %s\n", PARM_MODE_DFLT); + } else if (strcmp(s, PARM_MODE_INTERACT) == 0) { + g.parm_mode = PARM_MODE_INTERACT_NUM; + } else if (strcmp(s, PARM_MODE_AUTO) == 0) { + g.parm_mode = PARM_MODE_AUTO_NUM; + } else { + PRINT_WARN("Unknown dump mode: %s\n", s); + PRINT_WARN("Using default: %s\n", PARM_MODE_DFLT); + } + } + return 0; +} + +/* + * Get dump parameters from /proc/cmdline + * Return: 0 - ok + * (!= 0) - error + */ +static int parse_parmline(void) +{ + int fh, i, count, token_cnt; + char *token; + char *parms[KERN_PARM_MAX]; + + /* setting defaults */ + + g.parm_compress = PARM_COMP_DFLT; + g.parm_dir = PARM_DIR_DFLT; + g.parm_part = PARM_PART_DFLT; + g.parm_debug = PARM_DEBUG_DFLT; + g.parm_mode = PARM_MODE_NUM_DFLT; + g.parm_mem = PARM_MEM_DFLT; + + fh = open(PROC_CMDLINE, O_RDONLY); + if (fh == -1) { + PRINT_PERR("open %s failed\n", PROC_CMDLINE); + return -1; + } + count = read(fh, g.parmline, CMDLINE_MAX_LEN); + if (count == -1) { + PRINT_PERR("read %s failed\n", PROC_CMDLINE); + close(fh); + return -1; + } + g.parmline[count-1] = '\0'; /* remove \n */ + token_cnt = 0; + token = strtok(g.parmline, " \t\n"); + while (token != NULL) { + parms[token_cnt] = token; + token = strtok(NULL, " \t\n"); + token_cnt++; + if (token_cnt >= KERN_PARM_MAX) { + PRINT_WARN("More than %i kernel parmameters " + "specified\n", KERN_PARM_MAX); + break; + } + } + for (i = 0; i < token_cnt; i++) { + if (parse_parameter(parms[i])) { + close(fh); + return -1; + } + } + PRINT_TRACE("dump dir : %s\n", g.parm_dir); + PRINT_TRACE("dump part : %s\n", g.parm_part); + PRINT_TRACE("dump comp : %s\n", g.parm_compress); + PRINT_TRACE("dump debug: %d\n", g.parm_debug); + PRINT_TRACE("dump mem: %llx\n", (unsigned long long) g.parm_mem); + + if (g.parm_mode == PARM_MODE_AUTO_NUM) + PRINT_TRACE("dump mode : %s\n", PARM_MODE_AUTO); + if (g.parm_mode == PARM_MODE_INTERACT_NUM) + PRINT_TRACE("dump mode : %s\n", PARM_MODE_INTERACT); + + sprintf(g.dump_dir, "%s/%s", DUMP_DIR, g.parm_dir); + close(fh); + return 0; +} + +static int write_to_file(const char *file, const char *command) +{ + int fh; + + PRINT_TRACE("Write: %s - %s\n", file, command); + fh = open(file, O_WRONLY); + if (fh == -1) { + PRINT_PERR("Could not open %s\n", file); + return -1; + } + if (write(fh, command, strlen(command)) == -1) { + PRINT_PERR("Write to %s failed\n", file); + close(fh); + return -1; + }; + close(fh); + return 0; +} + +static int read_file(const char *file, char *buf, int size) +{ + ssize_t count; + int fh; + + PRINT_TRACE("Read: %s:\n", file); + fh = open(file, O_RDONLY); + if (fh == -1) { + PRINT_PERR("open %s failed\n", file); + return -1; + } + count = read(fh, buf, size - 1); + if (count < 0) { + PRINT_PERR("read %s failed\n", file); + close(fh); + return -1; + } + buf[count] = 0; + if (buf[strlen(buf) - 1] == '\n') + buf[strlen(buf) - 1] = 0; /* strip newline */ + close(fh); + PRINT_TRACE("'%s'\n", buf); + + return 0; +} + +/* + * Get HSA size + */ +static __u64 get_hsa_size(void) +{ + char buf[128]; + + if (read_file(DEV_ZCORE_HSA, buf, sizeof(buf))) + return 0; + return strtoul(buf, NULL, 16); +} + +/* + * Release HSA + */ +static void release_hsa(void) +{ + write_to_file(DEV_ZCORE_HSA, "0"); +} + +/* + * Enable the scsi disk for dumping + * Return: 0 - ok + * != 0 - error + */ +static int enable_zfcp_device(void) +{ + char command[1024], file[1024]; + struct stat s; + + /* device */ + if (read_file(IPL_DEVNO, g.dump_devno, sizeof(g.dump_devno))) + return -1; + sprintf(file, "/sys/bus/ccw/drivers/zfcp/%s/online", g.dump_devno); + if (write_to_file(file, "1\n")) + return -1; + + /* wwpn */ + if (read_file(IPL_WWPN, g.dump_wwpn, sizeof(g.dump_wwpn))) + return -1; + sprintf(file, "/sys/bus/ccw/drivers/zfcp/%s/port_add", g.dump_devno); + /* The port_add attribute has been removed in recent kernels */ + if (stat(file, &s) == 0) { + sprintf(command, "%s\n", g.dump_wwpn); + if (write_to_file(file, command)) + return -1; + } + + /* lun */ + if (read_file(IPL_LUN, g.dump_lun, sizeof(g.dump_lun))) + return -1; + sprintf(file, "/sys/bus/ccw/drivers/zfcp/%s/%s/unit_add", g.dump_devno, + g.dump_wwpn); + sprintf(command, "%s\n", g.dump_lun); + if (write_to_file(file, command)) + return -1; + + /* bootprog */ + read_file("/sys/firmware/ipl/bootprog", g.dump_bootprog, + sizeof(g.dump_bootprog)); + + return 0; +} + +/* + * Mount the dump device + * Return: 0 - ok + * != 0 - error + */ +static int mount_dump_device(void) +{ + int pid; + char dump_part[16]; + + PRINT_TRACE("e2fsck\n"); + sprintf(dump_part, "%s%i", DEV_SCSI, atoi(g.parm_part)); + + pid = fork(); + if (pid < 0) { + PRINT_PERR("fork failed\n"); + return -1; + } else if (pid == 0) { + execl("/bin/e2fsck", "e2fsck", dump_part, "-y", NULL); + execl("/sbin/e2fsck", "e2fsck", dump_part, "-y", NULL); + exit(1); + } else { + waitpid(pid, NULL, 0); + } + + PRINT_TRACE("mount\n"); + if (mount(dump_part, DUMP_DIR, "ext4", 0, NULL) == 0) + return 0; + if (mount(dump_part, DUMP_DIR, "ext3", 0, NULL) == 0) + return 0; + if (mount(dump_part, DUMP_DIR, "ext2", 0, NULL) != 0) { + PRINT_PERR("mount failed\n"); + return -1; + } + return 0; +} + +/* + * unmount the dump device + * Return: 0 - ok + * != 0 - error + */ +static int umount_dump_device(void) +{ + if (umount(DUMP_DIR) != 0) { + PRINT_PERR("umount failed\n"); + return -1; + } + return 0; +} + +/* + * Terminate the system dumper + */ +static void terminate(void) +{ + int fd; + + sleep(WAIT_TIME_END); /* give the messages time to be displayed */ + fd = open(DEV_ZCORE_REIPL, O_WRONLY, 0); + if (fd == -1) + goto no_reipl; + write(fd, REIPL, 1); + close(fd); +no_reipl: + reboot(LINUX_REBOOT_CMD_POWER_OFF); +} + +/* + * Signal handler for zfcp_dumper + */ +static __sighandler_t dump_sig_handler(int sig, siginfo_t *sip, void*p) +{ + PRINT_ERR("Got signal: %i\n", sig); + PRINT_ERR("Dump failed!\n"); + terminate(); + return 0; +} + +/* + * Setup the Signal handler for zfcp_dumper + * Return: 0 - ok + * !=0 - error + */ +static int init_sig(void) +{ + g.sigact.sa_flags = (SA_NODEFER | SA_SIGINFO | SA_RESETHAND); + g.sigact.sa_handler = (__sighandler_t)dump_sig_handler; + if (sigemptyset(&g.sigact.sa_mask) < 0) + return -1; + if (sigaction(SIGINT, &g.sigact, NULL) < 0) + return -1; + if (sigaction(SIGTERM, &g.sigact, NULL) < 0) + return -1; + if (sigaction(SIGPIPE, &g.sigact, NULL) < 0) + return -1; + if (sigaction(SIGABRT, &g.sigact, NULL) < 0) + return -1; + if (sigaction(SIGSEGV, &g.sigact, NULL) < 0) + return -1; + if (sigaction(SIGBUS, &g.sigact, NULL) < 0) + return -1; + + return 0; +} + +/* + * Set memory management parameters: Ensure that dirty pages are written + * early enough! See "Documentation/filesystems/proc.txt" + * Return: 0 - ok + * !=0 - error + */ +static int tune_vm(void) +{ + char *sysctl_names[] = {"/proc/sys/vm/dirty_ratio", + "/proc/sys/vm/dirty_background_ratio", + "/proc/sys/vm/dirty_writeback_centisecs", + "/proc/sys/vm/dirty_expire_centisecs", + "/proc/sys/vm/vfs_cache_pressure", + "/proc/sys/vm/lowmem_reserve_ratio", + NULL}; + char *sysctl_values[] = {"2", "5", "50", "50", "500", "32", NULL}; + int i; + + i = 0; + while (sysctl_names[i]) { + if (write_to_file(sysctl_names[i], sysctl_values[i])) + return -1; + i++; + } + return 0; +} + +/* + * Get dump number of either new dump or dump to erase + * Parameter: dumpdir - dump directory (absolute path) + * mode - DUMP_FIRST: Find smallest dump number in directory + * - DUMP_LAST: Find highest dump number in directory + * Return: >= 0 - dump number + * -1 - no dump found in directory + * <-1 - error + */ +static int get_dump_num(const char *dumpdir, int mode) +{ + DIR *dir = NULL; + struct dirent *dir_ent; + int dump_found, rc; + + rc = 0; + dump_found = 0; + dir = opendir(dumpdir); + if (!dir) { + PRINT_PERR("Cannot evalute dump number\n"); + return -2; + } + + while ((dir_ent = readdir(dir))) { + int num; + if (sscanf(dir_ent->d_name, "dump.%ui", &num) == 1) { + char suffix[1024] = {}; + + /* + * check if we have something like dump.001 + * this is not treated as dump, since we do not allow + * leading zeros. + * Also files like dump.-1, dump.-10 are ignored. + */ + sscanf(dir_ent->d_name, "dump.%s", suffix); + if (suffix[0] == '-') + continue; + if ((suffix[0] == '0') && isdigit(suffix[1])) + continue; + if (!dump_found) { + dump_found = 1; + rc = num; + } else if (mode == DUMP_LAST) { + rc = MAX(num, rc); + } else if (mode == DUMP_FIRST) { + rc = MIN(num, rc); + } + } + } + if (!dump_found) + rc = NO_DUMP; + closedir(dir); + + return rc; +} + +/* + * Erase oldest dump in dump directory + * Return: 0 - ok + * !=0 - error + */ +static int erase_oldest_dump(void) +{ + int dump_nr; + char dname[1024] = {}; + char answ[1024] = {}; + + dump_nr = get_dump_num(g.dump_dir, DUMP_FIRST); + if (dump_nr < 0) { + PRINT_ERR("Internal error: dump number cannot be evaluated\n"); + return -1; + } + sprintf(dname, "dump.%i", dump_nr); + if (dump_nr == g.dump_nr) { + PRINT_ERR("Sorry, cannot delete any more dumps!\n"); + return -1; + } + if (g.parm_mode == PARM_MODE_AUTO_NUM) { + PRINT("Removing oldest dump: '%s'\n", dname); + } else { + while ((strcmp(answ, "y") != 0) && (strcmp(answ, "n") != 0)) { + PRINT("Remove oldest dump: '%s' (y/n)? ", dname); + scanf("%s", answ); + } + if (strcmp(answ, "n") == 0) + return -1; + } + sprintf(dname, "%s/dump.%i", g.dump_dir, dump_nr); + if (unlink(dname) == -1) { + PRINT_PERR("Could not remove dump\n"); + return -1; + } + sync(); + /* + * Wait some seconds in order to give ext3 time to discover that file + * has been removed. + */ + sleep(WAIT_TIME_ERASE); + PRINT("Dump removed!\n"); + return 0; +} + +/* + * write buffer to dump. In case of ENOSPC try to remove oldest dump + * Parameter: fd - filedescriptor of dump file + * buf - buffer to write + * count - nr of bytes to write + * + * Return: size - written bytes + * <0 - error + */ +static ssize_t dump_write(int fd, const void *buf, size_t count) +{ + ssize_t written, rc; + + written = 0; + while (written != count) { + rc = write(fd, buf + written, count - written); + if ((rc == -1) && (errno == ENOSPC)) { + PRINT_ERR("No space left on device!\n"); + /* Try to erase old dump */ + if (erase_oldest_dump()) + return -1; + continue; + } else if (rc == -1) { + /* Write failed somehow */ + return -1; + } + written += rc; + } + return written; +} + +#ifdef GZIP_SUPPORT +/* + * Wrapper to gzip compress routine + * Parameter: old - buffer to compress (in) + * old_size - size of old buffer in bytes (in) + * new - buffer for compressed data (out) + * new_size - size of 'new' buffer in bytes (in) + * Return: >=0 - Size of compressed buffer + * < 0 - error + */ +static int compress_gzip(const unsigned char *old, __u32 old_size, + unsigned char *new, __u32 new_size) +{ + int rc; + unsigned long len; + + len = old_size; + rc = compress(new, &len, old, new_size); + switch (rc) { + case Z_OK: + return len; + case Z_MEM_ERROR: + PRINT_ERR("Z_MEM_ERROR (not enough memory)!\n"); + return -1; + case Z_BUF_ERROR: + /* In this case the compressed output is bigger than + the uncompressed */ + return -1; + case Z_DATA_ERROR: + PRINT_ERR("Z_DATA_ERROR (input data corrupted)!\n"); + return -1; + default: + PRINT_ERR("Z_UNKNOWN_ERROR (rc 0x%x unknown)!\n", rc); + return -1; + } +} +#endif + +/* + * Do nothing! - No compression + */ +static int compress_none(const unsigned char *old, __u32 old_size, + unsigned char *new, __u32 new_size) +{ + return -1; /* "-1" indicates, that compression was not done */ +} + +/* + * Convert s390 standalone dump header to lkcd dump header + * Parameter: s390_dh - s390 dump header (in) + * dh - lkcd dump header (out) + */ +static void s390_to_lkcd_hdr(struct dump_hdr_s390 *s390_dh, + struct dump_hdr_lkcd *dh) +{ + struct timeval h_time; + + /* adjust todclock to 1970 */ + __u64 tod = s390_dh->tod; + tod -= 0x8126d60e46000000LL - (0x3c26700LL * 1000000 * 4096); + tod >>= 12; + h_time.tv_sec = tod / 1000000; + h_time.tv_usec = tod % 1000000; + + dh->memory_size = s390_dh->memory_size; + dh->memory_start = s390_dh->memory_start; + dh->memory_end = s390_dh->memory_end; + dh->num_dump_pages = s390_dh->num_pages; + dh->page_size = s390_dh->page_size; + dh->dump_level = s390_dh->dump_level; + + sprintf(dh->panic_string, "zSeries-dump (CPUID = %16llx)", + (unsigned long long) s390_dh->cpu_id); + + if (s390_dh->arch_id == DH_ARCH_ID_S390) + strcpy(dh->utsname_machine, "s390"); + else if (s390_dh->arch_id == DH_ARCH_ID_S390X) + strcpy(dh->utsname_machine, "s390x"); + else + strcpy(dh->utsname_machine, ""); + + strcpy(dh->utsname_sysname, ""); + strcpy(dh->utsname_nodename, ""); + strcpy(dh->utsname_release, ""); + strcpy(dh->utsname_version, ""); + strcpy(dh->utsname_domainname, ""); + + dh->magic_number = DUMP_MAGIC_NUMBER; + dh->version = DUMP_VERSION_NUMBER; + dh->header_size = sizeof(struct dump_hdr_lkcd); + dh->time.tv_sec = h_time.tv_sec; + dh->time.tv_usec = h_time.tv_usec; +} + +/* + * Convert s390 standalone dump header to lkcd asm dump header + * Parameter: s390_dh - s390 dump header (in) + * dh_asm - lkcd asm dump header (out) + */ +static void s390_to_lkcd_hdr_asm(struct dump_hdr_s390 *s390_dh, + struct dump_hdr_lkcd_asm *dh_asm) +{ + unsigned int i; + + dh_asm->magic_number = DUMP_MAGIC_NUMBER_ASM; + dh_asm->version = 0; + dh_asm->hdr_size = sizeof(*dh_asm); + dh_asm->cpu_cnt = s390_dh->cpu_cnt; + dh_asm->real_cpu_cnt = s390_dh->real_cpu_cnt; + for (i = 0; i < dh_asm->cpu_cnt; i++) + dh_asm->lc_vec[i] = s390_dh->lc_vec[i]; +} + +/* + * Write progress information to screen + * Parameter: written - So many bytes have been written to the dump + * max - This is the whole memory to be written + */ +static void show_progress(unsigned long long written, unsigned long long max) +{ + int time; + struct timeval t; + double percent; + + gettimeofday(&t, NULL); + time = t.tv_sec; + if ((time < g.last_progress) && (written != max) && (written != 0)) + return; + g.last_progress = time + 10; + percent = ((double) written / (double) max) * 100.0; + PRINT(" %4lli MB of %4lli MB (%5.1f%% )\n", written >> 20, max >> 20, + percent); + fflush(stdout); +} + +/* + * create dump + * + * Return: 0 - ok + * !=0 - error + */ +static int create_dump(void) +{ + struct stat stat_buf; + struct dump_hdr_lkcd dh; + struct dump_hdr_lkcd_asm dh_asm; + struct dump_hdr_s390 s390_dh; + compress_fn_t compress_fn; + struct dump_page dp; + char buf[PAGE_SIZE], dpcpage[PAGE_SIZE]; + char dump_name[1024]; + __u64 mem_loc, mem_count, hsa_size; + __u32 buf_loc = 0, dp_size, dp_flags; + int size, fin, fout, fmap, rc = 0; + char c_info[CHUNK_INFO_SIZE]; + struct mem_chunk *chunk, *chunk_first = NULL, *chunk_prev = NULL; + char *end_ptr; + void *page_buf; + + if (stat(g.dump_dir, &stat_buf) < 0) { + PRINT_ERR("Specified dump dir '%s' not found!\n", g.dump_dir); + return -1; + } else if (!S_ISDIR(stat_buf.st_mode)) { + PRINT_ERR("Specified dump dir '%s' is not a directory!\n", + g.dump_dir); + return -1; + } + + /* Allocate buffer for writing */ + if (posix_memalign(&page_buf, PAGE_SIZE, DUMP_BUF_SIZE)) { + PRINT_ERR("Out of memory: Could not allocate dump buffer\n"); + return -1; + } + + /* initialize progress time */ + g.last_progress = 0; + + /* get dump number */ + g.dump_nr = get_dump_num(g.dump_dir, DUMP_LAST); + if (g.dump_nr == NO_DUMP) + g.dump_nr = 0; + else if (g.dump_nr >= 0) + g.dump_nr += 1; + else + return -1; + + /* Open the memory map file - only available with kernel 2.6.25 or + * higher. If open fails, memory holes cannot be detected and only + * one single memory chunk is assumed */ + fmap = open(DEV_ZCORE_MAP, O_RDONLY, 0); + if (fmap == -1) { + chunk_first = calloc(1, sizeof(struct mem_chunk)); + if (chunk_first == NULL) { + PRINT_ERR("Could not allocate %d bytes of memory\n", + (int) sizeof(struct mem_chunk)); + return -1; + } + chunk_first->size = PARM_MEM_DFLT; + } else { + /* read information about memory chunks (start address and size) */ + do { + if (read(fmap, c_info, sizeof(c_info)) != sizeof(c_info)) { + PRINT_ERR("read() memory map file '%s' " + "failed!\n", DEV_ZCORE_MAP); + rc = -1; + goto failed_close_fmap; + } + chunk = calloc(1, sizeof(struct mem_chunk)); + if (chunk == NULL) { + PRINT_ERR("Could not allocate %d bytes of " + "memory\n", + (int) sizeof(struct mem_chunk)); + rc = -1; + goto failed_free_chunks; + } + chunk->size = strtoul(c_info + 17, &end_ptr, 16); + if (end_ptr != c_info + 33 || *end_ptr != ' ') { + PRINT_ERR("Invalid contents of memory map " + "file '%s'!\n", DEV_ZCORE_MAP); + rc = -1; + goto failed_free_chunks; + } + if (chunk->size == 0) + break; + chunk->addr = strtoul(c_info, &end_ptr, 16); + if (end_ptr != c_info + 16 || *end_ptr != ' ') { + PRINT_ERR("Invalid contents of memory map " + "file '%s'!\n", DEV_ZCORE_MAP); + rc = -1; + goto failed_free_chunks; + } + if (!chunk_first) + chunk_first = chunk; + else + chunk_prev->next = chunk; + chunk_prev = chunk; + } while (1); + } + + hsa_size = get_hsa_size(); + PRINT_TRACE("hsa size: %llx\n", (unsigned long long) hsa_size); + + /* try to open the source device */ + fin = open(DEV_ZCORE, O_RDONLY, 0); + if (fin == -1) { + PRINT_ERR("open() source device '%s' failed!\n", DEV_ZCORE); + rc = -1; + goto failed_free_chunks; + } + + /* make the new filename */ + sprintf(dump_name, "%s/dump.%d", g.dump_dir, g.dump_nr); + fout = open(dump_name, DUMP_FLAGS, DUMP_MODE); + if (fout == -1) { + PRINT_ERR("open() of dump file \"%s\" failed!\n", dump_name); + rc = -1; + goto failed_close_fin; + } + + PRINT("dump file: dump.%d\n", g.dump_nr); + memset(&dh, 0, sizeof(dh)); + + /* get the dump header */ + if (lseek(fin, 0, SEEK_SET) < 0) { + PRINT_ERR("Cannot lseek() to get the dump header from the " + "dump file!\n"); + rc = -1; + goto failed_close_fout; + } + if (read(fin, &s390_dh, sizeof(s390_dh)) != sizeof(s390_dh)) { + PRINT_ERR("Cannot read() dump header from dump file!\n"); + rc = -1; + goto failed_close_fout; + } + + s390_to_lkcd_hdr(&s390_dh, &dh); + if (s390_dh.version >= 5) + s390_to_lkcd_hdr_asm(&s390_dh, &dh_asm); + + if (strcmp(g.parm_compress, PARM_COMP_GZIP) == 0) { +#ifdef GZIP_SUPPORT + dh.dump_compress = DUMP_COMPRESS_GZIP; + compress_fn = compress_gzip; +#else + PRINT_WARN("No gzip support. Compression disabled!\n"); + dh.dump_compress = DUMP_COMPRESS_NONE; + compress_fn = compress_none; +#endif + } else { + dh.dump_compress = DUMP_COMPRESS_NONE; + compress_fn = compress_none; + } + + if (g.parm_mem < dh.memory_size) { + /* dump_mem parameter specified: Adjust memory size */ + dh.memory_size = g.parm_mem; + dh.memory_end = g.parm_mem; + dh.num_dump_pages = g.parm_mem / dh.page_size; + } + + memset(page_buf, 0, PAGE_SIZE); + memcpy(page_buf, &dh, sizeof(dh)); + memcpy(page_buf + sizeof(dh), &dh_asm, sizeof(dh_asm)); + if (lseek(fout, 0L, SEEK_SET) < 0) { + PRINT_ERR("lseek() failed\n"); + rc = -1; + goto failed_close_fout; + } + if (dump_write(fout, page_buf, PAGE_SIZE) != PAGE_SIZE) { + PRINT_ERR("Error: Write dump header failed\n"); + rc = -1; + goto failed_close_fout; + } + if (lseek(fout, LKCD_HDR_SIZE, SEEK_SET) < 0) { + PRINT_ERR("lseek() failed\n"); + rc = -1; + goto failed_close_fout; + } + + /* write dump */ + + chunk = chunk_first; + mem_loc = 0; + mem_count = 0; + if (lseek(fin, DUMP_HEADER_SZ_S390SA, SEEK_SET) < 0) { + PRINT_ERR("lseek() failed\n"); + rc = -1; + goto failed_close_fout; + } + while (mem_loc < dh.memory_end) { + if (mem_loc >= chunk->addr + chunk->size) { + chunk = chunk->next; + mem_loc = chunk->addr; + if (lseek(fin, DUMP_HEADER_SZ_S390SA + mem_loc, + SEEK_SET) < 0) { + PRINT_ERR("lseek() failed\n"); + rc = -1; + goto failed_close_fout; + } + } + if (hsa_size && mem_loc >= hsa_size) { + release_hsa(); + hsa_size = 0; + } + if (read(fin, buf, PAGE_SIZE) != PAGE_SIZE) { + if (errno == EFAULT) { + /* probably memory hole. Skip page */ + mem_loc += PAGE_SIZE; + continue; + } + PRINT_PERR("read error\n"); + rc = -1; + goto failed_close_fout; + } + memset(dpcpage, 0, PAGE_SIZE); + /* get the new compressed page size */ + + size = compress_fn((unsigned char *)buf, PAGE_SIZE, + (unsigned char *)dpcpage, PAGE_SIZE); + + /* if compression failed or compressed was ineffective, + * we write an uncompressed page */ + if (size < 0) { + dp_flags = DUMP_DH_RAW; + dp_size = PAGE_SIZE; + } else { + dp_flags = DUMP_DH_COMPRESSED; + dp_size = size; + } + dp.address = mem_loc; + dp.size = dp_size; + dp.flags = dp_flags; + memcpy(page_buf + buf_loc, &dp, sizeof(dp)); + buf_loc += sizeof(struct dump_page); + /* copy the page of memory */ + if (dp_flags & DUMP_DH_COMPRESSED) + /* copy the compressed page */ + memcpy(page_buf + buf_loc, dpcpage, dp_size); + else + /* copy directly from memory */ + memcpy(page_buf + buf_loc, buf, dp_size); + buf_loc += dp_size; + mem_loc += PAGE_SIZE; + mem_count += PAGE_SIZE; + if (buf_loc + PAGE_SIZE + sizeof(dp) > DUMP_BUF_SIZE) { + unsigned long rem = buf_loc % PAGE_SIZE; + long size = buf_loc - rem; + + if (dump_write(fout, page_buf, size) != size) { + PRINT_ERR("write error\n"); + rc = -1; + goto failed_close_fout; + } + memmove(page_buf, page_buf + size, rem); + buf_loc = rem; + } + show_progress(mem_count, dh.memory_size); + } + + /* write end marker */ + + dp.address = 0x0; + dp.size = 0x0; + dp.flags = DUMP_DH_END; + memcpy(page_buf + buf_loc, &dp, sizeof(dp)); + buf_loc += sizeof(dp); + dump_write(fout, page_buf, buf_loc + (PAGE_SIZE - buf_loc % PAGE_SIZE)); + if (ftruncate(fout, lseek(fout, 0, SEEK_CUR) - + (PAGE_SIZE - buf_loc % PAGE_SIZE)) == -1) { + PRINT_ERR("truncate error\n"); + rc = -1; + goto failed_close_fout; + } + dump_write(fout, &dp, sizeof(dp)); + +failed_close_fout: + close(fout); +failed_close_fin: + close(fin); +failed_free_chunks: + chunk = chunk_first; + while (chunk) { + chunk_prev = chunk; + chunk = chunk->next; + free(chunk_prev); + } +failed_close_fmap: + close(fmap); + return rc; +} + +/* + * Load a kernel module + */ +static void modprobe(const char *module) +{ + pid_t pid; + + pid = fork(); + if (pid < 0) { + PRINT_PERR("fork failed\n"); + return; + } else if (pid == 0) { + execl("/bin/modprobe", "modprobe", module, "-q", NULL); + execl("/sbin/modprobe", "modprobe", module, "-q", NULL); + exit(1); + } else { + waitpid(pid, NULL, 0); + } +} + +/* + * Load all required kernel modules + */ +static void load_modules(void) +{ + int i; + + for (i = 0; module_list[i]; i++) + modprobe(module_list[i]); +} + +/* + * main routine of the zfcp_dumper + */ +int main(int argc, char *argv[]) +{ + char linux_version[128]; + +#ifdef __s390x__ + PRINT("Linux System Dumper starting\n"); + PRINT("Version %s (64 bit)\n", ZFCPDUMP_VERSION); +#else + PRINT("Linux System Dumper starting\n"); + PRINT("Version %s (32 bit)\n", ZFCPDUMP_VERSION); +#endif + + if (init_sig()) { + PRINT_ERR("Init Signals failed!\n"); + goto fail; + } + if (mount("proc", "/proc", "proc", 0, NULL)) { + if (errno != EBUSY) { + PRINT_PERR("Unable to mount proc\n"); + goto fail; + } + } + read_file("/proc/version", linux_version, sizeof(linux_version)); + PRINT("%s\n", linux_version); + + if (mount("sysfs", "/sys", "sysfs", 0, NULL)) { + if (errno != EBUSY) { + PRINT_PERR("Unable to mount sysfs\n"); + goto fail; + } + } + if (mount("debugfs", "/sys/kernel/debug", "debugfs", 0, NULL)) { + if (errno != EBUSY) { + PRINT_PERR("Unable to mount debugfs\n"); + goto fail; + } + } + if (tune_vm()) { + PRINT_PERR("Unable to set VM settings\n"); + goto fail; + } + if (parse_parmline()) { + PRINT_ERR("Could not parse parmline\n"); + goto fail; + } + load_modules(); + if (enable_zfcp_device()) { + PRINT_ERR("Could not enable dump device\n"); + goto fail; + } + PRINT(" \n"); /* leading blank is needed that sclp console prints + the newline */ + PRINT("DUMP PARAMETERS:\n"); + PRINT("================\n"); + PRINT("devno : %s\n", g.dump_devno); + PRINT("wwpn : %s\n", g.dump_wwpn); + PRINT("lun : %s\n", g.dump_lun); + PRINT("conf : %s\n", g.dump_bootprog); + PRINT("partition: %s\n", g.parm_part); + PRINT("directory: %s\n", g.parm_dir); + PRINT("compress : %s\n", g.parm_compress); + PRINT(" \n"); + PRINT("MOUNT DUMP PARTITION:\n"); + PRINT("=====================\n"); + sleep(WAIT_TIME_ONLINE); + if (mount_dump_device()) { + PRINT_ERR("Could not mount dump device\n"); + goto fail; + } + PRINT("DONE.\n"); + PRINT(" \n"); + PRINT("DUMP PROCESS STARTED:\n"); + PRINT("=====================\n"); + + if (create_dump()) + goto fail_umount; + + if (umount_dump_device()) { + PRINT_ERR("Could not umount dump device\n"); + goto fail; + } + PRINT(" \n"); + PRINT("DUMP 'dump.%i' COMPLETE\n", g.dump_nr); + fflush(stdout); + terminate(); + return 0; + +fail_umount: + if (umount_dump_device()) { + PRINT_ERR("Could not umount dump device\n"); + goto fail; + } +fail: + PRINT("DUMP 'dump.%i' FAILED\n", g.dump_nr); + fflush(stdout); + terminate(); + return 1; +} diff -uprN linux-2.6.32/arch/s390/boot/zfcpdump.h linux-2.6.32.ovz/arch/s390/boot/zfcpdump.h --- linux-2.6.32/arch/s390/boot/zfcpdump.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/boot/zfcpdump.h 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,251 @@ +/* + * zfcpdump userspace tool + * + * Copyright IBM Corp. 2003, 2007. + * Author(s): Michael Holzheu + */ + +#ifndef _ZFCPDUMP_H +#define _ZFCPDUMP_H + +#include +#include +#include + +#define ZFCPDUMP_VERSION "2.2" + +#define PRINT_TRACE(x...) \ + do { \ + if (g.parm_debug >= 3) { \ + fprintf(stderr, "TRACE: "); \ + fprintf(stderr, ##x); \ + } \ + } while (0) + +#define PRINT_ERR(x...) \ + do { \ + fprintf(stderr, "ERROR: "); \ + fprintf(stderr, ##x); \ + } while (0) + +#define PRINT_WARN(x...) \ + do { \ + fprintf(stderr, "WARNING: "); \ + fprintf(stderr, ##x); \ + } while (0) + +#define PRINT_PERR(x...) \ + do { \ + fprintf(stderr, "ERROR: "); \ + fprintf(stderr, ##x); \ + perror(""); \ + } while (0) + +#define PRINT(x...) fprintf(stdout, ##x) +#define CMDLINE_MAX_LEN 1024 +#define KERN_PARM_MAX 100 + +#define DUMP_FLAGS (O_CREAT | O_RDWR | O_TRUNC | O_DIRECT) +#define DUMP_MODE (S_IRUSR | S_IWUSR | S_IRGRP) + +struct globals { + char *parm_compress; + char *parm_dir; + char *parm_part; + int parm_debug; + int parm_mode; + __u64 parm_mem; + char parmline[CMDLINE_MAX_LEN]; + char dump_dir[1024]; + int dump_nr; + int last_progress; + struct sigaction sigact; + char dump_devno[16]; + char dump_wwpn[32]; + char dump_lun[32]; + char dump_bootprog[32]; +}; + +#ifndef MIN +#define MIN(x, y) ((x) < (y) ? (x) : (y)) +#endif +#ifndef MAX +#define MAX(x, y) ((x) > (y) ? (x) : (y)) +#endif + +#define PROC_CMDLINE "/proc/cmdline" +#define PROC_MISC "/proc/misc" +#define DEV_ZCORE "/sys/kernel/debug/zcore/mem" +#define DEV_ZCORE_MAP "/sys/kernel/debug/zcore/memmap" +#define DEV_ZCORE_REIPL "/sys/kernel/debug/zcore/reipl" +#define DEV_ZCORE_HSA "/sys/kernel/debug/zcore/hsa" +#define REIPL "1" +#define DEV_SCSI "/dev/sda" +#define DUMP_DIR "/mnt" + +#define IPL_WWPN "/sys/firmware/ipl/wwpn" +#define IPL_DEVNO "/sys/firmware/ipl/device" +#define IPL_LUN "/sys/firmware/ipl/lun" + +#define PARM_DIR "dump_dir" +#define PARM_DIR_DFLT "/" + +#define PARM_PART "dump_part" +#define PARM_PART_DFLT "1" + +#define PARM_COMP "dump_compress" +#define PARM_COMP_GZIP "gzip" +#define PARM_COMP_NONE "none" +#define PARM_COMP_DFLT PARM_COMP_NONE + +#define PARM_MEM "dump_mem" +#ifdef __s390x__ +#define PARM_MEM_DFLT 0xffffffffffffffff +#else +#define PARM_MEM_DFLT 0xffffffff +#endif + +#define PARM_DEBUG "dump_debug" +#define PARM_DEBUG_DFLT 2 +#define PARM_DEBUG_MIN 1 +#define PARM_DEBUG_MAX 6 + +#define PARM_MODE "dump_mode" +#define PARM_MODE_INTERACT "interactive" +#define PARM_MODE_INTERACT_NUM 0 +#define PARM_MODE_AUTO "auto" +#define PARM_MODE_AUTO_NUM 1 +#define PARM_MODE_DFLT PARM_MODE_INTERACT +#define PARM_MODE_NUM_DFLT PARM_MODE_INTERACT_NUM + +#define DUMP_FIRST 0 +#define DUMP_LAST 1 +#define NO_DUMP -1 + +#define WAIT_TIME_ERASE 5 /* seconds */ +#define WAIT_TIME_END 3 /* seconds */ +#define WAIT_TIME_ONLINE 2 /* seconds */ + +#define UTS_LEN 65 + +#define PAGE_SIZE 4096 +#define DUMP_BUF_SIZE (80 * PAGE_SIZE) +#define LKCD_HDR_SIZE (64 * 1024) + +/* header definitions for dumps from s390 standalone dump tools */ +#define DUMP_MAGIC_S390SA 0xa8190173618f23fdULL /* s390sa magic number */ +#define DUMP_HEADER_SZ_S390SA 4096 + +/* standard header definitions */ +#define DUMP_MAGIC_NUMBER 0xa8190173618f23edULL /* dump magic number */ +#define DUMP_MAGIC_NUMBER_ASM 0x733339302d64756dULL +#define DUMP_VERSION_NUMBER 0x8 /* dump version number */ +#define DUMP_PANIC_LEN 0x100 /* dump panic string length */ + +/* dump compression options -- add as necessary */ +#define DUMP_COMPRESS_NONE 0x0 /* don't compress this dump */ +#define DUMP_COMPRESS_GZIP 0x2 /* use GZIP compression */ + +/* dump header flags -- add as necessary */ +#define DUMP_DH_RAW 0x1 /* raw page (no compression) */ +#define DUMP_DH_COMPRESSED 0x2 /* page is compressed */ +#define DUMP_DH_END 0x4 /* end marker on a full dump */ + +#define CHUNK_INFO_SIZE 34 /* 2 16-byte char, each followed by blank */ + +/* + * This is the header dumped at the top of every valid crash dump. + */ +struct dump_hdr_lkcd { + __u64 magic_number; + __u32 version; + __u32 header_size; + __u32 dump_level; + __u32 page_size; + __u64 memory_size; + __u64 memory_start; + __u64 memory_end; + __u32 num_dump_pages; + char panic_string[DUMP_PANIC_LEN]; + struct { + __u64 tv_sec; + __u64 tv_usec; + } time; + char utsname_sysname[UTS_LEN]; + char utsname_nodename[UTS_LEN]; + char utsname_release[UTS_LEN]; + char utsname_version[UTS_LEN]; + char utsname_machine[UTS_LEN]; + char utsname_domainname[UTS_LEN]; + __u64 current_task; + __u32 dump_compress; + __u32 dump_flags; + __u32 dump_device; +} __attribute__((packed)); + +#define DH_ARCH_ID_S390X 2 +#define DH_ARCH_ID_S390 1 + +/* + * s390 LKCD asm header + */ +struct dump_hdr_lkcd_asm { + __u64 magic_number; + __u32 version; + __u32 hdr_size; + __u16 cpu_cnt; + __u16 real_cpu_cnt; + __u32 lc_vec[512]; +} __attribute__((packed)); + +/* + * This is the header used by zcore + */ +struct dump_hdr_s390 { + __u64 magic_number; + __u32 version; + __u32 header_size; + __u32 dump_level; + __u32 page_size; + __u64 memory_size; + __u64 memory_start; + __u64 memory_end; + __u32 num_pages; + __u32 pad; + __u64 tod; + __u64 cpu_id; + __u32 arch_id; + __u32 volnr; + __u32 build_arch; + __u64 rmem_size; + __u8 mvdump; + __u16 cpu_cnt; + __u16 real_cpu_cnt; + __u8 end_pad1[0x200-0x061]; + __u64 mvdump_sign; + __u64 mvdump_zipl_time; + __u8 end_pad2[0x800-0x210]; + __u32 lc_vec[512]; +} __attribute__((packed)); + +/* + * Header associated to each physical page of memory saved in the system + * crash dump. + */ +struct dump_page { + __u64 address; /* the address of this dump page */ + __u32 size; /* the size of this dump page */ + __u32 flags; /* flags (DUMP_COMPRESSED, DUMP_RAW or DUMP_END) */ +} __attribute__((packed)); + +struct mem_chunk { + __u64 addr; /* the start address of this memory chunk */ + __u64 size; /* the length of this memory chunk */ + struct mem_chunk *next; /* pointer to next memory chunk */ +}; + +/* Compression function */ +typedef int (*compress_fn_t)(const unsigned char *old, __u32 old_size, + unsigned char *new, __u32 size); + +#endif /* _ZFCPDUMP_H */ diff -uprN linux-2.6.32/arch/s390/boot/zfcpdump_initramfs.txt linux-2.6.32.ovz/arch/s390/boot/zfcpdump_initramfs.txt --- linux-2.6.32/arch/s390/boot/zfcpdump_initramfs.txt 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/boot/zfcpdump_initramfs.txt 2015-03-10 13:20:57.000000000 -0700 @@ -0,0 +1,27 @@ +# +# initramfs definition for zfcpdump +# See also Documentation/s390/zfcpdump.txt +# + +dir /dev 755 0 0 +nod /dev/console 644 0 0 c 5 1 +nod /dev/null 644 0 0 c 1 3 +nod /dev/sda1 644 0 0 b 8 1 +nod /dev/sda2 644 0 0 b 8 2 +nod /dev/sda3 644 0 0 b 8 3 +nod /dev/sda4 644 0 0 b 8 4 +nod /dev/sda5 644 0 0 b 8 5 +nod /dev/sda6 644 0 0 b 8 6 +nod /dev/sda7 644 0 0 b 8 7 +nod /dev/sda8 644 0 0 b 8 8 +nod /dev/sda9 644 0 0 b 8 9 +nod /dev/sda10 644 0 0 b 8 10 +nod /dev/sda11 644 0 0 b 8 11 +nod /dev/sda12 644 0 0 b 8 12 +nod /dev/sda13 644 0 0 b 8 13 +nod /dev/sda14 644 0 0 b 8 14 +nod /dev/sda15 644 0 0 b 8 15 +file /init arch/s390/boot/zfcpdump 755 0 0 +dir /proc 755 0 0 +dir /sys 755 0 0 +dir /mnt 755 0 0 diff -uprN linux-2.6.32/arch/s390/crypto/aes_s390.c linux-2.6.32.ovz/arch/s390/crypto/aes_s390.c --- linux-2.6.32/arch/s390/crypto/aes_s390.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/aes_s390.c 2015-03-10 13:24:13.000000000 -0700 @@ -25,16 +25,18 @@ #include #include #include +#include #include "crypt_s390.h" #define AES_KEYLEN_128 1 #define AES_KEYLEN_192 2 #define AES_KEYLEN_256 4 -static char keylen_flag = 0; +static u8 *ctrblk; +static DEFINE_SPINLOCK(ctrblk_lock); +static char keylen_flag; struct s390_aes_ctx { - u8 iv[AES_BLOCK_SIZE]; u8 key[AES_MAX_KEY_SIZE]; long enc; long dec; @@ -45,6 +47,23 @@ struct s390_aes_ctx { } fallback; }; +struct pcc_param { + u8 key[32]; + u8 tweak[16]; + u8 block[16]; + u8 bit[16]; + u8 xts[16]; +}; + +struct s390_xts_ctx { + u8 key[32]; + u8 pcc_key[32]; + long enc; + long dec; + int key_len; + struct crypto_blkcipher *fallback; +}; + /* * Check if the key_len is supported by the HW. * Returns 0 if it is, a positive number if it is not and software fallback is @@ -307,7 +326,8 @@ static int ecb_aes_crypt(struct blkciphe u8 *in = walk->src.virt.addr; ret = crypt_s390_km(func, param, out, in, n); - BUG_ON((ret < 0) || (ret != n)); + if (ret < 0 || ret != n) + return -EIO; nbytes &= AES_BLOCK_SIZE - 1; ret = blkcipher_walk_done(desc, walk, nbytes); @@ -423,29 +443,36 @@ static int cbc_aes_set_key(struct crypto return aes_set_key(tfm, in_key, key_len); } -static int cbc_aes_crypt(struct blkcipher_desc *desc, long func, void *param, +static int cbc_aes_crypt(struct blkcipher_desc *desc, long func, struct blkcipher_walk *walk) { + struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); int ret = blkcipher_walk_virt(desc, walk); unsigned int nbytes = walk->nbytes; + struct { + u8 iv[AES_BLOCK_SIZE]; + u8 key[AES_MAX_KEY_SIZE]; + } param; if (!nbytes) goto out; - memcpy(param, walk->iv, AES_BLOCK_SIZE); + memcpy(param.iv, walk->iv, AES_BLOCK_SIZE); + memcpy(param.key, sctx->key, sctx->key_len); do { /* only use complete blocks */ unsigned int n = nbytes & ~(AES_BLOCK_SIZE - 1); u8 *out = walk->dst.virt.addr; u8 *in = walk->src.virt.addr; - ret = crypt_s390_kmc(func, param, out, in, n); - BUG_ON((ret < 0) || (ret != n)); + ret = crypt_s390_kmc(func, ¶m, out, in, n); + if (ret < 0 || ret != n) + return -EIO; nbytes &= AES_BLOCK_SIZE - 1; ret = blkcipher_walk_done(desc, walk, nbytes); } while ((nbytes = walk->nbytes)); - memcpy(walk->iv, param, AES_BLOCK_SIZE); + memcpy(walk->iv, param.iv, AES_BLOCK_SIZE); out: return ret; @@ -462,7 +489,7 @@ static int cbc_aes_encrypt(struct blkcip return fallback_blk_enc(desc, dst, src, nbytes); blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_aes_crypt(desc, sctx->enc, sctx->iv, &walk); + return cbc_aes_crypt(desc, sctx->enc, &walk); } static int cbc_aes_decrypt(struct blkcipher_desc *desc, @@ -476,7 +503,7 @@ static int cbc_aes_decrypt(struct blkcip return fallback_blk_dec(desc, dst, src, nbytes); blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_aes_crypt(desc, sctx->dec, sctx->iv, &walk); + return cbc_aes_crypt(desc, sctx->dec, &walk); } static struct crypto_alg cbc_aes_alg = { @@ -504,15 +531,377 @@ static struct crypto_alg cbc_aes_alg = { } }; +static int xts_fallback_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int len) +{ + struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); + unsigned int ret; + + xts_ctx->fallback->base.crt_flags &= ~CRYPTO_TFM_REQ_MASK; + xts_ctx->fallback->base.crt_flags |= (tfm->crt_flags & + CRYPTO_TFM_REQ_MASK); + + ret = crypto_blkcipher_setkey(xts_ctx->fallback, key, len); + if (ret) { + tfm->crt_flags &= ~CRYPTO_TFM_RES_MASK; + tfm->crt_flags |= (xts_ctx->fallback->base.crt_flags & + CRYPTO_TFM_RES_MASK); + } + return ret; +} + +static int xts_fallback_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_blkcipher *tfm; + unsigned int ret; + + tfm = desc->tfm; + desc->tfm = xts_ctx->fallback; + + ret = crypto_blkcipher_decrypt_iv(desc, dst, src, nbytes); + + desc->tfm = tfm; + return ret; +} + +static int xts_fallback_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm); + struct crypto_blkcipher *tfm; + unsigned int ret; + + tfm = desc->tfm; + desc->tfm = xts_ctx->fallback; + + ret = crypto_blkcipher_encrypt_iv(desc, dst, src, nbytes); + + desc->tfm = tfm; + return ret; +} + +static int xts_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + + switch (key_len) { + case 32: + xts_ctx->enc = KM_XTS_128_ENCRYPT; + xts_ctx->dec = KM_XTS_128_DECRYPT; + memcpy(xts_ctx->key + 16, in_key, 16); + memcpy(xts_ctx->pcc_key + 16, in_key + 16, 16); + break; + case 48: + xts_ctx->enc = 0; + xts_ctx->dec = 0; + xts_fallback_setkey(tfm, in_key, key_len); + break; + case 64: + xts_ctx->enc = KM_XTS_256_ENCRYPT; + xts_ctx->dec = KM_XTS_256_DECRYPT; + memcpy(xts_ctx->key, in_key, 32); + memcpy(xts_ctx->pcc_key, in_key + 32, 32); + break; + default: + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + xts_ctx->key_len = key_len; + return 0; +} + +static int xts_aes_crypt(struct blkcipher_desc *desc, long func, + struct s390_xts_ctx *xts_ctx, + struct blkcipher_walk *walk) +{ + unsigned int offset = (xts_ctx->key_len >> 1) & 0x10; + int ret = blkcipher_walk_virt(desc, walk); + unsigned int nbytes = walk->nbytes; + unsigned int n; + u8 *in, *out; + struct pcc_param pcc_param; + struct { + u8 key[32]; + u8 init[16]; + } xts_param; + + if (!nbytes) + goto out; + + memset(pcc_param.block, 0, sizeof(pcc_param.block)); + memset(pcc_param.bit, 0, sizeof(pcc_param.bit)); + memset(pcc_param.xts, 0, sizeof(pcc_param.xts)); + memcpy(pcc_param.tweak, walk->iv, sizeof(pcc_param.tweak)); + memcpy(pcc_param.key, xts_ctx->pcc_key, 32); + ret = crypt_s390_pcc(func, &pcc_param.key[offset]); + if (ret < 0) + return -EIO; + + memcpy(xts_param.key, xts_ctx->key, 32); + memcpy(xts_param.init, pcc_param.xts, 16); + do { + /* only use complete blocks */ + n = nbytes & ~(AES_BLOCK_SIZE - 1); + out = walk->dst.virt.addr; + in = walk->src.virt.addr; + + ret = crypt_s390_km(func, &xts_param.key[offset], out, in, n); + if (ret < 0 || ret != n) + return -EIO; + + nbytes &= AES_BLOCK_SIZE - 1; + ret = blkcipher_walk_done(desc, walk, nbytes); + } while ((nbytes = walk->nbytes)); +out: + return ret; +} + +static int xts_aes_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + + if (unlikely(xts_ctx->key_len == 48)) + return xts_fallback_encrypt(desc, dst, src, nbytes); + + blkcipher_walk_init(&walk, dst, src, nbytes); + return xts_aes_crypt(desc, xts_ctx->enc, xts_ctx, &walk); +} + +static int xts_aes_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct s390_xts_ctx *xts_ctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + + if (unlikely(xts_ctx->key_len == 48)) + return xts_fallback_decrypt(desc, dst, src, nbytes); + + blkcipher_walk_init(&walk, dst, src, nbytes); + return xts_aes_crypt(desc, xts_ctx->dec, xts_ctx, &walk); +} + +static int xts_fallback_init(struct crypto_tfm *tfm) +{ + const char *name = tfm->__crt_alg->cra_name; + struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); + + xts_ctx->fallback = crypto_alloc_blkcipher(name, 0, + CRYPTO_ALG_ASYNC | CRYPTO_ALG_NEED_FALLBACK); + + if (IS_ERR(xts_ctx->fallback)) { + pr_err("Allocating XTS fallback algorithm %s failed\n", + name); + return PTR_ERR(xts_ctx->fallback); + } + return 0; +} + +static void xts_fallback_exit(struct crypto_tfm *tfm) +{ + struct s390_xts_ctx *xts_ctx = crypto_tfm_ctx(tfm); + + crypto_free_blkcipher(xts_ctx->fallback); + xts_ctx->fallback = NULL; +} + +static struct crypto_alg xts_aes_alg = { + .cra_name = "xts(aes)", + .cra_driver_name = "xts-aes-s390", + .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER | + CRYPTO_ALG_NEED_FALLBACK, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct s390_xts_ctx), + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(xts_aes_alg.cra_list), + .cra_init = xts_fallback_init, + .cra_exit = xts_fallback_exit, + .cra_u = { + .blkcipher = { + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = xts_aes_set_key, + .encrypt = xts_aes_encrypt, + .decrypt = xts_aes_decrypt, + } + } +}; + +static int xts_aes_alg_reg; + +static int ctr_aes_set_key(struct crypto_tfm *tfm, const u8 *in_key, + unsigned int key_len) +{ + struct s390_aes_ctx *sctx = crypto_tfm_ctx(tfm); + + switch (key_len) { + case 16: + sctx->enc = KMCTR_AES_128_ENCRYPT; + sctx->dec = KMCTR_AES_128_DECRYPT; + break; + case 24: + sctx->enc = KMCTR_AES_192_ENCRYPT; + sctx->dec = KMCTR_AES_192_DECRYPT; + break; + case 32: + sctx->enc = KMCTR_AES_256_ENCRYPT; + sctx->dec = KMCTR_AES_256_DECRYPT; + break; + } + + return aes_set_key(tfm, in_key, key_len); +} + +static unsigned int __ctrblk_init(u8 *ctrptr, unsigned int nbytes) +{ + unsigned int i, n; + + /* only use complete blocks, max. PAGE_SIZE */ + n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(AES_BLOCK_SIZE - 1); + for (i = AES_BLOCK_SIZE; i < n; i += AES_BLOCK_SIZE) { + memcpy(ctrptr + i, ctrptr + i - AES_BLOCK_SIZE, + AES_BLOCK_SIZE); + crypto_inc(ctrptr + i, AES_BLOCK_SIZE); + } + return n; +} + +static int ctr_aes_crypt(struct blkcipher_desc *desc, long func, + struct s390_aes_ctx *sctx, struct blkcipher_walk *walk) +{ + int ret = blkcipher_walk_virt_block(desc, walk, AES_BLOCK_SIZE); + unsigned int n, nbytes; + u8 buf[AES_BLOCK_SIZE], ctrbuf[AES_BLOCK_SIZE]; + u8 *out, *in, *ctrptr = ctrbuf; + + if (!walk->nbytes) + return ret; + + if (spin_trylock(&ctrblk_lock)) + ctrptr = ctrblk; + + memcpy(ctrptr, walk->iv, AES_BLOCK_SIZE); + while ((nbytes = walk->nbytes) >= AES_BLOCK_SIZE) { + out = walk->dst.virt.addr; + in = walk->src.virt.addr; + while (nbytes >= AES_BLOCK_SIZE) { + if (ctrptr == ctrblk) + n = __ctrblk_init(ctrptr, nbytes); + else + n = AES_BLOCK_SIZE; + ret = crypt_s390_kmctr(func, sctx->key, out, in, + n, ctrptr); + if (ret < 0 || ret != n) { + if (ctrptr == ctrblk) + spin_unlock(&ctrblk_lock); + return -EIO; + } + if (n > AES_BLOCK_SIZE) + memcpy(ctrptr, ctrptr + n - AES_BLOCK_SIZE, + AES_BLOCK_SIZE); + crypto_inc(ctrptr, AES_BLOCK_SIZE); + out += n; + in += n; + nbytes -= n; + } + ret = blkcipher_walk_done(desc, walk, nbytes); + } + if (ctrptr == ctrblk) { + if (nbytes) + memcpy(ctrbuf, ctrptr, AES_BLOCK_SIZE); + else + memcpy(walk->iv, ctrptr, AES_BLOCK_SIZE); + spin_unlock(&ctrblk_lock); + } else { + if (!nbytes) + memcpy(walk->iv, ctrptr, AES_BLOCK_SIZE); + } + /* + * final block may be < AES_BLOCK_SIZE, copy only nbytes + */ + if (nbytes) { + out = walk->dst.virt.addr; + in = walk->src.virt.addr; + ret = crypt_s390_kmctr(func, sctx->key, buf, in, + AES_BLOCK_SIZE, ctrbuf); + if (ret < 0 || ret != AES_BLOCK_SIZE) + return -EIO; + memcpy(out, buf, nbytes); + crypto_inc(ctrbuf, AES_BLOCK_SIZE); + ret = blkcipher_walk_done(desc, walk, 0); + memcpy(walk->iv, ctrbuf, AES_BLOCK_SIZE); + } + + return ret; +} + +static int ctr_aes_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ctr_aes_crypt(desc, sctx->enc, sctx, &walk); +} + +static int ctr_aes_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) +{ + struct s390_aes_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ctr_aes_crypt(desc, sctx->dec, sctx, &walk); +} + +static struct crypto_alg ctr_aes_alg = { + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-s390", + .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct s390_aes_ctx), + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ctr_aes_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ctr_aes_set_key, + .encrypt = ctr_aes_encrypt, + .decrypt = ctr_aes_decrypt, + } + } +}; + +static int ctr_aes_alg_reg; + static int __init aes_s390_init(void) { int ret; - if (crypt_s390_func_available(KM_AES_128_ENCRYPT)) + if (crypt_s390_func_available(KM_AES_128_ENCRYPT, CRYPT_S390_MSA)) keylen_flag |= AES_KEYLEN_128; - if (crypt_s390_func_available(KM_AES_192_ENCRYPT)) + if (crypt_s390_func_available(KM_AES_192_ENCRYPT, CRYPT_S390_MSA)) keylen_flag |= AES_KEYLEN_192; - if (crypt_s390_func_available(KM_AES_256_ENCRYPT)) + if (crypt_s390_func_available(KM_AES_256_ENCRYPT, CRYPT_S390_MSA)) keylen_flag |= AES_KEYLEN_256; if (!keylen_flag) @@ -535,9 +924,42 @@ static int __init aes_s390_init(void) if (ret) goto cbc_aes_err; + if (crypt_s390_func_available(KM_XTS_128_ENCRYPT, + CRYPT_S390_MSA | CRYPT_S390_MSA4) && + crypt_s390_func_available(KM_XTS_256_ENCRYPT, + CRYPT_S390_MSA | CRYPT_S390_MSA4)) { + ret = crypto_register_alg(&xts_aes_alg); + if (ret) + goto xts_aes_err; + xts_aes_alg_reg = 1; + } + + if (crypt_s390_func_available(KMCTR_AES_128_ENCRYPT, + CRYPT_S390_MSA | CRYPT_S390_MSA4) && + crypt_s390_func_available(KMCTR_AES_192_ENCRYPT, + CRYPT_S390_MSA | CRYPT_S390_MSA4) && + crypt_s390_func_available(KMCTR_AES_256_ENCRYPT, + CRYPT_S390_MSA | CRYPT_S390_MSA4)) { + ctrblk = (u8 *) __get_free_page(GFP_KERNEL); + if (!ctrblk) { + ret = -ENOMEM; + goto ctr_aes_err; + } + ret = crypto_register_alg(&ctr_aes_alg); + if (ret) { + free_page((unsigned long) ctrblk); + goto ctr_aes_err; + } + ctr_aes_alg_reg = 1; + } + out: return ret; +ctr_aes_err: + crypto_unregister_alg(&xts_aes_alg); +xts_aes_err: + crypto_unregister_alg(&cbc_aes_alg); cbc_aes_err: crypto_unregister_alg(&ecb_aes_alg); ecb_aes_err: @@ -548,6 +970,12 @@ aes_err: static void __exit aes_s390_fini(void) { + if (ctr_aes_alg_reg) { + crypto_unregister_alg(&ctr_aes_alg); + free_page((unsigned long) ctrblk); + } + if (xts_aes_alg_reg) + crypto_unregister_alg(&xts_aes_alg); crypto_unregister_alg(&cbc_aes_alg); crypto_unregister_alg(&ecb_aes_alg); crypto_unregister_alg(&aes_alg); diff -uprN linux-2.6.32/arch/s390/crypto/crypt_s390.h linux-2.6.32.ovz/arch/s390/crypto/crypt_s390.h --- linux-2.6.32/arch/s390/crypto/crypt_s390.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/crypt_s390.h 2015-03-10 13:22:27.000000000 -0700 @@ -24,13 +24,18 @@ #define CRYPT_S390_PRIORITY 300 #define CRYPT_S390_COMPOSITE_PRIORITY 400 +#define CRYPT_S390_MSA 0x1 +#define CRYPT_S390_MSA3 0x2 +#define CRYPT_S390_MSA4 0x4 + /* s390 cryptographic operations */ enum crypt_s390_operations { CRYPT_S390_KM = 0x0100, CRYPT_S390_KMC = 0x0200, CRYPT_S390_KIMD = 0x0300, CRYPT_S390_KLMD = 0x0400, - CRYPT_S390_KMAC = 0x0500 + CRYPT_S390_KMAC = 0x0500, + CRYPT_S390_KMCTR = 0x0600 }; /* @@ -51,6 +56,10 @@ enum crypt_s390_km_func { KM_AES_192_DECRYPT = CRYPT_S390_KM | 0x13 | 0x80, KM_AES_256_ENCRYPT = CRYPT_S390_KM | 0x14, KM_AES_256_DECRYPT = CRYPT_S390_KM | 0x14 | 0x80, + KM_XTS_128_ENCRYPT = CRYPT_S390_KM | 0x32, + KM_XTS_128_DECRYPT = CRYPT_S390_KM | 0x32 | 0x80, + KM_XTS_256_ENCRYPT = CRYPT_S390_KM | 0x34, + KM_XTS_256_DECRYPT = CRYPT_S390_KM | 0x34 | 0x80, }; /* @@ -75,6 +84,26 @@ enum crypt_s390_kmc_func { }; /* + * function codes for KMCTR (CIPHER MESSAGE WITH COUNTER) + * instruction + */ +enum crypt_s390_kmctr_func { + KMCTR_QUERY = CRYPT_S390_KMCTR | 0x0, + KMCTR_DEA_ENCRYPT = CRYPT_S390_KMCTR | 0x1, + KMCTR_DEA_DECRYPT = CRYPT_S390_KMCTR | 0x1 | 0x80, + KMCTR_TDEA_128_ENCRYPT = CRYPT_S390_KMCTR | 0x2, + KMCTR_TDEA_128_DECRYPT = CRYPT_S390_KMCTR | 0x2 | 0x80, + KMCTR_TDEA_192_ENCRYPT = CRYPT_S390_KMCTR | 0x3, + KMCTR_TDEA_192_DECRYPT = CRYPT_S390_KMCTR | 0x3 | 0x80, + KMCTR_AES_128_ENCRYPT = CRYPT_S390_KMCTR | 0x12, + KMCTR_AES_128_DECRYPT = CRYPT_S390_KMCTR | 0x12 | 0x80, + KMCTR_AES_192_ENCRYPT = CRYPT_S390_KMCTR | 0x13, + KMCTR_AES_192_DECRYPT = CRYPT_S390_KMCTR | 0x13 | 0x80, + KMCTR_AES_256_ENCRYPT = CRYPT_S390_KMCTR | 0x14, + KMCTR_AES_256_DECRYPT = CRYPT_S390_KMCTR | 0x14 | 0x80, +}; + +/* * function codes for KIMD (COMPUTE INTERMEDIATE MESSAGE DIGEST) * instruction */ @@ -83,6 +112,7 @@ enum crypt_s390_kimd_func { KIMD_SHA_1 = CRYPT_S390_KIMD | 1, KIMD_SHA_256 = CRYPT_S390_KIMD | 2, KIMD_SHA_512 = CRYPT_S390_KIMD | 3, + KIMD_GHASH = CRYPT_S390_KIMD | 65, }; /* @@ -284,6 +314,45 @@ static inline int crypt_s390_kmac(long f } /** + * crypt_s390_kmctr: + * @func: the function code passed to KMCTR; see crypt_s390_kmctr_func + * @param: address of parameter block; see POP for details on each func + * @dest: address of destination memory area + * @src: address of source memory area + * @src_len: length of src operand in bytes + * @counter: address of counter value + * + * Executes the KMCTR (CIPHER MESSAGE WITH COUNTER) operation of the CPU. + * + * Returns -1 for failure, 0 for the query func, number of processed + * bytes for encryption/decryption funcs + */ +static inline int crypt_s390_kmctr(long func, void *param, u8 *dest, + const u8 *src, long src_len, u8 *counter) +{ + register long __func asm("0") = func & CRYPT_S390_FUNC_MASK; + register void *__param asm("1") = param; + register const u8 *__src asm("2") = src; + register long __src_len asm("3") = src_len; + register u8 *__dest asm("4") = dest; + register u8 *__ctr asm("6") = counter; + int ret = -1; + + asm volatile( + "0: .insn rrf,0xb92d0000,%3,%1,%4,0 \n" /* KMCTR opcode */ + "1: brc 1,0b \n" /* handle partial completion */ + " la %0,0\n" + "2:\n" + EX_TABLE(0b,2b) EX_TABLE(1b,2b) + : "+d" (ret), "+a" (__src), "+d" (__src_len), "+a" (__dest), + "+a" (__ctr) + : "d" (__func), "a" (__param) : "cc", "memory"); + if (ret < 0) + return ret; + return (func & CRYPT_S390_FUNC_MASK) ? src_len - __src_len : __src_len; +} + +/** * crypt_s390_func_available: * @func: the function code of the specific function; 0 if op in general * @@ -291,13 +360,27 @@ static inline int crypt_s390_kmac(long f * * Returns 1 if func available; 0 if func or op in general not available */ -static inline int crypt_s390_func_available(int func) +static inline int crypt_s390_func_available(int func, + unsigned int facility_mask) { + unsigned long long facility_bits[2]; unsigned char status[16]; int ret; - /* check if CPACF facility (bit 17) is available */ - if (!(stfl() & 1ULL << (31 - 17))) + if (facility_mask & CRYPT_S390_MSA && + !(stfl() & 1ULL << (31 - 17))) + return 0; + + if (facility_mask & CRYPT_S390_MSA3 || + facility_mask & CRYPT_S390_MSA4) + if (stfle(facility_bits, 2) <= 0) + return 0; + + if (facility_mask & CRYPT_S390_MSA3 && + !(facility_bits[1] & (1ULL << (63 - (76 - 64))))) + return 0; + if (facility_mask & CRYPT_S390_MSA4 && + !(facility_bits[1] & (1ULL << (63 - (77 - 64))))) return 0; switch (func & CRYPT_S390_OP_MASK) { @@ -316,6 +399,10 @@ static inline int crypt_s390_func_availa case CRYPT_S390_KMAC: ret = crypt_s390_kmac(KMAC_QUERY, &status, NULL, 0); break; + case CRYPT_S390_KMCTR: + ret = crypt_s390_kmctr(KMCTR_QUERY, &status, NULL, NULL, 0, + NULL); + break; default: return 0; } @@ -326,4 +413,31 @@ static inline int crypt_s390_func_availa return (status[func >> 3] & (0x80 >> (func & 7))) != 0; } +/** + * crypt_s390_pcc: + * @func: the function code passed to KM; see crypt_s390_km_func + * @param: address of parameter block; see POP for details on each func + * + * Executes the PCC (PERFORM CRYPTOGRAPHIC COMPUTATION) operation of the CPU. + * + * Returns -1 for failure, 0 for success. + */ +static inline int crypt_s390_pcc(long func, void *param) +{ + register long __func asm("0") = func & 0x7f; /* encrypt or decrypt */ + register void *__param asm("1") = param; + int ret = -1; + + asm volatile( + "0: .insn rre,0xb92c0000,0,0 \n" /* PCC opcode */ + "1: brc 1,0b \n" /* handle partial completion */ + " la %0,0\n" + "2:\n" + EX_TABLE(0b,2b) EX_TABLE(1b,2b) + : "+d" (ret) + : "d" (__func), "a" (__param) : "cc", "memory"); + return ret; +} + + #endif /* _CRYPTO_ARCH_S390_CRYPT_S390_H */ diff -uprN linux-2.6.32/arch/s390/crypto/des_check_key.c linux-2.6.32.ovz/arch/s390/crypto/des_check_key.c --- linux-2.6.32/arch/s390/crypto/des_check_key.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/des_check_key.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,132 +0,0 @@ -/* - * Cryptographic API. - * - * Function for checking keys for the DES and Tripple DES Encryption - * algorithms. - * - * Originally released as descore by Dana L. How . - * Modified by Raimar Falke for the Linux-Kernel. - * Derived from Cryptoapi and Nettle implementations, adapted for in-place - * scatterlist interface. Changed LGPL to GPL per section 3 of the LGPL. - * - * s390 Version: - * Copyright IBM Corp. 2003 - * Author(s): Thomas Spatzier - * Jan Glauber (jan.glauber@de.ibm.com) - * - * Derived from "crypto/des.c" - * Copyright (c) 1992 Dana L. How. - * Copyright (c) Raimar Falke - * Copyright (c) Gisle Sflensminde - * Copyright (C) 2001 Niels Mvller. - * Copyright (c) 2002 James Morris - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - */ -#include -#include -#include -#include -#include "crypto_des.h" - -#define ROR(d,c,o) ((d) = (d) >> (c) | (d) << (o)) - -static const u8 parity[] = { - 8,1,0,8,0,8,8,0,0,8,8,0,8,0,2,8,0,8,8,0,8,0,0,8,8,0,0,8,0,8,8,3, - 0,8,8,0,8,0,0,8,8,0,0,8,0,8,8,0,8,0,0,8,0,8,8,0,0,8,8,0,8,0,0,8, - 0,8,8,0,8,0,0,8,8,0,0,8,0,8,8,0,8,0,0,8,0,8,8,0,0,8,8,0,8,0,0,8, - 8,0,0,8,0,8,8,0,0,8,8,0,8,0,0,8,0,8,8,0,8,0,0,8,8,0,0,8,0,8,8,0, - 0,8,8,0,8,0,0,8,8,0,0,8,0,8,8,0,8,0,0,8,0,8,8,0,0,8,8,0,8,0,0,8, - 8,0,0,8,0,8,8,0,0,8,8,0,8,0,0,8,0,8,8,0,8,0,0,8,8,0,0,8,0,8,8,0, - 8,0,0,8,0,8,8,0,0,8,8,0,8,0,0,8,0,8,8,0,8,0,0,8,8,0,0,8,0,8,8,0, - 4,8,8,0,8,0,0,8,8,0,0,8,0,8,8,0,8,5,0,8,0,8,8,0,0,8,8,0,8,0,6,8, -}; - -/* - * RFC2451: Weak key checks SHOULD be performed. - */ -int -crypto_des_check_key(const u8 *key, unsigned int keylen, u32 *flags) -{ - u32 n, w; - - n = parity[key[0]]; n <<= 4; - n |= parity[key[1]]; n <<= 4; - n |= parity[key[2]]; n <<= 4; - n |= parity[key[3]]; n <<= 4; - n |= parity[key[4]]; n <<= 4; - n |= parity[key[5]]; n <<= 4; - n |= parity[key[6]]; n <<= 4; - n |= parity[key[7]]; - w = 0x88888888L; - - if ((*flags & CRYPTO_TFM_REQ_WEAK_KEY) - && !((n - (w >> 3)) & w)) { /* 1 in 10^10 keys passes this test */ - if (n < 0x41415151) { - if (n < 0x31312121) { - if (n < 0x14141515) { - /* 01 01 01 01 01 01 01 01 */ - if (n == 0x11111111) goto weak; - /* 01 1F 01 1F 01 0E 01 0E */ - if (n == 0x13131212) goto weak; - } else { - /* 01 E0 01 E0 01 F1 01 F1 */ - if (n == 0x14141515) goto weak; - /* 01 FE 01 FE 01 FE 01 FE */ - if (n == 0x16161616) goto weak; - } - } else { - if (n < 0x34342525) { - /* 1F 01 1F 01 0E 01 0E 01 */ - if (n == 0x31312121) goto weak; - /* 1F 1F 1F 1F 0E 0E 0E 0E (?) */ - if (n == 0x33332222) goto weak; - } else { - /* 1F E0 1F E0 0E F1 0E F1 */ - if (n == 0x34342525) goto weak; - /* 1F FE 1F FE 0E FE 0E FE */ - if (n == 0x36362626) goto weak; - } - } - } else { - if (n < 0x61616161) { - if (n < 0x44445555) { - /* E0 01 E0 01 F1 01 F1 01 */ - if (n == 0x41415151) goto weak; - /* E0 1F E0 1F F1 0E F1 0E */ - if (n == 0x43435252) goto weak; - } else { - /* E0 E0 E0 E0 F1 F1 F1 F1 (?) */ - if (n == 0x44445555) goto weak; - /* E0 FE E0 FE F1 FE F1 FE */ - if (n == 0x46465656) goto weak; - } - } else { - if (n < 0x64646565) { - /* FE 01 FE 01 FE 01 FE 01 */ - if (n == 0x61616161) goto weak; - /* FE 1F FE 1F FE 0E FE 0E */ - if (n == 0x63636262) goto weak; - } else { - /* FE E0 FE E0 FE F1 FE F1 */ - if (n == 0x64646565) goto weak; - /* FE FE FE FE FE FE FE FE */ - if (n == 0x66666666) goto weak; - } - } - } - } - return 0; -weak: - *flags |= CRYPTO_TFM_RES_WEAK_KEY; - return -EINVAL; -} - -EXPORT_SYMBOL(crypto_des_check_key); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Key Check function for DES & DES3 Cipher Algorithms"); diff -uprN linux-2.6.32/arch/s390/crypto/des_s390.c linux-2.6.32.ovz/arch/s390/crypto/des_s390.c --- linux-2.6.32/arch/s390/crypto/des_s390.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/des_s390.c 2015-03-10 13:24:13.000000000 -0700 @@ -3,7 +3,7 @@ * * s390 implementation of the DES Cipher Algorithm. * - * Copyright IBM Corp. 2003,2007 + * Copyright IBM Corp. 2003,2011 * Author(s): Thomas Spatzier * Jan Glauber (jan.glauber@de.ibm.com) * @@ -14,63 +14,53 @@ * */ -#include #include #include +#include +#include +#include #include "crypt_s390.h" -#include "crypto_des.h" -#define DES_BLOCK_SIZE 8 -#define DES_KEY_SIZE 8 +#define DES3_KEY_SIZE (3 * DES_KEY_SIZE) -#define DES3_128_KEY_SIZE (2 * DES_KEY_SIZE) -#define DES3_128_BLOCK_SIZE DES_BLOCK_SIZE +static u8 *ctrblk; +static DEFINE_SPINLOCK(ctrblk_lock); -#define DES3_192_KEY_SIZE (3 * DES_KEY_SIZE) -#define DES3_192_BLOCK_SIZE DES_BLOCK_SIZE - -struct crypt_s390_des_ctx { +struct s390_des_ctx { u8 iv[DES_BLOCK_SIZE]; - u8 key[DES_KEY_SIZE]; -}; - -struct crypt_s390_des3_128_ctx { - u8 iv[DES_BLOCK_SIZE]; - u8 key[DES3_128_KEY_SIZE]; -}; - -struct crypt_s390_des3_192_ctx { - u8 iv[DES_BLOCK_SIZE]; - u8 key[DES3_192_KEY_SIZE]; + u8 key[DES3_KEY_SIZE]; }; static int des_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) + unsigned int key_len) { - struct crypt_s390_des_ctx *dctx = crypto_tfm_ctx(tfm); + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); u32 *flags = &tfm->crt_flags; - int ret; + u32 tmp[DES_EXPKEY_WORDS]; - /* test if key is valid (not a weak key) */ - ret = crypto_des_check_key(key, keylen, flags); - if (ret == 0) - memcpy(dctx->key, key, keylen); - return ret; + /* check for weak keys */ + if (!des_ekey(tmp, key) && (*flags & CRYPTO_TFM_REQ_WEAK_KEY)) { + *flags |= CRYPTO_TFM_RES_WEAK_KEY; + return -EINVAL; + } + + memcpy(ctx->key, key, key_len); + return 0; } static void des_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { - struct crypt_s390_des_ctx *dctx = crypto_tfm_ctx(tfm); + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - crypt_s390_km(KM_DEA_ENCRYPT, dctx->key, out, in, DES_BLOCK_SIZE); + crypt_s390_km(KM_DEA_ENCRYPT, ctx->key, out, in, DES_BLOCK_SIZE); } static void des_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { - struct crypt_s390_des_ctx *dctx = crypto_tfm_ctx(tfm); + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - crypt_s390_km(KM_DEA_DECRYPT, dctx->key, out, in, DES_BLOCK_SIZE); + crypt_s390_km(KM_DEA_DECRYPT, ctx->key, out, in, DES_BLOCK_SIZE); } static struct crypto_alg des_alg = { @@ -79,7 +69,7 @@ static struct crypto_alg des_alg = { .cra_priority = CRYPT_S390_PRIORITY, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, .cra_blocksize = DES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des_ctx), + .cra_ctxsize = sizeof(struct s390_des_ctx), .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(des_alg.cra_list), .cra_u = { @@ -94,7 +84,7 @@ static struct crypto_alg des_alg = { }; static int ecb_desall_crypt(struct blkcipher_desc *desc, long func, - void *param, struct blkcipher_walk *walk) + u8 *key, struct blkcipher_walk *walk) { int ret = blkcipher_walk_virt(desc, walk); unsigned int nbytes; @@ -105,8 +95,9 @@ static int ecb_desall_crypt(struct blkci u8 *out = walk->dst.virt.addr; u8 *in = walk->src.virt.addr; - ret = crypt_s390_km(func, param, out, in, n); - BUG_ON((ret < 0) || (ret != n)); + ret = crypt_s390_km(func, key, out, in, n); + if (ret < 0 || ret != n) + return -EIO; nbytes &= DES_BLOCK_SIZE - 1; ret = blkcipher_walk_done(desc, walk, nbytes); @@ -116,28 +107,35 @@ static int ecb_desall_crypt(struct blkci } static int cbc_desall_crypt(struct blkcipher_desc *desc, long func, - void *param, struct blkcipher_walk *walk) + struct blkcipher_walk *walk) { + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); int ret = blkcipher_walk_virt(desc, walk); unsigned int nbytes = walk->nbytes; + struct { + u8 iv[DES_BLOCK_SIZE]; + u8 key[DES3_KEY_SIZE]; + } param; if (!nbytes) goto out; - memcpy(param, walk->iv, DES_BLOCK_SIZE); + memcpy(param.iv, walk->iv, DES_BLOCK_SIZE); + memcpy(param.key, ctx->key, DES3_KEY_SIZE); do { /* only use complete blocks */ unsigned int n = nbytes & ~(DES_BLOCK_SIZE - 1); u8 *out = walk->dst.virt.addr; u8 *in = walk->src.virt.addr; - ret = crypt_s390_kmc(func, param, out, in, n); - BUG_ON((ret < 0) || (ret != n)); + ret = crypt_s390_kmc(func, ¶m, out, in, n); + if (ret < 0 || ret != n) + return -EIO; nbytes &= DES_BLOCK_SIZE - 1; ret = blkcipher_walk_done(desc, walk, nbytes); } while ((nbytes = walk->nbytes)); - memcpy(walk->iv, param, DES_BLOCK_SIZE); + memcpy(walk->iv, param.iv, DES_BLOCK_SIZE); out: return ret; @@ -147,22 +145,22 @@ static int ecb_des_encrypt(struct blkcip struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct crypt_s390_des_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, KM_DEA_ENCRYPT, sctx->key, &walk); + return ecb_desall_crypt(desc, KM_DEA_ENCRYPT, ctx->key, &walk); } static int ecb_des_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct crypt_s390_des_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, KM_DEA_DECRYPT, sctx->key, &walk); + return ecb_desall_crypt(desc, KM_DEA_DECRYPT, ctx->key, &walk); } static struct crypto_alg ecb_des_alg = { @@ -171,7 +169,7 @@ static struct crypto_alg ecb_des_alg = { .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, .cra_blocksize = DES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des_ctx), + .cra_ctxsize = sizeof(struct s390_des_ctx), .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(ecb_des_alg.cra_list), @@ -190,22 +188,20 @@ static int cbc_des_encrypt(struct blkcip struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct crypt_s390_des_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, KMC_DEA_ENCRYPT, sctx->iv, &walk); + return cbc_desall_crypt(desc, KMC_DEA_ENCRYPT, &walk); } static int cbc_des_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) { - struct crypt_s390_des_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, KMC_DEA_DECRYPT, sctx->iv, &walk); + return cbc_desall_crypt(desc, KMC_DEA_DECRYPT, &walk); } static struct crypto_alg cbc_des_alg = { @@ -214,7 +210,7 @@ static struct crypto_alg cbc_des_alg = { .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, .cra_blocksize = DES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des_ctx), + .cra_ctxsize = sizeof(struct s390_des_ctx), .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT(cbc_des_alg.cra_list), @@ -237,327 +233,324 @@ static struct crypto_alg cbc_des_alg = { * complementation keys. Any weakness is obviated by the use of * multiple keys. * - * However, if the two independent 64-bit keys are equal, - * then the DES3 operation is simply the same as DES. - * Implementers MUST reject keys that exhibit this property. + * However, if the first two or last two independent 64-bit keys are + * equal (k1 == k2 or k2 == k3), then the DES3 operation is simply the + * same as DES. Implementers MUST reject keys that exhibit this + * property. * */ -static int des3_128_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) +static int des3_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int key_len) { - int i, ret; - struct crypt_s390_des3_128_ctx *dctx = crypto_tfm_ctx(tfm); - const u8 *temp_key = key; + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); u32 *flags = &tfm->crt_flags; - if (!(memcmp(key, &key[DES_KEY_SIZE], DES_KEY_SIZE)) && + if (!(memcmp(key, &key[DES_KEY_SIZE], DES_KEY_SIZE) && + memcmp(&key[DES_KEY_SIZE], &key[DES_KEY_SIZE * 2], + DES_KEY_SIZE)) && (*flags & CRYPTO_TFM_REQ_WEAK_KEY)) { *flags |= CRYPTO_TFM_RES_WEAK_KEY; return -EINVAL; } - for (i = 0; i < 2; i++, temp_key += DES_KEY_SIZE) { - ret = crypto_des_check_key(temp_key, DES_KEY_SIZE, flags); - if (ret < 0) - return ret; - } - memcpy(dctx->key, key, keylen); + memcpy(ctx->key, key, key_len); return 0; } -static void des3_128_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +static void des3_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { - struct crypt_s390_des3_128_ctx *dctx = crypto_tfm_ctx(tfm); + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - crypt_s390_km(KM_TDEA_128_ENCRYPT, dctx->key, dst, (void*)src, - DES3_128_BLOCK_SIZE); + crypt_s390_km(KM_TDEA_192_ENCRYPT, ctx->key, dst, src, DES_BLOCK_SIZE); } -static void des3_128_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +static void des3_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { - struct crypt_s390_des3_128_ctx *dctx = crypto_tfm_ctx(tfm); + struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); - crypt_s390_km(KM_TDEA_128_DECRYPT, dctx->key, dst, (void*)src, - DES3_128_BLOCK_SIZE); + crypt_s390_km(KM_TDEA_192_DECRYPT, ctx->key, dst, src, DES_BLOCK_SIZE); } -static struct crypto_alg des3_128_alg = { - .cra_name = "des3_ede128", - .cra_driver_name = "des3_ede128-s390", +static struct crypto_alg des3_alg = { + .cra_name = "des3_ede", + .cra_driver_name = "des3_ede-s390", .cra_priority = CRYPT_S390_PRIORITY, .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = DES3_128_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des3_128_ctx), + .cra_blocksize = DES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct s390_des_ctx), .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(des3_128_alg.cra_list), + .cra_list = LIST_HEAD_INIT(des3_alg.cra_list), .cra_u = { .cipher = { - .cia_min_keysize = DES3_128_KEY_SIZE, - .cia_max_keysize = DES3_128_KEY_SIZE, - .cia_setkey = des3_128_setkey, - .cia_encrypt = des3_128_encrypt, - .cia_decrypt = des3_128_decrypt, + .cia_min_keysize = DES3_KEY_SIZE, + .cia_max_keysize = DES3_KEY_SIZE, + .cia_setkey = des3_setkey, + .cia_encrypt = des3_encrypt, + .cia_decrypt = des3_decrypt, } } }; -static int ecb_des3_128_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_des3_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) { - struct crypt_s390_des3_128_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, KM_TDEA_128_ENCRYPT, sctx->key, &walk); + return ecb_desall_crypt(desc, KM_TDEA_192_ENCRYPT, ctx->key, &walk); } -static int ecb_des3_128_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ecb_des3_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) { - struct crypt_s390_des3_128_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, KM_TDEA_128_DECRYPT, sctx->key, &walk); + return ecb_desall_crypt(desc, KM_TDEA_192_DECRYPT, ctx->key, &walk); } -static struct crypto_alg ecb_des3_128_alg = { - .cra_name = "ecb(des3_ede128)", - .cra_driver_name = "ecb-des3_ede128-s390", +static struct crypto_alg ecb_des3_alg = { + .cra_name = "ecb(des3_ede)", + .cra_driver_name = "ecb-des3_ede-s390", .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES3_128_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des3_128_ctx), + .cra_blocksize = DES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct s390_des_ctx), .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT( - ecb_des3_128_alg.cra_list), + ecb_des3_alg.cra_list), .cra_u = { .blkcipher = { - .min_keysize = DES3_128_KEY_SIZE, - .max_keysize = DES3_128_KEY_SIZE, - .setkey = des3_128_setkey, - .encrypt = ecb_des3_128_encrypt, - .decrypt = ecb_des3_128_decrypt, + .min_keysize = DES3_KEY_SIZE, + .max_keysize = DES3_KEY_SIZE, + .setkey = des3_setkey, + .encrypt = ecb_des3_encrypt, + .decrypt = ecb_des3_decrypt, } } }; -static int cbc_des3_128_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_des3_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) { - struct crypt_s390_des3_128_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, KMC_TDEA_128_ENCRYPT, sctx->iv, &walk); + return cbc_desall_crypt(desc, KMC_TDEA_192_ENCRYPT, &walk); } -static int cbc_des3_128_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int cbc_des3_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) { - struct crypt_s390_des3_128_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, KMC_TDEA_128_DECRYPT, sctx->iv, &walk); + return cbc_desall_crypt(desc, KMC_TDEA_192_DECRYPT, &walk); } -static struct crypto_alg cbc_des3_128_alg = { - .cra_name = "cbc(des3_ede128)", - .cra_driver_name = "cbc-des3_ede128-s390", +static struct crypto_alg cbc_des3_alg = { + .cra_name = "cbc(des3_ede)", + .cra_driver_name = "cbc-des3_ede-s390", .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES3_128_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des3_128_ctx), + .cra_blocksize = DES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct s390_des_ctx), .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, .cra_list = LIST_HEAD_INIT( - cbc_des3_128_alg.cra_list), + cbc_des3_alg.cra_list), .cra_u = { .blkcipher = { - .min_keysize = DES3_128_KEY_SIZE, - .max_keysize = DES3_128_KEY_SIZE, - .ivsize = DES3_128_BLOCK_SIZE, - .setkey = des3_128_setkey, - .encrypt = cbc_des3_128_encrypt, - .decrypt = cbc_des3_128_decrypt, + .min_keysize = DES3_KEY_SIZE, + .max_keysize = DES3_KEY_SIZE, + .ivsize = DES_BLOCK_SIZE, + .setkey = des3_setkey, + .encrypt = cbc_des3_encrypt, + .decrypt = cbc_des3_decrypt, } } }; -/* - * RFC2451: - * - * For DES-EDE3, there is no known need to reject weak or - * complementation keys. Any weakness is obviated by the use of - * multiple keys. - * - * However, if the first two or last two independent 64-bit keys are - * equal (k1 == k2 or k2 == k3), then the DES3 operation is simply the - * same as DES. Implementers MUST reject keys that exhibit this - * property. - * - */ -static int des3_192_setkey(struct crypto_tfm *tfm, const u8 *key, - unsigned int keylen) +static unsigned int __ctrblk_init(u8 *ctrptr, unsigned int nbytes) { - int i, ret; - struct crypt_s390_des3_192_ctx *dctx = crypto_tfm_ctx(tfm); - const u8 *temp_key = key; - u32 *flags = &tfm->crt_flags; + unsigned int i, n; - if (!(memcmp(key, &key[DES_KEY_SIZE], DES_KEY_SIZE) && - memcmp(&key[DES_KEY_SIZE], &key[DES_KEY_SIZE * 2], - DES_KEY_SIZE)) && - (*flags & CRYPTO_TFM_REQ_WEAK_KEY)) { - *flags |= CRYPTO_TFM_RES_WEAK_KEY; - return -EINVAL; + /* align to block size, max. PAGE_SIZE */ + n = (nbytes > PAGE_SIZE) ? PAGE_SIZE : nbytes & ~(DES_BLOCK_SIZE - 1); + for (i = DES_BLOCK_SIZE; i < n; i += DES_BLOCK_SIZE) { + memcpy(ctrptr + i, ctrptr + i - DES_BLOCK_SIZE, DES_BLOCK_SIZE); + crypto_inc(ctrptr + i, DES_BLOCK_SIZE); + } + return n; +} + +static int ctr_desall_crypt(struct blkcipher_desc *desc, long func, + struct s390_des_ctx *ctx, + struct blkcipher_walk *walk) +{ + int ret = blkcipher_walk_virt_block(desc, walk, DES_BLOCK_SIZE); + unsigned int n, nbytes; + u8 buf[DES_BLOCK_SIZE], ctrbuf[DES_BLOCK_SIZE]; + u8 *out, *in, *ctrptr = ctrbuf; + + if (!walk->nbytes) + return ret; + + if (spin_trylock(&ctrblk_lock)) + ctrptr = ctrblk; + + memcpy(ctrptr, walk->iv, DES_BLOCK_SIZE); + while ((nbytes = walk->nbytes) >= DES_BLOCK_SIZE) { + out = walk->dst.virt.addr; + in = walk->src.virt.addr; + while (nbytes >= DES_BLOCK_SIZE) { + if (ctrptr == ctrblk) + n = __ctrblk_init(ctrptr, nbytes); + else + n = DES_BLOCK_SIZE; + ret = crypt_s390_kmctr(func, ctx->key, out, in, + n, ctrptr); + if (ret < 0 || ret != n) { + if (ctrptr == ctrblk) + spin_unlock(&ctrblk_lock); + return -EIO; + } + if (n > DES_BLOCK_SIZE) + memcpy(ctrptr, ctrptr + n - DES_BLOCK_SIZE, + DES_BLOCK_SIZE); + crypto_inc(ctrptr, DES_BLOCK_SIZE); + out += n; + in += n; + nbytes -= n; + } + ret = blkcipher_walk_done(desc, walk, nbytes); } - for (i = 0; i < 3; i++, temp_key += DES_KEY_SIZE) { - ret = crypto_des_check_key(temp_key, DES_KEY_SIZE, flags); - if (ret < 0) - return ret; + if (ctrptr == ctrblk) { + if (nbytes) + memcpy(ctrbuf, ctrptr, DES_BLOCK_SIZE); + else + memcpy(walk->iv, ctrptr, DES_BLOCK_SIZE); + spin_unlock(&ctrblk_lock); + } else { + if (!nbytes) + memcpy(walk->iv, ctrptr, DES_BLOCK_SIZE); + } + /* final block may be < DES_BLOCK_SIZE, copy only nbytes */ + if (nbytes) { + out = walk->dst.virt.addr; + in = walk->src.virt.addr; + ret = crypt_s390_kmctr(func, ctx->key, buf, in, + DES_BLOCK_SIZE, ctrbuf); + if (ret < 0 || ret != DES_BLOCK_SIZE) + return -EIO; + memcpy(out, buf, nbytes); + crypto_inc(ctrbuf, DES_BLOCK_SIZE); + ret = blkcipher_walk_done(desc, walk, 0); + memcpy(walk->iv, ctrbuf, DES_BLOCK_SIZE); } - memcpy(dctx->key, key, keylen); - return 0; -} - -static void des3_192_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - struct crypt_s390_des3_192_ctx *dctx = crypto_tfm_ctx(tfm); - - crypt_s390_km(KM_TDEA_192_ENCRYPT, dctx->key, dst, (void*)src, - DES3_192_BLOCK_SIZE); -} - -static void des3_192_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) -{ - struct crypt_s390_des3_192_ctx *dctx = crypto_tfm_ctx(tfm); - - crypt_s390_km(KM_TDEA_192_DECRYPT, dctx->key, dst, (void*)src, - DES3_192_BLOCK_SIZE); + return ret; } -static struct crypto_alg des3_192_alg = { - .cra_name = "des3_ede", - .cra_driver_name = "des3_ede-s390", - .cra_priority = CRYPT_S390_PRIORITY, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = DES3_192_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des3_192_ctx), - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(des3_192_alg.cra_list), - .cra_u = { - .cipher = { - .cia_min_keysize = DES3_192_KEY_SIZE, - .cia_max_keysize = DES3_192_KEY_SIZE, - .cia_setkey = des3_192_setkey, - .cia_encrypt = des3_192_encrypt, - .cia_decrypt = des3_192_decrypt, - } - } -}; - -static int ecb_des3_192_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ctr_des_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) { - struct crypt_s390_des3_192_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, KM_TDEA_192_ENCRYPT, sctx->key, &walk); + return ctr_desall_crypt(desc, KMCTR_DEA_ENCRYPT, ctx, &walk); } -static int ecb_des3_192_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ctr_des_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) { - struct crypt_s390_des3_192_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return ecb_desall_crypt(desc, KM_TDEA_192_DECRYPT, sctx->key, &walk); + return ctr_desall_crypt(desc, KMCTR_DEA_DECRYPT, ctx, &walk); } -static struct crypto_alg ecb_des3_192_alg = { - .cra_name = "ecb(des3_ede)", - .cra_driver_name = "ecb-des3_ede-s390", +static struct crypto_alg ctr_des_alg = { + .cra_name = "ctr(des)", + .cra_driver_name = "ctr-des-s390", .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES3_192_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des3_192_ctx), + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct s390_des_ctx), .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT( - ecb_des3_192_alg.cra_list), + .cra_list = LIST_HEAD_INIT(ctr_des_alg.cra_list), .cra_u = { .blkcipher = { - .min_keysize = DES3_192_KEY_SIZE, - .max_keysize = DES3_192_KEY_SIZE, - .setkey = des3_192_setkey, - .encrypt = ecb_des3_192_encrypt, - .decrypt = ecb_des3_192_decrypt, + .min_keysize = DES_KEY_SIZE, + .max_keysize = DES_KEY_SIZE, + .ivsize = DES_BLOCK_SIZE, + .setkey = des_setkey, + .encrypt = ctr_des_encrypt, + .decrypt = ctr_des_decrypt, } } }; -static int cbc_des3_192_encrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ctr_des3_encrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) { - struct crypt_s390_des3_192_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, KMC_TDEA_192_ENCRYPT, sctx->iv, &walk); + return ctr_desall_crypt(desc, KMCTR_TDEA_192_ENCRYPT, ctx, &walk); } -static int cbc_des3_192_decrypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int ctr_des3_decrypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) { - struct crypt_s390_des3_192_ctx *sctx = crypto_blkcipher_ctx(desc->tfm); + struct s390_des_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); struct blkcipher_walk walk; blkcipher_walk_init(&walk, dst, src, nbytes); - return cbc_desall_crypt(desc, KMC_TDEA_192_DECRYPT, sctx->iv, &walk); + return ctr_desall_crypt(desc, KMCTR_TDEA_192_DECRYPT, ctx, &walk); } -static struct crypto_alg cbc_des3_192_alg = { - .cra_name = "cbc(des3_ede)", - .cra_driver_name = "cbc-des3_ede-s390", +static struct crypto_alg ctr_des3_alg = { + .cra_name = "ctr(des3_ede)", + .cra_driver_name = "ctr-des3_ede-s390", .cra_priority = CRYPT_S390_COMPOSITE_PRIORITY, .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = DES3_192_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypt_s390_des3_192_ctx), + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct s390_des_ctx), .cra_type = &crypto_blkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT( - cbc_des3_192_alg.cra_list), + .cra_list = LIST_HEAD_INIT(ctr_des3_alg.cra_list), .cra_u = { .blkcipher = { - .min_keysize = DES3_192_KEY_SIZE, - .max_keysize = DES3_192_KEY_SIZE, - .ivsize = DES3_192_BLOCK_SIZE, - .setkey = des3_192_setkey, - .encrypt = cbc_des3_192_encrypt, - .decrypt = cbc_des3_192_decrypt, + .min_keysize = DES3_KEY_SIZE, + .max_keysize = DES3_KEY_SIZE, + .ivsize = DES_BLOCK_SIZE, + .setkey = des3_setkey, + .encrypt = ctr_des3_encrypt, + .decrypt = ctr_des3_decrypt, } } }; -static int des_s390_init(void) +static int __init des_s390_init(void) { - int ret = 0; + int ret; - if (!crypt_s390_func_available(KM_DEA_ENCRYPT) || - !crypt_s390_func_available(KM_TDEA_128_ENCRYPT) || - !crypt_s390_func_available(KM_TDEA_192_ENCRYPT)) + if (!crypt_s390_func_available(KM_DEA_ENCRYPT, CRYPT_S390_MSA) || + !crypt_s390_func_available(KM_TDEA_192_ENCRYPT, CRYPT_S390_MSA)) return -EOPNOTSUPP; ret = crypto_register_alg(&des_alg); @@ -569,41 +562,46 @@ static int des_s390_init(void) ret = crypto_register_alg(&cbc_des_alg); if (ret) goto cbc_des_err; - - ret = crypto_register_alg(&des3_128_alg); + ret = crypto_register_alg(&des3_alg); if (ret) - goto des3_128_err; - ret = crypto_register_alg(&ecb_des3_128_alg); + goto des3_err; + ret = crypto_register_alg(&ecb_des3_alg); if (ret) - goto ecb_des3_128_err; - ret = crypto_register_alg(&cbc_des3_128_alg); + goto ecb_des3_err; + ret = crypto_register_alg(&cbc_des3_alg); if (ret) - goto cbc_des3_128_err; - - ret = crypto_register_alg(&des3_192_alg); - if (ret) - goto des3_192_err; - ret = crypto_register_alg(&ecb_des3_192_alg); - if (ret) - goto ecb_des3_192_err; - ret = crypto_register_alg(&cbc_des3_192_alg); - if (ret) - goto cbc_des3_192_err; + goto cbc_des3_err; + if (crypt_s390_func_available(KMCTR_DEA_ENCRYPT, + CRYPT_S390_MSA | CRYPT_S390_MSA4) && + crypt_s390_func_available(KMCTR_TDEA_192_ENCRYPT, + CRYPT_S390_MSA | CRYPT_S390_MSA4)) { + ret = crypto_register_alg(&ctr_des_alg); + if (ret) + goto ctr_des_err; + ret = crypto_register_alg(&ctr_des3_alg); + if (ret) + goto ctr_des3_err; + ctrblk = (u8 *) __get_free_page(GFP_KERNEL); + if (!ctrblk) { + ret = -ENOMEM; + goto ctr_mem_err; + } + } out: return ret; -cbc_des3_192_err: - crypto_unregister_alg(&ecb_des3_192_alg); -ecb_des3_192_err: - crypto_unregister_alg(&des3_192_alg); -des3_192_err: - crypto_unregister_alg(&cbc_des3_128_alg); -cbc_des3_128_err: - crypto_unregister_alg(&ecb_des3_128_alg); -ecb_des3_128_err: - crypto_unregister_alg(&des3_128_alg); -des3_128_err: +ctr_mem_err: + crypto_unregister_alg(&ctr_des3_alg); +ctr_des3_err: + crypto_unregister_alg(&ctr_des_alg); +ctr_des_err: + crypto_unregister_alg(&cbc_des3_alg); +cbc_des3_err: + crypto_unregister_alg(&ecb_des3_alg); +ecb_des3_err: + crypto_unregister_alg(&des3_alg); +des3_err: crypto_unregister_alg(&cbc_des_alg); cbc_des_err: crypto_unregister_alg(&ecb_des_alg); @@ -613,21 +611,23 @@ des_err: goto out; } -static void __exit des_s390_fini(void) +static void __exit des_s390_exit(void) { - crypto_unregister_alg(&cbc_des3_192_alg); - crypto_unregister_alg(&ecb_des3_192_alg); - crypto_unregister_alg(&des3_192_alg); - crypto_unregister_alg(&cbc_des3_128_alg); - crypto_unregister_alg(&ecb_des3_128_alg); - crypto_unregister_alg(&des3_128_alg); + if (ctrblk) { + crypto_unregister_alg(&ctr_des_alg); + crypto_unregister_alg(&ctr_des3_alg); + free_page((unsigned long) ctrblk); + } + crypto_unregister_alg(&cbc_des3_alg); + crypto_unregister_alg(&ecb_des3_alg); + crypto_unregister_alg(&des3_alg); crypto_unregister_alg(&cbc_des_alg); crypto_unregister_alg(&ecb_des_alg); crypto_unregister_alg(&des_alg); } module_init(des_s390_init); -module_exit(des_s390_fini); +module_exit(des_s390_exit); MODULE_ALIAS("des"); MODULE_ALIAS("des3_ede"); diff -uprN linux-2.6.32/arch/s390/crypto/ghash_s390.c linux-2.6.32.ovz/arch/s390/crypto/ghash_s390.c --- linux-2.6.32/arch/s390/crypto/ghash_s390.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/ghash_s390.c 2015-03-10 13:24:13.000000000 -0700 @@ -0,0 +1,167 @@ +/* + * Cryptographic API. + * + * s390 implementation of the GHASH algorithm for GCM (Galois/Counter Mode). + * + * Copyright IBM Corp. 2011 + * Author(s): Gerald Schaefer + */ + +#include +#include + +#include "crypt_s390.h" + +#define GHASH_BLOCK_SIZE 16 +#define GHASH_DIGEST_SIZE 16 + +struct ghash_ctx { + u8 icv[16]; + u8 key[16]; +}; + +struct ghash_desc_ctx { + u8 buffer[GHASH_BLOCK_SIZE]; + u32 bytes; +}; + +static int ghash_init(struct shash_desc *desc) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + + memset(dctx, 0, sizeof(*dctx)); + + return 0; +} + +static int ghash_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct ghash_ctx *ctx = crypto_shash_ctx(tfm); + + if (keylen != GHASH_BLOCK_SIZE) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + memcpy(ctx->key, key, GHASH_BLOCK_SIZE); + memset(ctx->icv, 0, GHASH_BLOCK_SIZE); + + return 0; +} + +static int ghash_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); + unsigned int n; + u8 *buf = dctx->buffer; + int ret; + + if (dctx->bytes) { + u8 *pos = buf + (GHASH_BLOCK_SIZE - dctx->bytes); + + n = min(srclen, dctx->bytes); + dctx->bytes -= n; + srclen -= n; + + memcpy(pos, src, n); + src += n; + + if (!dctx->bytes) { + ret = crypt_s390_kimd(KIMD_GHASH, ctx, buf, + GHASH_BLOCK_SIZE); + if (ret != GHASH_BLOCK_SIZE) + return -EIO; + } + } + + n = srclen & ~(GHASH_BLOCK_SIZE - 1); + if (n) { + ret = crypt_s390_kimd(KIMD_GHASH, ctx, src, n); + if (ret != n) + return -EIO; + src += n; + srclen -= n; + } + + if (srclen) { + dctx->bytes = GHASH_BLOCK_SIZE - srclen; + memcpy(buf, src, srclen); + } + + return 0; +} + +static int ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx) +{ + u8 *buf = dctx->buffer; + int ret; + + if (dctx->bytes) { + u8 *pos = buf + (GHASH_BLOCK_SIZE - dctx->bytes); + + memset(pos, 0, dctx->bytes); + + ret = crypt_s390_kimd(KIMD_GHASH, ctx, buf, GHASH_BLOCK_SIZE); + if (ret != GHASH_BLOCK_SIZE) + return -EIO; + } + + dctx->bytes = 0; + return 0; +} + +static int ghash_final(struct shash_desc *desc, u8 *dst) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); + int ret; + + ret = ghash_flush(ctx, dctx); + if (!ret) + memcpy(dst, ctx->icv, GHASH_BLOCK_SIZE); + return ret; +} + +static struct shash_alg ghash_alg = { + .digestsize = GHASH_DIGEST_SIZE, + .init = ghash_init, + .update = ghash_update, + .final = ghash_final, + .setkey = ghash_setkey, + .descsize = sizeof(struct ghash_desc_ctx), + .base = { + .cra_name = "ghash", + .cra_driver_name = "ghash-s390", + .cra_priority = CRYPT_S390_PRIORITY, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = GHASH_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct ghash_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ghash_alg.base.cra_list), + }, +}; + +static int __init ghash_mod_init(void) +{ + if (!crypt_s390_func_available(KIMD_GHASH, + CRYPT_S390_MSA | CRYPT_S390_MSA4)) + return -EOPNOTSUPP; + + return crypto_register_shash(&ghash_alg); +} + +static void __exit ghash_mod_exit(void) +{ + crypto_unregister_shash(&ghash_alg); +} + +module_init(ghash_mod_init); +module_exit(ghash_mod_exit); + +MODULE_ALIAS("ghash"); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("GHASH Message Digest Algorithm, s390 implementation"); diff -uprN linux-2.6.32/arch/s390/crypto/Makefile linux-2.6.32.ovz/arch/s390/crypto/Makefile --- linux-2.6.32/arch/s390/crypto/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/Makefile 2015-03-10 13:24:13.000000000 -0700 @@ -5,6 +5,7 @@ obj-$(CONFIG_CRYPTO_SHA1_S390) += sha1_s390.o sha_common.o obj-$(CONFIG_CRYPTO_SHA256_S390) += sha256_s390.o sha_common.o obj-$(CONFIG_CRYPTO_SHA512_S390) += sha512_s390.o sha_common.o -obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o des_check_key.o +obj-$(CONFIG_CRYPTO_DES_S390) += des_s390.o obj-$(CONFIG_CRYPTO_AES_S390) += aes_s390.o obj-$(CONFIG_S390_PRNG) += prng.o +obj-$(CONFIG_CRYPTO_GHASH_S390) += ghash_s390.o diff -uprN linux-2.6.32/arch/s390/crypto/prng.c linux-2.6.32.ovz/arch/s390/crypto/prng.c --- linux-2.6.32/arch/s390/crypto/prng.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/prng.c 2015-03-10 13:22:27.000000000 -0700 @@ -77,7 +77,7 @@ static void prng_seed(int nbytes) /* Add the entropy */ while (nbytes >= 8) { - *((__u64 *)parm_block) ^= *((__u64 *)buf+i*8); + *((__u64 *)parm_block) ^= *((__u64 *)(buf+i)); prng_add_entropy(); i += 8; nbytes -= 8; @@ -166,7 +166,7 @@ static int __init prng_init(void) int ret; /* check if the CPU has a PRNG */ - if (!crypt_s390_func_available(KMC_PRNG)) + if (!crypt_s390_func_available(KMC_PRNG, CRYPT_S390_MSA)) return -EOPNOTSUPP; if (prng_chunk_size < 8) diff -uprN linux-2.6.32/arch/s390/crypto/sha1_s390.c linux-2.6.32.ovz/arch/s390/crypto/sha1_s390.c --- linux-2.6.32/arch/s390/crypto/sha1_s390.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/sha1_s390.c 2015-03-10 13:22:27.000000000 -0700 @@ -90,7 +90,7 @@ static struct shash_alg alg = { static int __init sha1_s390_init(void) { - if (!crypt_s390_func_available(KIMD_SHA_1)) + if (!crypt_s390_func_available(KIMD_SHA_1, CRYPT_S390_MSA)) return -EOPNOTSUPP; return crypto_register_shash(&alg); } diff -uprN linux-2.6.32/arch/s390/crypto/sha256_s390.c linux-2.6.32.ovz/arch/s390/crypto/sha256_s390.c --- linux-2.6.32/arch/s390/crypto/sha256_s390.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/sha256_s390.c 2015-03-10 13:22:27.000000000 -0700 @@ -86,7 +86,7 @@ static struct shash_alg alg = { static int sha256_s390_init(void) { - if (!crypt_s390_func_available(KIMD_SHA_256)) + if (!crypt_s390_func_available(KIMD_SHA_256, CRYPT_S390_MSA)) return -EOPNOTSUPP; return crypto_register_shash(&alg); diff -uprN linux-2.6.32/arch/s390/crypto/sha512_s390.c linux-2.6.32.ovz/arch/s390/crypto/sha512_s390.c --- linux-2.6.32/arch/s390/crypto/sha512_s390.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/sha512_s390.c 2015-03-10 13:22:27.000000000 -0700 @@ -132,7 +132,7 @@ static int __init init(void) { int ret; - if (!crypt_s390_func_available(KIMD_SHA_512)) + if (!crypt_s390_func_available(KIMD_SHA_512, CRYPT_S390_MSA)) return -EOPNOTSUPP; if ((ret = crypto_register_shash(&sha512_alg)) < 0) goto out; diff -uprN linux-2.6.32/arch/s390/crypto/sha_common.c linux-2.6.32.ovz/arch/s390/crypto/sha_common.c --- linux-2.6.32/arch/s390/crypto/sha_common.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/crypto/sha_common.c 2015-03-10 13:24:13.000000000 -0700 @@ -35,16 +35,19 @@ int s390_sha_update(struct shash_desc *d if (index) { memcpy(ctx->buf + index, data, bsize - index); ret = crypt_s390_kimd(ctx->func, ctx->state, ctx->buf, bsize); - BUG_ON(ret != bsize); + if (ret != bsize) + return -EIO; data += bsize - index; len -= bsize - index; + index = 0; } /* process as many blocks as possible */ if (len >= bsize) { ret = crypt_s390_kimd(ctx->func, ctx->state, data, len & ~(bsize - 1)); - BUG_ON(ret != (len & ~(bsize - 1))); + if (ret != (len & ~(bsize - 1))) + return -EIO; data += ret; len -= ret; } @@ -86,7 +89,8 @@ int s390_sha_final(struct shash_desc *de memcpy(ctx->buf + end - 8, &bits, sizeof(bits)); ret = crypt_s390_kimd(ctx->func, ctx->state, ctx->buf, end); - BUG_ON(ret != end); + if (ret != end) + return -EIO; /* copy digest to out */ memcpy(out, ctx->state, crypto_shash_digestsize(desc->tfm)); diff -uprN linux-2.6.32/arch/s390/hypfs/hypfs_dbfs.c linux-2.6.32.ovz/arch/s390/hypfs/hypfs_dbfs.c --- linux-2.6.32/arch/s390/hypfs/hypfs_dbfs.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/hypfs/hypfs_dbfs.c 2015-03-10 13:22:07.000000000 -0700 @@ -0,0 +1,115 @@ +/* + * Hypervisor filesystem for Linux on s390 - debugfs interface + * + * Copyright (C) IBM Corp. 2010 + * Author(s): Michael Holzheu + */ + +#include +#include "hypfs.h" + +static struct dentry *dbfs_dir; + +static struct hypfs_dbfs_data *hypfs_dbfs_data_alloc(struct hypfs_dbfs_file *f) +{ + struct hypfs_dbfs_data *data; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return NULL; + kref_init(&data->kref); + data->dbfs_file = f; + return data; +} + +static void hypfs_dbfs_data_free(struct kref *kref) +{ + struct hypfs_dbfs_data *data; + + data = container_of(kref, struct hypfs_dbfs_data, kref); + data->dbfs_file->data_free(data->buf_free_ptr); + kfree(data); +} + +static void data_free_delayed(struct work_struct *work) +{ + struct hypfs_dbfs_data *data; + struct hypfs_dbfs_file *df; + + df = container_of(work, struct hypfs_dbfs_file, data_free_work.work); + mutex_lock(&df->lock); + data = df->data; + df->data = NULL; + mutex_unlock(&df->lock); + kref_put(&data->kref, hypfs_dbfs_data_free); +} + +static ssize_t dbfs_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct hypfs_dbfs_data *data; + struct hypfs_dbfs_file *df; + ssize_t rc; + + if (*ppos != 0) + return 0; + + df = file->f_path.dentry->d_inode->i_private; + mutex_lock(&df->lock); + if (!df->data) { + data = hypfs_dbfs_data_alloc(df); + if (!data) { + mutex_unlock(&df->lock); + return -ENOMEM; + } + rc = df->data_create(&data->buf, &data->buf_free_ptr, + &data->size); + if (rc) { + mutex_unlock(&df->lock); + kfree(data); + return rc; + } + df->data = data; + schedule_delayed_work(&df->data_free_work, HZ); + } + data = df->data; + kref_get(&data->kref); + mutex_unlock(&df->lock); + + rc = simple_read_from_buffer(buf, size, ppos, data->buf, data->size); + kref_put(&data->kref, hypfs_dbfs_data_free); + return rc; +} + +static const struct file_operations dbfs_ops = { + .read = dbfs_read, +}; + +int hypfs_dbfs_create_file(struct hypfs_dbfs_file *df) +{ + df->dentry = debugfs_create_file(df->name, 0400, dbfs_dir, df, + &dbfs_ops); + if (IS_ERR(df->dentry)) + return PTR_ERR(df->dentry); + mutex_init(&df->lock); + INIT_DELAYED_WORK(&df->data_free_work, data_free_delayed); + return 0; +} + +void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df) +{ + debugfs_remove(df->dentry); +} + +int hypfs_dbfs_init(void) +{ + dbfs_dir = debugfs_create_dir("s390_hypfs", NULL); + if (IS_ERR(dbfs_dir)) + return PTR_ERR(dbfs_dir); + return 0; +} + +void hypfs_dbfs_exit(void) +{ + debugfs_remove(dbfs_dir); +} diff -uprN linux-2.6.32/arch/s390/hypfs/hypfs_diag.c linux-2.6.32.ovz/arch/s390/hypfs/hypfs_diag.c --- linux-2.6.32/arch/s390/hypfs/hypfs_diag.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/hypfs/hypfs_diag.c 2015-03-10 13:22:07.000000000 -0700 @@ -12,10 +12,10 @@ #include #include -#include #include #include #include +#include #include #include "hypfs.h" @@ -23,6 +23,8 @@ #define CPU_NAME_LEN 16 /* type name len of cpus in diag224 name table */ #define TMP_SIZE 64 /* size of temporary buffers */ +#define DBFS_D204_HDR_VERSION 0 + /* diag 204 subcodes */ enum diag204_sc { SUBC_STIB4 = 4, @@ -48,6 +50,8 @@ static void *diag204_buf; /* 4K aligned static void *diag204_buf_vmalloc; /* vmalloc pointer for diag204 data */ static int diag204_buf_pages; /* number of pages for diag204 data */ +static struct dentry *dbfs_d204_file; + /* * DIAG 204 data structures and member access functions. * @@ -164,7 +168,7 @@ static inline void part_hdr__part_name(e LPAR_NAME_LEN); EBCASC(name, LPAR_NAME_LEN); name[LPAR_NAME_LEN] = 0; - strstrip(name); + strim(name); } struct cpu_info { @@ -365,18 +369,21 @@ static void diag204_free_buffer(void) } else { free_pages((unsigned long) diag204_buf, 0); } - diag204_buf_pages = 0; diag204_buf = NULL; } +static void *page_align_ptr(void *ptr) +{ + return (void *) PAGE_ALIGN((unsigned long) ptr); +} + static void *diag204_alloc_vbuf(int pages) { /* The buffer has to be page aligned! */ diag204_buf_vmalloc = vmalloc(PAGE_SIZE * (pages + 1)); if (!diag204_buf_vmalloc) return ERR_PTR(-ENOMEM); - diag204_buf = (void*)((unsigned long)diag204_buf_vmalloc - & ~0xfffUL) + 0x1000; + diag204_buf = page_align_ptr(diag204_buf_vmalloc); diag204_buf_pages = pages; return diag204_buf; } @@ -469,17 +476,26 @@ fail_alloc: return rc; } +static int diag204_do_store(void *buf, int pages) +{ + int rc; + + rc = diag204((unsigned long) diag204_store_sc | + (unsigned long) diag204_info_type, pages, buf); + return rc < 0 ? -ENOSYS : 0; +} + static void *diag204_store(void) { void *buf; - int pages; + int pages, rc; buf = diag204_get_buffer(diag204_info_type, &pages); if (IS_ERR(buf)) goto out; - if (diag204((unsigned long)diag204_store_sc | - (unsigned long)diag204_info_type, pages, buf) < 0) - return ERR_PTR(-ENOSYS); + rc = diag204_do_store(buf, pages); + if (rc) + return ERR_PTR(rc); out: return buf; } @@ -488,7 +504,7 @@ out: static int diag224(void *ptr) { - int rc = -ENOTSUPP; + int rc = -EOPNOTSUPP; asm volatile( " diag %1,%2,0x224\n" @@ -507,7 +523,7 @@ static int diag224_get_name_table(void) return -ENOMEM; if (diag224(diag224_cpu_names)) { kfree(diag224_cpu_names); - return -ENOTSUPP; + return -EOPNOTSUPP; } EBCASC(diag224_cpu_names + 16, (*diag224_cpu_names + 1) * 16); return 0; @@ -523,10 +539,54 @@ static int diag224_idx2name(int index, c memcpy(name, diag224_cpu_names + ((index + 1) * CPU_NAME_LEN), CPU_NAME_LEN); name[CPU_NAME_LEN] = 0; - strstrip(name); + strim(name); + return 0; +} + +struct dbfs_d204_hdr { + u64 len; /* Length of d204 buffer without header */ + u16 version; /* Version of header */ + u8 sc; /* Used subcode */ + char reserved[53]; +} __attribute__ ((packed)); + +struct dbfs_d204 { + struct dbfs_d204_hdr hdr; /* 64 byte header */ + char buf[]; /* d204 buffer */ +} __attribute__ ((packed)); + +static int dbfs_d204_create(void **data, void **data_free_ptr, size_t *size) +{ + struct dbfs_d204 *d204; + int rc, buf_size; + void *base; + + buf_size = PAGE_SIZE * (diag204_buf_pages + 1) + sizeof(d204->hdr); + base = vmalloc(buf_size); + if (!base) + return -ENOMEM; + memset(base, 0, buf_size); + d204 = page_align_ptr(base + sizeof(d204->hdr)) - sizeof(d204->hdr); + rc = diag204_do_store(d204->buf, diag204_buf_pages); + if (rc) { + vfree(base); + return rc; + } + d204->hdr.version = DBFS_D204_HDR_VERSION; + d204->hdr.len = PAGE_SIZE * diag204_buf_pages; + d204->hdr.sc = diag204_store_sc; + *data = d204; + *data_free_ptr = base; + *size = d204->hdr.len + sizeof(struct dbfs_d204_hdr); return 0; } +static struct hypfs_dbfs_file dbfs_file_d204 = { + .name = "diag_204", + .data_create = dbfs_d204_create, + .data_free = vfree, +}; + __init int hypfs_diag_init(void) { int rc; @@ -535,19 +595,29 @@ __init int hypfs_diag_init(void) pr_err("The hardware system does not support hypfs\n"); return -ENODATA; } - rc = diag224_get_name_table(); - if (rc) { - diag204_free_buffer(); - pr_err("The hardware system does not provide all " - "functions required by hypfs\n"); + if (diag204_info_type == INFO_EXT) { + rc = hypfs_dbfs_create_file(&dbfs_file_d204); + if (rc) + return rc; } - return rc; + if (!MACHINE_IS_VM) { + rc = diag224_get_name_table(); + if (rc) { + pr_err("The hardware system does not provide all " + "functions required by hypfs\n"); + debugfs_remove(dbfs_d204_file); + return rc; + } + } + return 0; } void hypfs_diag_exit(void) { + debugfs_remove(dbfs_d204_file); diag224_delete_name_table(); diag204_free_buffer(); + hypfs_dbfs_remove_file(&dbfs_file_d204); } /* diff -uprN linux-2.6.32/arch/s390/hypfs/hypfs.h linux-2.6.32.ovz/arch/s390/hypfs/hypfs.h --- linux-2.6.32/arch/s390/hypfs/hypfs.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/hypfs/hypfs.h 2015-03-10 13:22:07.000000000 -0700 @@ -11,6 +11,9 @@ #include #include +#include +#include +#include #define REG_FILE_MODE 0440 #define UPDATE_FILE_MODE 0220 @@ -34,6 +37,36 @@ extern int hypfs_diag_create_files(struc /* VM Hypervisor */ extern int hypfs_vm_init(void); +extern void hypfs_vm_exit(void); extern int hypfs_vm_create_files(struct super_block *sb, struct dentry *root); +/* debugfs interface */ +struct hypfs_dbfs_file; + +struct hypfs_dbfs_data { + void *buf; + void *buf_free_ptr; + size_t size; + struct hypfs_dbfs_file *dbfs_file;; + struct kref kref; +}; + +struct hypfs_dbfs_file { + const char *name; + int (*data_create)(void **data, void **data_free_ptr, + size_t *size); + void (*data_free)(const void *buf_free_ptr); + + /* Private data for hypfs_dbfs.c */ + struct hypfs_dbfs_data *data; + struct delayed_work data_free_work; + struct mutex lock; + struct dentry *dentry; +}; + +extern int hypfs_dbfs_init(void); +extern void hypfs_dbfs_exit(void); +extern int hypfs_dbfs_create_file(struct hypfs_dbfs_file *df); +extern void hypfs_dbfs_remove_file(struct hypfs_dbfs_file *df); + #endif /* _HYPFS_H_ */ diff -uprN linux-2.6.32/arch/s390/hypfs/hypfs_vm.c linux-2.6.32.ovz/arch/s390/hypfs/hypfs_vm.c --- linux-2.6.32/arch/s390/hypfs/hypfs_vm.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/hypfs/hypfs_vm.c 2015-03-10 13:22:07.000000000 -0700 @@ -10,9 +10,11 @@ #include #include #include +#include #include "hypfs.h" #define NAME_LEN 8 +#define DBFS_D2FC_HDR_VERSION 0 static char local_guest[] = " "; static char all_guests[] = "* "; @@ -76,28 +78,31 @@ static int diag2fc(int size, char* query return -residual_cnt; } -static struct diag2fc_data *diag2fc_store(char *query, int *count) +/* + * Allocate buffer for "query" and store diag 2fc at "offset" + */ +static void *diag2fc_store(char *query, unsigned int *count, int offset) { + void *data; int size; - struct diag2fc_data *data; do { size = diag2fc(0, query, NULL); if (size < 0) return ERR_PTR(-EACCES); - data = vmalloc(size); + data = vmalloc(size + offset); if (!data) return ERR_PTR(-ENOMEM); - if (diag2fc(size, query, data) == 0) + if (diag2fc(size, query, data + offset) == 0) break; vfree(data); } while (1); - *count = (size / sizeof(*data)); + *count = (size / sizeof(struct diag2fc_data)); return data; } -static void diag2fc_free(void *data) +static void diag2fc_free(const void *data) { vfree(data); } @@ -124,7 +129,7 @@ static int hpyfs_vm_create_guest(struct /* guest dir */ memcpy(guest_name, data->guest_name, NAME_LEN); EBCASC(guest_name, NAME_LEN); - strstrip(guest_name); + strim(guest_name); guest_dir = hypfs_mkdir(sb, systems_dir, guest_name); if (IS_ERR(guest_dir)) return PTR_ERR(guest_dir); @@ -168,9 +173,10 @@ int hypfs_vm_create_files(struct super_b { struct dentry *dir, *file; struct diag2fc_data *data; - int rc, i, count = 0; + unsigned int count = 0; + int rc, i; - data = diag2fc_store(guest_query, &count); + data = diag2fc_store(guest_query, &count, 0); if (IS_ERR(data)) return PTR_ERR(data); @@ -218,14 +224,60 @@ failed: return rc; } +struct dbfs_d2fc_hdr { + u64 len; /* Length of d2fc buffer without header */ + u16 version; /* Version of header */ + char tod_ext[16]; /* TOD clock for d2fc */ + u64 count; /* Number of VM guests in d2fc buffer */ + char reserved[30]; +} __attribute__ ((packed)); + +struct dbfs_d2fc { + struct dbfs_d2fc_hdr hdr; /* 64 byte header */ + char buf[]; /* d2fc buffer */ +} __attribute__ ((packed)); + +static int dbfs_diag2fc_create(void **data, void **data_free_ptr, size_t *size) +{ + struct dbfs_d2fc *d2fc; + unsigned int count; + + d2fc = diag2fc_store(guest_query, &count, sizeof(d2fc->hdr)); + if (IS_ERR(d2fc)) + return PTR_ERR(d2fc); + get_clock_ext(d2fc->hdr.tod_ext); + d2fc->hdr.len = count * sizeof(struct diag2fc_data); + d2fc->hdr.version = DBFS_D2FC_HDR_VERSION; + d2fc->hdr.count = count; + memset(&d2fc->hdr.reserved, 0, sizeof(d2fc->hdr.reserved)); + *data = d2fc; + *data_free_ptr = d2fc; + *size = d2fc->hdr.len + sizeof(struct dbfs_d2fc_hdr); + return 0; +} + +static struct hypfs_dbfs_file dbfs_file_2fc = { + .name = "diag_2fc", + .data_create = dbfs_diag2fc_create, + .data_free = diag2fc_free, +}; + int hypfs_vm_init(void) { + if (!MACHINE_IS_VM) + return 0; if (diag2fc(0, all_guests, NULL) > 0) guest_query = all_guests; else if (diag2fc(0, local_guest, NULL) > 0) guest_query = local_guest; else return -EACCES; + return hypfs_dbfs_create_file(&dbfs_file_2fc); +} - return 0; +void hypfs_vm_exit(void) +{ + if (!MACHINE_IS_VM) + return; + hypfs_dbfs_remove_file(&dbfs_file_2fc); } diff -uprN linux-2.6.32/arch/s390/hypfs/inode.c linux-2.6.32.ovz/arch/s390/hypfs/inode.c --- linux-2.6.32/arch/s390/hypfs/inode.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/hypfs/inode.c 2015-03-10 13:22:07.000000000 -0700 @@ -14,8 +14,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -145,7 +145,7 @@ static int hypfs_open(struct inode *inod } mutex_unlock(&fs_info->lock); } - return 0; + return nonseekable_open(inode, filp); } static ssize_t hypfs_aio_read(struct kiocb *iocb, const struct iovec *iov, @@ -288,46 +288,30 @@ static int hypfs_fill_super(struct super sb->s_blocksize_bits = PAGE_CACHE_SHIFT; sb->s_magic = HYPFS_MAGIC; sb->s_op = &hypfs_s_ops; - if (hypfs_parse_options(data, sb)) { - rc = -EINVAL; - goto err_alloc; - } + if (hypfs_parse_options(data, sb)) + return -EINVAL; root_inode = hypfs_make_inode(sb, S_IFDIR | 0755); - if (!root_inode) { - rc = -ENOMEM; - goto err_alloc; - } + if (!root_inode) + return -ENOMEM; root_inode->i_op = &simple_dir_inode_operations; root_inode->i_fop = &simple_dir_operations; - root_dentry = d_alloc_root(root_inode); + sb->s_root = root_dentry = d_alloc_root(root_inode); if (!root_dentry) { iput(root_inode); - rc = -ENOMEM; - goto err_alloc; + return -ENOMEM; } if (MACHINE_IS_VM) rc = hypfs_vm_create_files(sb, root_dentry); else rc = hypfs_diag_create_files(sb, root_dentry); if (rc) - goto err_tree; + return rc; sbi->update_file = hypfs_create_update_file(sb, root_dentry); - if (IS_ERR(sbi->update_file)) { - rc = PTR_ERR(sbi->update_file); - goto err_tree; - } + if (IS_ERR(sbi->update_file)) + return PTR_ERR(sbi->update_file); hypfs_update_update(sb); - sb->s_root = root_dentry; pr_info("Hypervisor filesystem mounted\n"); return 0; - -err_tree: - hypfs_delete_tree(root_dentry); - d_genocide(root_dentry); - dput(root_dentry); -err_alloc: - kfree(sbi); - return rc; } static int hypfs_get_super(struct file_system_type *fst, int flags, @@ -340,12 +324,12 @@ static void hypfs_kill_super(struct supe { struct hypfs_sb_info *sb_info = sb->s_fs_info; - if (sb->s_root) { + if (sb->s_root) hypfs_delete_tree(sb->s_root); + if (sb_info->update_file) hypfs_remove(sb_info->update_file); - kfree(sb->s_fs_info); - sb->s_fs_info = NULL; - } + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; kill_litter_super(sb); } @@ -484,20 +468,21 @@ static int __init hypfs_init(void) { int rc; - if (MACHINE_IS_VM) { - if (hypfs_vm_init()) - /* no diag 2fc, just exit */ - return -ENODATA; - } else { - if (hypfs_diag_init()) { - rc = -ENODATA; - goto fail_diag; - } + rc = hypfs_dbfs_init(); + if (rc) + return rc; + if (hypfs_diag_init()) { + rc = -ENODATA; + goto fail_dbfs_exit; + } + if (hypfs_vm_init()) { + rc = -ENODATA; + goto fail_hypfs_diag_exit; } s390_kobj = kobject_create_and_add("s390", hypervisor_kobj); if (!s390_kobj) { rc = -ENOMEM; - goto fail_sysfs; + goto fail_hypfs_vm_exit; } rc = register_filesystem(&hypfs_type); if (rc) @@ -506,18 +491,21 @@ static int __init hypfs_init(void) fail_filesystem: kobject_put(s390_kobj); -fail_sysfs: - if (!MACHINE_IS_VM) - hypfs_diag_exit(); -fail_diag: +fail_hypfs_vm_exit: + hypfs_vm_exit(); +fail_hypfs_diag_exit: + hypfs_diag_exit(); +fail_dbfs_exit: + hypfs_dbfs_exit(); pr_err("Initialization of hypfs failed with rc=%i\n", rc); return rc; } static void __exit hypfs_exit(void) { - if (!MACHINE_IS_VM) - hypfs_diag_exit(); + hypfs_diag_exit(); + hypfs_vm_exit(); + hypfs_dbfs_exit(); unregister_filesystem(&hypfs_type); kobject_put(s390_kobj); } diff -uprN linux-2.6.32/arch/s390/hypfs/Makefile linux-2.6.32.ovz/arch/s390/hypfs/Makefile --- linux-2.6.32/arch/s390/hypfs/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/hypfs/Makefile 2015-03-10 13:22:07.000000000 -0700 @@ -4,4 +4,4 @@ obj-$(CONFIG_S390_HYPFS_FS) += s390_hypfs.o -s390_hypfs-objs := inode.o hypfs_diag.o hypfs_vm.o +s390_hypfs-objs := inode.o hypfs_diag.o hypfs_vm.o hypfs_dbfs.o diff -uprN linux-2.6.32/arch/s390/include/asm/atomic.h linux-2.6.32.ovz/arch/s390/include/asm/atomic.h --- linux-2.6.32/arch/s390/include/asm/atomic.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/atomic.h 2015-03-10 13:21:08.000000000 -0700 @@ -21,7 +21,7 @@ #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 2) #define __CS_LOOP(ptr, op_val, op_string) ({ \ - typeof(ptr->counter) old_val, new_val; \ + int old_val, new_val; \ asm volatile( \ " l %0,%2\n" \ "0: lr %1,%0\n" \ @@ -38,7 +38,7 @@ #else /* __GNUC__ */ #define __CS_LOOP(ptr, op_val, op_string) ({ \ - typeof(ptr->counter) old_val, new_val; \ + int old_val, new_val; \ asm volatile( \ " l %0,0(%3)\n" \ "0: lr %1,%0\n" \ @@ -143,7 +143,7 @@ static inline int atomic_add_unless(atom #if __GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ > 2) #define __CSG_LOOP(ptr, op_val, op_string) ({ \ - typeof(ptr->counter) old_val, new_val; \ + long long old_val, new_val; \ asm volatile( \ " lg %0,%2\n" \ "0: lgr %1,%0\n" \ @@ -160,7 +160,7 @@ static inline int atomic_add_unless(atom #else /* __GNUC__ */ #define __CSG_LOOP(ptr, op_val, op_string) ({ \ - typeof(ptr->counter) old_val, new_val; \ + long long old_val, new_val; \ asm volatile( \ " lg %0,0(%3)\n" \ "0: lgr %1,%0\n" \ diff -uprN linux-2.6.32/arch/s390/include/asm/bitops.h linux-2.6.32.ovz/arch/s390/include/asm/bitops.h --- linux-2.6.32/arch/s390/include/asm/bitops.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/bitops.h 2015-03-10 13:24:00.000000000 -0700 @@ -879,6 +879,33 @@ static inline int ext2_find_next_bit(voi #include +/* Compat macros for RHEL6 */ +#define find_next_zero_bit_le(addr, size, offset) \ + generic_find_next_zero_le_bit(addr, size, offset) +#define find_next_bit_le(addr, size, offset) \ + generic_find_next_le_bit(addr, size, offset) +#define __set_bit_le(nr, addr) \ + __set_bit((nr) ^ (__BITOPS_WORDSIZE - 8), addr) +#define __clear_bit_le(nr, addr) \ + __clear_bit((nr) ^ (__BITOPS_WORDSIZE - 8), addr) + +#define test_bit_le(nr, addr) \ + test_bit((nr) ^ (__BITOPS_WORDSIZE - 8), (addr)) +#define set_bit_le(nr, addr) \ + set_bit((nr) ^ (__BITOPS_WORDSIZE - 8), (addr)) +#define clear_bit_le(nr, addr) \ + clear_bit((nr) ^ (__BITOPS_WORDSIZE - 8), (addr)) + +#define test_and_set_bit_le(nr, addr) \ + test_and_set_bit((nr) ^ (__BITOPS_WORDSIZE - 8), (addr)) +#define test_and_clear_bit_le(nr, addr) \ + test_and_clear_bit((nr) ^ (__BITOPS_WORDSIZE - 8), (addr)) + +#define __test_and_set_bit_le(nr, addr) \ + __test_and_set_bit((nr) ^ (__BITOPS_WORDSIZE - 8), (addr)) +#define __test_and_clear_bit_le(nr, addr) \ + __test_and_clear_bit((nr) ^ (__BITOPS_WORDSIZE - 8), (addr)) + #endif /* __KERNEL__ */ #endif /* _S390_BITOPS_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/bug.h linux-2.6.32.ovz/arch/s390/include/asm/bug.h --- linux-2.6.32/arch/s390/include/asm/bug.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/bug.h 2015-03-10 13:21:40.000000000 -0700 @@ -52,14 +52,18 @@ for (;;); \ } while (0) +#define __WARN_TAINT(taint) do { \ + __EMIT_BUG(BUGFLAG_TAINT(taint)); \ +} while (0) + #define WARN_ON(x) ({ \ int __ret_warn_on = !!(x); \ if (__builtin_constant_p(__ret_warn_on)) { \ if (__ret_warn_on) \ - __EMIT_BUG(BUGFLAG_WARNING); \ + __WARN(); \ } else { \ if (unlikely(__ret_warn_on)) \ - __EMIT_BUG(BUGFLAG_WARNING); \ + __WARN(); \ } \ unlikely(__ret_warn_on); \ }) diff -uprN linux-2.6.32/arch/s390/include/asm/ccwdev.h linux-2.6.32.ovz/arch/s390/include/asm/ccwdev.h --- linux-2.6.32/arch/s390/include/asm/ccwdev.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/ccwdev.h 2015-03-10 13:22:27.000000000 -0700 @@ -91,6 +91,24 @@ struct ccw_device { void (*handler) (struct ccw_device *, unsigned long, struct irb *); }; +/* + * Possible events used by the path_event notifier. + */ +#define PE_NONE 0x0 +#define PE_PATH_GONE 0x1 /* A path is no longer available. */ +#define PE_PATH_AVAILABLE 0x2 /* A path has become available and + was successfully verified. */ +#define PE_PATHGROUP_ESTABLISHED 0x4 /* A pathgroup was reset and had + to be established again. */ + +/* + * Possible CIO actions triggered by the unit check handler. + */ +enum uc_todo { + UC_TODO_RETRY, + UC_TODO_RETRY_ON_NEW_PATH, + UC_TODO_STOP +}; /** * struct ccw driver - device driver for channel attached devices @@ -101,12 +119,14 @@ struct ccw_device { * @set_online: called when setting device online * @set_offline: called when setting device offline * @notify: notify driver of device state changes + * @path_event: notify driver of channel path events * @shutdown: called at device shutdown * @prepare: prepare for pm state transition * @complete: undo work done in @prepare * @freeze: callback for freezing during hibernation snapshotting * @thaw: undo work done in @freeze * @restore: callback for restoring after hibernation + * @uc_handler: callback for unit check handler * @driver: embedded device driver structure * @name: device driver name */ @@ -118,12 +138,14 @@ struct ccw_driver { int (*set_online) (struct ccw_device *); int (*set_offline) (struct ccw_device *); int (*notify) (struct ccw_device *, int); + void (*path_event) (struct ccw_device *, int *); void (*shutdown) (struct ccw_device *); int (*prepare) (struct ccw_device *); void (*complete) (struct ccw_device *); int (*freeze)(struct ccw_device *); int (*thaw) (struct ccw_device *); int (*restore)(struct ccw_device *); + enum uc_todo (*uc_handler) (struct ccw_device *, struct irb *); struct device_driver driver; char *name; }; @@ -142,6 +164,8 @@ struct ccw1; extern int ccw_device_set_options_mask(struct ccw_device *, unsigned long); extern int ccw_device_set_options(struct ccw_device *, unsigned long); extern void ccw_device_clear_options(struct ccw_device *, unsigned long); +int ccw_device_is_pathgroup(struct ccw_device *cdev); +int ccw_device_is_multipath(struct ccw_device *cdev); /* Allow for i/o completion notification after primary interrupt status. */ #define CCWDEV_EARLY_NOTIFICATION 0x0001 @@ -151,6 +175,8 @@ extern void ccw_device_clear_options(str #define CCWDEV_DO_PATHGROUP 0x0004 /* Allow forced onlining of boxed devices. */ #define CCWDEV_ALLOW_FORCE 0x0008 +/* Try to use multipath mode. */ +#define CCWDEV_DO_MULTIPATH 0x0010 extern int ccw_device_start(struct ccw_device *, struct ccw1 *, unsigned long, __u8, unsigned long); @@ -178,6 +204,8 @@ int ccw_device_tm_start_timeout(struct c unsigned long, u8, int); int ccw_device_tm_intrg(struct ccw_device *cdev); +int ccw_device_get_mdc(struct ccw_device *cdev, u8 mask); + extern int ccw_device_set_online(struct ccw_device *cdev); extern int ccw_device_set_offline(struct ccw_device *cdev); @@ -194,6 +222,8 @@ extern void ccw_device_get_id(struct ccw extern struct ccw_device *ccw_device_probe_console(void); extern int ccw_device_force_console(void); +int ccw_device_siosl(struct ccw_device *); + // FIXME: these have to go extern int _ccw_device_get_subchannel_number(struct ccw_device *); diff -uprN linux-2.6.32/arch/s390/include/asm/ccwgroup.h linux-2.6.32.ovz/arch/s390/include/asm/ccwgroup.h --- linux-2.6.32/arch/s390/include/asm/ccwgroup.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/ccwgroup.h 2015-03-10 13:23:55.000000000 -0700 @@ -71,6 +71,9 @@ int ccwgroup_create_from_string(struct d struct ccw_driver *cdrv, int num_devices, const char *buf); +extern int ccwgroup_set_online(struct ccwgroup_device *gdev); +extern int ccwgroup_set_offline(struct ccwgroup_device *gdev); + extern int ccwgroup_probe_ccwdev(struct ccw_device *cdev); extern void ccwgroup_remove_ccwdev(struct ccw_device *cdev); diff -uprN linux-2.6.32/arch/s390/include/asm/chsc.h linux-2.6.32.ovz/arch/s390/include/asm/chsc.h --- linux-2.6.32/arch/s390/include/asm/chsc.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/chsc.h 2015-03-10 13:23:44.000000000 -0700 @@ -125,32 +125,4 @@ struct chsc_cpd_info { #define CHSC_INFO_CPD _IOWR(CHSC_IOCTL_MAGIC, 0x87, struct chsc_cpd_info) #define CHSC_INFO_DCAL _IOWR(CHSC_IOCTL_MAGIC, 0x88, struct chsc_dcal) -#ifdef __KERNEL__ - -struct css_general_char { - u64 : 12; - u32 dynio : 1; /* bit 12 */ - u32 : 28; - u32 aif : 1; /* bit 41 */ - u32 : 3; - u32 mcss : 1; /* bit 45 */ - u32 fcs : 1; /* bit 46 */ - u32 : 1; - u32 ext_mb : 1; /* bit 48 */ - u32 : 7; - u32 aif_tdd : 1; /* bit 56 */ - u32 : 1; - u32 qebsm : 1; /* bit 58 */ - u32 : 8; - u32 aif_osa : 1; /* bit 67 */ - u32 : 14; - u32 cib : 1; /* bit 82 */ - u32 : 5; - u32 fcx : 1; /* bit 88 */ - u32 : 7; -}__attribute__((packed)); - -extern struct css_general_char css_general_characteristics; - -#endif /* __KERNEL__ */ #endif diff -uprN linux-2.6.32/arch/s390/include/asm/cio.h linux-2.6.32.ovz/arch/s390/include/asm/cio.h --- linux-2.6.32/arch/s390/include/asm/cio.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/cio.h 2015-03-10 13:23:44.000000000 -0700 @@ -85,6 +85,18 @@ struct erw { } __attribute__ ((packed)); /** + * struct erw_eadm - EADM Subchannel extended report word + * @b: aob error + * @r: arsb error + */ +struct erw_eadm { + __u32 : 16; + __u32 b : 1; + __u32 r : 1; + __u32 : 14; +} __packed; + +/** * struct sublog - subchannel logout area * @res0: reserved * @esf: extended status flags @@ -175,9 +187,22 @@ struct esw3 { } __attribute__ ((packed)); /** + * struct esw_eadm - EADM Subchannel Extended Status Word (ESW) + * @sublog: subchannel logout + * @erw: extended report word + */ +struct esw_eadm { + __u32 sublog; + struct erw_eadm erw; + __u32 : 32; + __u32 : 32; + __u32 : 32; +} __packed; + +/** * struct irb - interruption response block * @scsw: subchannel status word - * @esw: extened status word, 4 formats + * @esw: extened status word * @ecw: extended control word * * The irb that is handed to the device driver when an interrupt occurs. For @@ -196,6 +221,7 @@ struct irb { struct esw1 esw1; struct esw2 esw2; struct esw3 esw3; + struct esw_eadm eadm; } esw; __u8 ecw[32]; } __attribute__ ((packed,aligned(4))); diff -uprN linux-2.6.32/arch/s390/include/asm/compat.h linux-2.6.32.ovz/arch/s390/include/asm/compat.h --- linux-2.6.32/arch/s390/include/asm/compat.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/compat.h 2015-03-10 13:22:27.000000000 -0700 @@ -168,7 +168,7 @@ static inline compat_uptr_t ptr_to_compa static inline int is_compat_task(void) { - return test_thread_flag(TIF_31BIT); + return is_32bit_task(); } #else @@ -180,7 +180,7 @@ static inline int is_compat_task(void) #endif -static inline void __user *compat_alloc_user_space(long len) +static inline void __user *arch_compat_alloc_user_space(long len) { unsigned long stack; diff -uprN linux-2.6.32/arch/s390/include/asm/cpu_mf.h linux-2.6.32.ovz/arch/s390/include/asm/cpu_mf.h --- linux-2.6.32/arch/s390/include/asm/cpu_mf.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/cpu_mf.h 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,111 @@ +/* + * CPU-measurement facilities + * + * Copyright IBM Corp. 2012 + * Author(s): Hendrik Brueckner + * Jan Glauber + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + */ +#ifndef _ASM_S390_CPU_MF_H +#define _ASM_S390_CPU_MF_H + +#define CPU_MF_INT_CF_CACA (1 << 7) /* counter auth. change alert */ +#define CPU_MF_INT_CF_LCDA (1 << 6) /* loss of counter data alert */ +#define CPU_MF_INT_RI_HALTED (1 << 5) /* run-time instr. halted */ +#define CPU_MF_INT_RI_BUF_FULL (1 << 4) /* run-time instr. program + buffer full */ + +#define CPU_MF_INT_CF_MASK (CPU_MF_INT_CF_CACA|CPU_MF_INT_CF_LCDA) +#define CPU_MF_INT_RI_MASK (CPU_MF_INT_RI_HALTED|CPU_MF_INT_RI_BUF_FULL) + +/* CPU measurement facility support */ +static inline int cpum_cf_avail(void) +{ + unsigned long long facility_bits[2]; + + if (!MACHINE_HAS_SPP) + return 0; + + if (stfle(facility_bits, 2) <= 0) + return 0; + if (facility_bits[1] & (1ULL << 60)) + return 1; + else + return 0; +} + +static inline int cpum_sf_avail(void) +{ + unsigned long long facility_bits[2]; + + if (!MACHINE_HAS_SPP) + return 0; + + if (stfle(facility_bits, 2) <= 0) + return 0; + if (facility_bits[1] & (1ULL << 59)) + return 1; + else + return 0; +} + + +struct cpumf_ctr_info { + u16 cfvn; + u16 auth_ctl; + u16 enable_ctl; + u16 act_ctl; + u16 max_cpu; + u16 csvn; + u16 max_cg; + u16 reserved1; + u32 reserved2[12]; +} __packed; + +/* Query counter information */ +static inline int qctri(struct cpumf_ctr_info *info) +{ + int rc = -EINVAL; + + asm volatile ( + "0: .insn s,0xb28e0000,%1\n" + "1: lhi %0,0\n" + "2:\n" + EX_TABLE(1b, 2b) + : "+d" (rc), "=Q" (*info)); + return rc; +} + +/* Load CPU-counter-set controls */ +static inline int lcctl(u64 ctl) +{ + int cc; + + asm volatile ( + " .insn s,0xb2840000,%1\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (cc) : "m" (ctl) : "cc"); + return cc; +} + +/* Extract CPU counter */ +static inline int ecctr(u64 ctr, u64 *val) +{ + register u64 content asm("4") = 0; + int cc; + + asm volatile ( + " .insn rre,0xb2e40000,%0,%2\n" + " ipm %1\n" + " srl %1,28\n" + : "=d" (content), "=d" (cc) : "d" (ctr) : "cc"); + if (!cc) + *val = content; + return cc; +} + +#endif /* _ASM_S390_CPU_MF_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/cputime.h linux-2.6.32.ovz/arch/s390/include/asm/cputime.h --- linux-2.6.32/arch/s390/include/asm/cputime.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/cputime.h 2015-03-10 13:23:16.000000000 -0700 @@ -88,6 +88,23 @@ msecs_to_cputime(const unsigned int m) } /* + * Convert cputime to microseconds and back. + */ +static inline unsigned int +cputime_to_usecs(const cputime_t cputime) +{ + return cputime_div(cputime, 4096); +} + +static inline cputime_t +usecs_to_cputime(const unsigned int m) +{ + return (cputime_t) m * 4096; +} + +#define usecs_to_cputime64(m) usecs_to_cputime(m) + +/* * Convert cputime to milliseconds and back. */ static inline unsigned int @@ -183,6 +200,7 @@ struct s390_idle_data { unsigned long long idle_count; unsigned long long idle_enter; unsigned long long idle_time; + int nohz_delay; }; DECLARE_PER_CPU(struct s390_idle_data, s390_idle); @@ -198,4 +216,11 @@ static inline void s390_idle_check(void) vtime_start_cpu(); } +static inline int s390_nohz_delay(int cpu) +{ + return per_cpu(s390_idle, cpu).nohz_delay != 0; +} + +#define arch_needs_cpu(cpu) s390_nohz_delay(cpu) + #endif /* _S390_CPUTIME_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/crash.h linux-2.6.32.ovz/arch/s390/include/asm/crash.h --- linux-2.6.32/arch/s390/include/asm/crash.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/crash.h 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,61 @@ +#ifndef _S390_CRASH_H +#define _S390_CRASH_H + +#ifdef __KERNEL__ + +#include +#include + + +/* + * For swapped prefix pages get bounce buffer using xlate_dev_mem_ptr() + */ +static inline void *map_virtual(u64 offset, struct page **pp) +{ + struct page *page; + unsigned long pfn; + void *vaddr; + + vaddr = xlate_dev_mem_ptr(offset); + pfn = ((unsigned long) vaddr) >> PAGE_SHIFT; + if ((unsigned long) vaddr != offset) + page = pfn_to_page(pfn); + else + page = NULL; + + if (!page_is_ram(pfn)) { + printk(KERN_INFO + "crash memory driver: !page_is_ram(pfn: %lx)\n", pfn); + return NULL; + } + + if (!pfn_valid(pfn)) { + printk(KERN_INFO + "crash memory driver: invalid pfn: %lx )\n", pfn); + return NULL; + } + + *pp = page; + return vaddr; +} + +/* + * Free bounce buffer if necessary + */ +static inline void unmap_virtual(struct page *page) +{ + void *vaddr; + + if (page) { + /* + * Because for bounce buffers vaddr will never be 0 + * unxlate_dev_mem_ptr() will always free the bounce buffer. + */ + vaddr = (void *)(page_to_pfn(page) << PAGE_SHIFT); + unxlate_dev_mem_ptr(0, vaddr); + } +} + +#endif /* __KERNEL__ */ + +#endif /* _S390_CRASH_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/crw.h linux-2.6.32.ovz/arch/s390/include/asm/crw.h --- linux-2.6.32/arch/s390/include/asm/crw.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/crw.h 2015-03-10 13:22:07.000000000 -0700 @@ -32,6 +32,7 @@ typedef void (*crw_handler_t)(struct crw extern int crw_register_handler(int rsc, crw_handler_t handler); extern void crw_unregister_handler(int rsc); extern void crw_handle_channel_report(void); +void crw_wait_for_channel_report(void); #define NR_RSCS 16 diff -uprN linux-2.6.32/arch/s390/include/asm/css_chars.h linux-2.6.32.ovz/arch/s390/include/asm/css_chars.h --- linux-2.6.32/arch/s390/include/asm/css_chars.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/css_chars.h 2015-03-10 13:23:44.000000000 -0700 @@ -0,0 +1,39 @@ +#ifndef _ASM_CSS_CHARS_H +#define _ASM_CSS_CHARS_H + +#include + +#ifdef __KERNEL__ + +struct css_general_char { + u64 : 12; + u32 dynio : 1; /* bit 12 */ + u32 : 4; + u32 eadm : 1; /* bit 17 */ + u32 : 23; + u32 aif : 1; /* bit 41 */ + u32 : 3; + u32 mcss : 1; /* bit 45 */ + u32 fcs : 1; /* bit 46 */ + u32 : 1; + u32 ext_mb : 1; /* bit 48 */ + u32 : 7; + u32 aif_tdd : 1; /* bit 56 */ + u32 : 1; + u32 qebsm : 1; /* bit 58 */ + u32 : 8; + u32 aif_osa : 1; /* bit 67 */ + u32 : 12; + u32 eadm_rf : 1; /* bit 80 */ + u32 : 1; + u32 cib : 1; /* bit 82 */ + u32 : 5; + u32 fcx : 1; /* bit 88 */ + u32 : 19; + u32 alt_ssi : 1; /* bit 108 */ +} __packed; + +extern struct css_general_char css_general_characteristics; + +#endif /* __KERNEL__ */ +#endif diff -uprN linux-2.6.32/arch/s390/include/asm/dasd.h linux-2.6.32.ovz/arch/s390/include/asm/dasd.h --- linux-2.6.32/arch/s390/include/asm/dasd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/dasd.h 2015-03-10 13:22:27.000000000 -0700 @@ -73,6 +73,7 @@ typedef struct dasd_information2_t { * 0x02: use diag discipline (diag) * 0x04: set the device initially online (internal use only) * 0x08: enable ERP related logging + * 0x20: give access to raw eckd data */ #define DASD_FEATURE_DEFAULT 0x00 #define DASD_FEATURE_READONLY 0x01 @@ -80,6 +81,8 @@ typedef struct dasd_information2_t { #define DASD_FEATURE_INITIAL_ONLINE 0x04 #define DASD_FEATURE_ERPLOG 0x08 #define DASD_FEATURE_FAILFAST 0x10 +#define DASD_FEATURE_FAILONSLCK 0x20 +#define DASD_FEATURE_USERAW 0x40 #define DASD_PARTN_BITS 2 @@ -217,6 +220,25 @@ typedef struct dasd_symmio_parms { int rssd_result_len; } __attribute__ ((packed)) dasd_symmio_parms_t; +/* + * Data returned by Sense Path Group ID (SNID) + */ +struct dasd_snid_data { + struct { + __u8 group:2; + __u8 reserve:2; + __u8 mode:1; + __u8 res:3; + } __attribute__ ((packed)) path_state; + __u8 pgid[11]; +} __attribute__ ((packed)); + +struct dasd_snid_ioctl_data { + struct dasd_snid_data data; + __u8 path_mask; +} __attribute__ ((packed)); + + /******************************************************************************** * SECTION: Definition of IOCTLs * @@ -261,25 +283,10 @@ typedef struct dasd_symmio_parms { /* Set Attributes (cache operations) */ #define BIODASDSATTR _IOW(DASD_IOCTL_LETTER,2,attrib_data_t) +/* Get Sense Path Group ID (SNID) data */ +#define BIODASDSNID _IOWR(DASD_IOCTL_LETTER, 1, struct dasd_snid_ioctl_data) + #define BIODASDSYMMIO _IOWR(DASD_IOCTL_LETTER, 240, dasd_symmio_parms_t) #endif /* DASD_H */ -/* - * Overrides for Emacs so that we follow Linus's tabbing style. - * Emacs will notice this stuff at the end of the file and automatically - * adjust the settings for this buffer only. This must remain at the end - * of the file. - * --------------------------------------------------------------------------- - * Local variables: - * c-indent-level: 4 - * c-brace-imaginary-offset: 0 - * c-brace-offset: -4 - * c-argdecl-indent: 4 - * c-label-offset: -4 - * c-continued-statement-offset: 4 - * c-continued-brace-offset: 0 - * indent-tabs-mode: nil - * tab-width: 8 - * End: - */ diff -uprN linux-2.6.32/arch/s390/include/asm/debug.h linux-2.6.32.ovz/arch/s390/include/asm/debug.h --- linux-2.6.32/arch/s390/include/asm/debug.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/debug.h 2015-03-10 13:23:23.000000000 -0700 @@ -131,6 +131,7 @@ void debug_unregister(debug_info_t* id); void debug_set_level(debug_info_t* id, int new_level); +void debug_set_critical(void); void debug_stop_all(void); static inline debug_entry_t* diff -uprN linux-2.6.32/arch/s390/include/asm/diag.h linux-2.6.32.ovz/arch/s390/include/asm/diag.h --- linux-2.6.32/arch/s390/include/asm/diag.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/diag.h 2015-03-10 13:23:04.000000000 -0700 @@ -9,9 +9,22 @@ #define _ASM_S390_DIAG_H /* - * Diagnose 10: Release pages + * Diagnose 10: Release page range */ -extern void diag10(unsigned long addr); +static inline void diag10_range(unsigned long start_pfn, unsigned long num_pfn) +{ + unsigned long start_addr, end_addr; + + start_addr = start_pfn << PAGE_SHIFT; + end_addr = (start_pfn + num_pfn - 1) << PAGE_SHIFT; + + asm volatile( + "0: diag %0,%1,0x10\n" + "1:\n" + EX_TABLE(0b, 1b) + EX_TABLE(1b, 1b) + : : "a" (start_addr), "a" (end_addr)); +} /* * Diagnose 14: Input spool file manipulation diff -uprN linux-2.6.32/arch/s390/include/asm/eadm.h linux-2.6.32.ovz/arch/s390/include/asm/eadm.h --- linux-2.6.32/arch/s390/include/asm/eadm.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/eadm.h 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,130 @@ +#ifndef _ASM_S390_EADM_H +#define _ASM_S390_EADM_H + +#include +#include +#include + +struct arqb { + u64 data; + u16 fmt:4; + u16:12; + u16 cmd_code; + u16:16; + u16 msb_count; + u32 reserved[12]; +} __packed; + +#define ARQB_CMD_MOVE 1 + +struct arsb { + u16 fmt:4; + u32:28; + u8 ef; + u8:8; + u8 ecbi; + u8:8; + u8 fvf; + u16:16; + u8 eqc; + u32:32; + u64 fail_msb; + u64 fail_aidaw; + u64 fail_ms; + u64 fail_scm; + u32 reserved[4]; +} __packed; + +#define EQC_WR_PROHIBIT 22 + +struct msb { + u8 fmt:4; + u8 oc:4; + u8 flags; + u16:12; + u16 bs:4; + u32 blk_count; + u64 data_addr; + u64 scm_addr; + u64:64; +} __packed; + +struct aidaw { + u8 flags; + u32 :24; + u32 :32; + u64 data_addr; +} __packed; + +#define MSB_OC_CLEAR 0 +#define MSB_OC_READ 1 +#define MSB_OC_WRITE 2 +#define MSB_OC_RELEASE 3 + +#define MSB_FLAG_BNM 0x80 +#define MSB_FLAG_IDA 0x40 + +#define MSB_BS_4K 0 +#define MSB_BS_1M 1 + +#define AOB_NR_MSB 124 + +struct aob { + struct arqb request; + struct arsb response; + struct msb msb[AOB_NR_MSB]; +} __packed __aligned(PAGE_SIZE); + +struct aob_rq_header { + struct scm_device *scmdev; + char data[0]; +}; + +struct scm_device { + u64 address; + u64 size; + unsigned int nr_max_block; + struct device dev; + spinlock_t lock; + struct { + unsigned int persistence:4; + unsigned int oper_state:4; + unsigned int data_state:4; + unsigned int rank:4; + unsigned int release:1; + unsigned int res_id:8; + } __packed attrs; +}; + +#define OP_STATE_GOOD 1 +#define OP_STATE_TEMP_ERR 2 +#define OP_STATE_PERM_ERR 3 + +enum scm_event {SCM_CHANGE, SCM_AVAIL}; + +struct scm_driver { + struct device_driver drv; + int (*probe) (struct scm_device *scmdev); + int (*remove) (struct scm_device *scmdev); + void (*notify) (struct scm_device *scmdev, enum scm_event event); + void (*handler) (struct scm_device *scmdev, void *data, int error); +}; + +int scm_driver_register(struct scm_driver *scmdrv); +void scm_driver_unregister(struct scm_driver *scmdrv); + +int scm_start_aob(struct aob *aob); +void scm_irq_handler(struct aob *aob, int error); + +struct eadm_ops { + int (*eadm_start) (struct aob *aob); + struct module *owner; +}; + +int scm_get_ref(void); +void scm_put_ref(void); + +void register_eadm_ops(struct eadm_ops *ops); +void unregister_eadm_ops(struct eadm_ops *ops); + +#endif /* _ASM_S390_EADM_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/elf.h linux-2.6.32.ovz/arch/s390/include/asm/elf.h --- linux-2.6.32/arch/s390/include/asm/elf.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/elf.h 2015-03-10 13:23:48.000000000 -0700 @@ -103,6 +103,7 @@ #define HWCAP_S390_HPAGE 128 #define HWCAP_S390_ETF3EH 256 #define HWCAP_S390_HIGH_GPRS 512 +#define HWCAP_S390_TE 1024 /* * These are used to set parameters in the core dumps. @@ -162,7 +163,9 @@ extern unsigned int vdso_enabled; use of this is to invoke "./ld.so someprog" to test out a new version of the loader. We need to make sure that it is out of the way of the program that it will "exec", and that there is sufficient room for the brk. */ -#define ELF_ET_DYN_BASE (STACK_TOP / 3 * 2) + +extern unsigned long randomize_et_dyn(unsigned long base); +#define ELF_ET_DYN_BASE (randomize_et_dyn(STACK_TOP / 3 * 2)) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. */ @@ -207,6 +210,8 @@ do { \ current->mm->context.noexec == 0; \ }) +#define STACK_RND_MASK 0x7ffUL + #define ARCH_DLINFO \ do { \ if (vdso_enabled) \ @@ -219,4 +224,7 @@ struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 int arch_setup_additional_pages(struct linux_binprm *, int); +extern unsigned long arch_randomize_brk(struct mm_struct *mm); +#define arch_randomize_brk arch_randomize_brk + #endif diff -uprN linux-2.6.32/arch/s390/include/asm/hardirq.h linux-2.6.32.ovz/arch/s390/include/asm/hardirq.h --- linux-2.6.32/arch/s390/include/asm/hardirq.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/hardirq.h 2015-03-10 13:22:05.000000000 -0700 @@ -15,7 +15,6 @@ #include #include #include -#include #include #define local_softirq_pending() (S390_lowcore.softirq_pending) diff -uprN linux-2.6.32/arch/s390/include/asm/hugetlb.h linux-2.6.32.ovz/arch/s390/include/asm/hugetlb.h --- linux-2.6.32/arch/s390/include/asm/hugetlb.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/hugetlb.h 2015-03-10 13:23:43.000000000 -0700 @@ -92,15 +92,6 @@ static inline pte_t huge_ptep_get(pte_t return pte; } -static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - pte_t pte = huge_ptep_get(ptep); - - pmd_clear((pmd_t *) ptep); - return pte; -} - static inline void __pmd_csp(pmd_t *pmdp) { register unsigned long reg2 asm("2") = pmd_val(*pmdp); @@ -153,6 +144,15 @@ static inline void huge_ptep_invalidate( return; } +static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t pte = huge_ptep_get(ptep); + + huge_ptep_invalidate(mm, addr, ptep); + return pte; +} + #define huge_ptep_set_access_flags(__vma, __addr, __ptep, __entry, __dirty) \ ({ \ int __changed = !pte_same(huge_ptep_get(__ptep), __entry); \ @@ -167,9 +167,7 @@ static inline void huge_ptep_invalidate( ({ \ pte_t __pte = huge_ptep_get(__ptep); \ if (pte_write(__pte)) { \ - if (atomic_read(&(__mm)->mm_users) > 1 || \ - (__mm) != current->active_mm) \ - huge_ptep_invalidate(__mm, __addr, __ptep); \ + huge_ptep_invalidate(__mm, __addr, __ptep); \ set_huge_pte_at(__mm, __addr, __ptep, \ huge_pte_wrprotect(__pte)); \ } \ diff -uprN linux-2.6.32/arch/s390/include/asm/io.h linux-2.6.32.ovz/arch/s390/include/asm/io.h --- linux-2.6.32/arch/s390/include/asm/io.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/io.h 2015-03-10 13:23:55.000000000 -0700 @@ -38,11 +38,8 @@ static inline void * phys_to_virt(unsign return (void *) address; } -/* - * Convert a physical pointer to a virtual kernel pointer for /dev/mem - * access - */ -#define xlate_dev_mem_ptr(p) __va(p) +void *xlate_dev_mem_ptr(unsigned long phys); +void unxlate_dev_mem_ptr(unsigned long phys, void *addr); /* * Convert a virtual cached pointer to an uncached pointer diff -uprN linux-2.6.32/arch/s390/include/asm/ipl.h linux-2.6.32.ovz/arch/s390/include/asm/ipl.h --- linux-2.6.32/arch/s390/include/asm/ipl.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/ipl.h 2015-03-10 13:23:23.000000000 -0700 @@ -88,6 +88,8 @@ extern u32 ipl_flags; extern u32 dump_prefix_page; extern unsigned int zfcpdump_prefix_array[]; +extern unsigned long long elfcorehdr_size; + extern void do_reipl(void); extern void do_halt(void); extern void do_poff(void); @@ -167,5 +169,7 @@ enum diag308_rc { }; extern int diag308(unsigned long subcode, void *addr); +extern void store_status(void); +extern void lgr_info_log(void); #endif /* _ASM_S390_IPL_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/isc.h linux-2.6.32.ovz/arch/s390/include/asm/isc.h --- linux-2.6.32/arch/s390/include/asm/isc.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/isc.h 2015-03-10 13:23:44.000000000 -0700 @@ -14,6 +14,7 @@ /* Regular I/O interrupts. */ #define IO_SCH_ISC 3 /* regular I/O subchannels */ #define CONSOLE_ISC 1 /* console I/O subchannel */ +#define EADM_SCH_ISC 4 /* EADM subchannels */ #define CHSC_SCH_ISC 7 /* CHSC subchannels */ /* Adapter interrupts. */ #define QDIO_AIRQ_ISC IO_SCH_ISC /* I/O subchannel in qdio mode */ diff -uprN linux-2.6.32/arch/s390/include/asm/Kbuild linux-2.6.32.ovz/arch/s390/include/asm/Kbuild --- linux-2.6.32/arch/s390/include/asm/Kbuild 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/Kbuild 2015-03-10 13:24:09.000000000 -0700 @@ -9,6 +9,8 @@ header-y += vtoc.h header-y += zcrypt.h header-y += chsc.h +generic-y += trace_clock.h + unifdef-y += cmb.h unifdef-y += debug.h unifdef-y += chpid.h diff -uprN linux-2.6.32/arch/s390/include/asm/kexec.h linux-2.6.32.ovz/arch/s390/include/asm/kexec.h --- linux-2.6.32/arch/s390/include/asm/kexec.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/kexec.h 2015-03-10 13:23:14.000000000 -0700 @@ -30,14 +30,41 @@ /* Not more than 2GB */ #define KEXEC_CONTROL_MEMORY_LIMIT (1UL<<31) +/* Maximum address we can use for the crash control pages */ +#define KEXEC_CRASH_CONTROL_MEMORY_LIMIT (-1UL) + /* Allocate one page for the pdp and the second for the code */ #define KEXEC_CONTROL_PAGE_SIZE 4096 +/* Alignment of crashkernel memory */ +#define KEXEC_CRASH_MEM_ALIGN HPAGE_SIZE + /* The native architecture */ #define KEXEC_ARCH KEXEC_ARCH_S390 +/* + * Size for s390x ELF notes per CPU + * + * Seven notes plus zero note at the end: prstatus, fpregset, timer, + * tod_cmp, tod_reg, control regs, and prefix + */ +#define KEXEC_NOTE_BYTES \ + (ALIGN(sizeof(struct elf_note), 4) * 8 + \ + ALIGN(sizeof("CORE"), 4) * 7 + \ + ALIGN(sizeof(struct elf_prstatus), 4) + \ + ALIGN(sizeof(elf_fpregset_t), 4) + \ + ALIGN(sizeof(u64), 4) + \ + ALIGN(sizeof(u64), 4) + \ + ALIGN(sizeof(u32), 4) + \ + ALIGN(sizeof(u64) * 16, 4) + \ + ALIGN(sizeof(u32), 4) \ + ) + /* Provide a dummy definition to avoid build failures. */ static inline void crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs) { } +#define KEXEC_AUTO_RESERVED_SIZE (1ULL<<27) /* 128M */ +#define KEXEC_AUTO_THRESHOLD (1ULL<<32) /* 4G */ + #endif /*_S390_KEXEC_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/kvm.h linux-2.6.32.ovz/arch/s390/include/asm/kvm.h --- linux-2.6.32/arch/s390/include/asm/kvm.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/kvm.h 2015-03-10 13:21:03.000000000 -0700 @@ -1,6 +1,5 @@ #ifndef __LINUX_KVM_S390_H #define __LINUX_KVM_S390_H - /* * asm-s390/kvm.h - KVM s390 specific structures and definitions * @@ -15,6 +14,8 @@ */ #include +#define __KVM_S390 + /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { /* general purpose regs for s390 */ diff -uprN linux-2.6.32/arch/s390/include/asm/kvm_host.h linux-2.6.32.ovz/arch/s390/include/asm/kvm_host.h --- linux-2.6.32/arch/s390/include/asm/kvm_host.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/kvm_host.h 2015-03-10 13:24:00.000000000 -0700 @@ -144,6 +144,7 @@ struct kvm_vcpu_stat { u32 instruction_sigp_prefix; u32 instruction_sigp_restart; u32 diagnose_44; + u32 diagnose_9c; }; struct kvm_s390_io_info { diff -uprN linux-2.6.32/arch/s390/include/asm/kvm_para.h linux-2.6.32.ovz/arch/s390/include/asm/kvm_para.h --- linux-2.6.32/arch/s390/include/asm/kvm_para.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/kvm_para.h 2015-03-10 13:23:40.000000000 -0700 @@ -149,6 +149,11 @@ static inline unsigned int kvm_arch_para return 0; } +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} + #endif #endif /* __S390_KVM_PARA_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/local64.h linux-2.6.32.ovz/arch/s390/include/asm/local64.h --- linux-2.6.32/arch/s390/include/asm/local64.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/local64.h 2015-03-10 13:22:06.000000000 -0700 @@ -0,0 +1 @@ +#include diff -uprN linux-2.6.32/arch/s390/include/asm/lowcore.h linux-2.6.32.ovz/arch/s390/include/asm/lowcore.h --- linux-2.6.32/arch/s390/include/asm/lowcore.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/lowcore.h 2015-03-10 13:23:48.000000000 -0700 @@ -69,6 +69,7 @@ #define __LC_INT_CLOCK 0x02c8 #define __LC_MACHINE_FLAGS 0x02d8 #define __LC_FTRACE_FUNC 0x02dc +#define __LC_CURRENT_PID 0x02f0 #define __LC_IRB 0x0300 #define __LC_PFAULT_INTPARM 0x0080 #define __LC_CPU_TIMER_SAVE_AREA 0x00d8 @@ -116,6 +117,9 @@ #define __LC_VDSO_PER_CPU 0x0350 #define __LC_MACHINE_FLAGS 0x0358 #define __LC_FTRACE_FUNC 0x0360 +#define __LC_SIE_HOOK 0x0368 +#define __LC_CMF_HPP 0x0370 +#define __LC_CURRENT_PID 0x0378 #define __LC_IRB 0x0380 #define __LC_PASTE 0x03c0 #define __LC_PFAULT_INTPARM 0x11b8 @@ -143,6 +147,7 @@ void system_call(void); void pgm_check_handler(void); void mcck_int_handler(void); void io_int_handler(void); +void psw_restart_int_handler(void); struct save_area_s390 { u32 ext_save; @@ -293,7 +298,8 @@ struct _lowcore __u64 clock_comparator; /* 0x02d0 */ __u32 machine_flags; /* 0x02d8 */ __u32 ftrace_func; /* 0x02dc */ - __u8 pad_0x02f0[0x0300-0x02f0]; /* 0x02f0 */ + __u32 current_pid; /* 0x02f0 */ + __u8 pad_0x02f4[0x0300-0x02f4]; /* 0x02f4 */ /* Interrupt response block */ __u8 irb[64]; /* 0x0300 */ @@ -307,9 +313,10 @@ struct _lowcore */ __u32 ipib; /* 0x0e00 */ __u32 ipib_checksum; /* 0x0e04 */ + __u32 vmcore_info; /* 0x0e08 */ /* Align to the top 1k of prefix area */ - __u8 pad_0x0e08[0x1000-0x0e08]; /* 0x0e08 */ + __u8 pad_0x0e0c[0x1000-0x0e0c]; /* 0x0e0c */ #else /* !__s390x__ */ /* 0x0000 - 0x01ff: defined by architecture */ __u32 ccw1[2]; /* 0x0000 */ @@ -399,7 +406,9 @@ struct _lowcore __u64 vdso_per_cpu_data; /* 0x0350 */ __u64 machine_flags; /* 0x0358 */ __u64 ftrace_func; /* 0x0360 */ - __u8 pad_0x0368[0x0380-0x0368]; /* 0x0368 */ + __u64 sie_hook; /* 0x0368 */ + __u64 cmf_hpp; /* 0x0370 */ + __u64 current_pid; /* 0x0378 */ /* Interrupt response block. */ __u8 irb[64]; /* 0x0380 */ @@ -416,7 +425,8 @@ struct _lowcore */ __u64 ipib; /* 0x0e00 */ __u32 ipib_checksum; /* 0x0e08 */ - __u8 pad_0x0e0c[0x11b8-0x0e0c]; /* 0x0e0c */ + __u64 vmcore_info; /* 0x0e0c */ + __u8 pad_0x0e14[0x11b8-0x0e14]; /* 0x0e14 */ /* 64 bit extparam used for pfault/diag 250: defined by architecture */ __u64 ext_params2; /* 0x11B8 */ @@ -436,9 +446,13 @@ struct _lowcore __u8 pad_0x1338[0x1340-0x1338]; /* 0x1338 */ __u32 access_regs_save_area[16]; /* 0x1340 */ __u64 cregs_save_area[16]; /* 0x1380 */ + __u8 pad_0x1400[0x1800-0x1400]; /* 0x1400 */ + + /* Transaction abort diagnostic block */ + __u8 pgm_tdb[256]; /* 0x1800 */ /* align to the top of the prefix area */ - __u8 pad_0x1400[0x2000-0x1400]; /* 0x1400 */ + __u8 pad_0x1900[0x2000-0x1900]; /* 0x1900 */ #endif /* !__s390x__ */ } __attribute__((packed)); /* End structure*/ diff -uprN linux-2.6.32/arch/s390/include/asm/mman.h linux-2.6.32.ovz/arch/s390/include/asm/mman.h --- linux-2.6.32/arch/s390/include/asm/mman.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/mman.h 2015-03-10 13:23:53.000000000 -0700 @@ -12,8 +12,8 @@ #include #if defined(__KERNEL__) && !defined(__ASSEMBLY__) && defined(CONFIG_64BIT) -int s390_mmap_check(unsigned long addr, unsigned long len); -#define arch_mmap_check(addr,len,flags) s390_mmap_check(addr,len) +int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags); +#define arch_mmap_check(addr, len, flags) s390_mmap_check(addr, len, flags) #endif #endif /* __S390_MMAN_H__ */ diff -uprN linux-2.6.32/arch/s390/include/asm/mmu_context.h linux-2.6.32.ovz/arch/s390/include/asm/mmu_context.h --- linux-2.6.32/arch/s390/include/asm/mmu_context.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/mmu_context.h 2015-03-10 13:23:22.000000000 -0700 @@ -11,11 +11,13 @@ #include #include -#include +#include static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + atomic_set(&mm->context.attach_count, 0); + mm->context.flush_mm = 0; mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS; #ifdef CONFIG_64BIT mm->context.asce_bits |= _ASCE_TYPE_REGION3; @@ -36,7 +38,7 @@ static inline int init_new_context(struc mm->context.has_pgste = 1; mm->context.alloc_pgste = 1; } else { - mm->context.noexec = s390_noexec; + mm->context.noexec = (user_mode == SECONDARY_SPACE_MODE); mm->context.has_pgste = 0; mm->context.alloc_pgste = 0; } @@ -58,7 +60,7 @@ static inline void update_mm(struct mm_s pgd_t *pgd = mm->pgd; S390_lowcore.user_asce = mm->context.asce_bits | __pa(pgd); - if (switch_amode) { + if (user_mode != HOME_SPACE_MODE) { /* Load primary space page table origin. */ pgd = mm->context.noexec ? get_shadow_table(pgd) : pgd; S390_lowcore.user_exec_asce = mm->context.asce_bits | __pa(pgd); @@ -76,6 +78,12 @@ static inline void switch_mm(struct mm_s { cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); update_mm(next, tsk); + atomic_dec(&prev->context.attach_count); + WARN_ON(atomic_read(&prev->context.attach_count) < 0); + atomic_inc(&next->context.attach_count); + /* Check for TLBs not flushed yet */ + if (next->context.flush_mm) + __tlb_flush_mm(next); } #define enter_lazy_tlb(mm,tsk) do { } while (0) @@ -87,4 +95,15 @@ static inline void activate_mm(struct mm switch_mm(prev, next, current); } +static inline void arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) +{ + if (oldmm->context.asce_limit < mm->context.asce_limit) + crst_table_downgrade(mm, oldmm->context.asce_limit); +} + +static inline void arch_exit_mmap(struct mm_struct *mm) +{ +} + #endif /* __S390_MMU_CONTEXT_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/mmu.h linux-2.6.32.ovz/arch/s390/include/asm/mmu.h --- linux-2.6.32/arch/s390/include/asm/mmu.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/mmu.h 2015-03-10 13:21:44.000000000 -0700 @@ -2,6 +2,8 @@ #define __MMU_H typedef struct { + atomic_t attach_count; + unsigned int flush_mm; spinlock_t list_lock; struct list_head crst_list; struct list_head pgtable_list; diff -uprN linux-2.6.32/arch/s390/include/asm/module.h linux-2.6.32.ovz/arch/s390/include/asm/module.h --- linux-2.6.32/arch/s390/include/asm/module.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/module.h 2015-03-10 13:20:58.000000000 -0700 @@ -29,14 +29,17 @@ struct mod_arch_specific }; #ifdef __s390x__ +#define MODULES_ARE_ELF64 #define ElfW(x) Elf64_ ## x #define ELFW(x) ELF64_ ## x #else +#define MODULES_ARE_ELF32 #define ElfW(x) Elf32_ ## x #define ELFW(x) ELF32_ ## x #endif #define Elf_Addr ElfW(Addr) +#define Elf_Rel ElfW(Rel) #define Elf_Rela ElfW(Rela) #define Elf_Shdr ElfW(Shdr) #define Elf_Sym ElfW(Sym) diff -uprN linux-2.6.32/arch/s390/include/asm/mutex.h linux-2.6.32.ovz/arch/s390/include/asm/mutex.h --- linux-2.6.32/arch/s390/include/asm/mutex.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/mutex.h 2015-03-10 13:23:03.000000000 -0700 @@ -7,3 +7,5 @@ */ #include + +#define arch_mutex_cpu_relax() barrier() diff -uprN linux-2.6.32/arch/s390/include/asm/page.h linux-2.6.32.ovz/arch/s390/include/asm/page.h --- linux-2.6.32/arch/s390/include/asm/page.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/page.h 2015-03-10 13:22:07.000000000 -0700 @@ -129,6 +129,11 @@ struct page; void arch_free_page(struct page *page, int order); void arch_alloc_page(struct page *page, int order); +static inline int devmem_is_allowed(unsigned long pfn) +{ + return 0; +} + #define HAVE_ARCH_FREE_PAGE #define HAVE_ARCH_ALLOC_PAGE diff -uprN linux-2.6.32/arch/s390/include/asm/perf_event.h linux-2.6.32.ovz/arch/s390/include/asm/perf_event.h --- linux-2.6.32/arch/s390/include/asm/perf_event.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/perf_event.h 2015-03-10 13:23:55.000000000 -0700 @@ -1,10 +1,16 @@ /* * Performance event support - s390 specific definitions. * - * Copyright 2009 Martin Schwidefsky, IBM Corporation. + * Copyright IBM Corp. 2009, 2012 + * Author(s): Martin Schwidefsky + * Hendrik Brueckner */ -static inline void set_perf_event_pending(void) {} -static inline void clear_perf_event_pending(void) {} +#include -#define PERF_EVENT_INDEX_OFFSET 0 +/* CPU-measurement counter facility */ +#define PERF_CPUM_CF_MAX_CTR 256 + +/* Per-CPU flags for PMU states */ +#define PMU_F_RESERVED 0x1000 +#define PMU_F_ENABLED 0x2000 diff -uprN linux-2.6.32/arch/s390/include/asm/pgalloc.h linux-2.6.32.ovz/arch/s390/include/asm/pgalloc.h --- linux-2.6.32/arch/s390/include/asm/pgalloc.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/pgalloc.h 2015-03-10 13:20:59.000000000 -0700 @@ -143,7 +143,8 @@ static inline pgd_t *pgd_alloc(struct mm spin_lock_init(&mm->context.list_lock); INIT_LIST_HEAD(&mm->context.crst_list); INIT_LIST_HEAD(&mm->context.pgtable_list); - return (pgd_t *) crst_table_alloc(mm, s390_noexec); + return (pgd_t *) + crst_table_alloc(mm, user_mode == SECONDARY_SPACE_MODE); } #define pgd_free(mm, pgd) crst_table_free(mm, (unsigned long *) pgd) diff -uprN linux-2.6.32/arch/s390/include/asm/pgtable.h linux-2.6.32.ovz/arch/s390/include/asm/pgtable.h --- linux-2.6.32/arch/s390/include/asm/pgtable.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/pgtable.h 2015-03-10 13:24:14.000000000 -0700 @@ -105,35 +105,18 @@ extern char empty_zero_page[PAGE_SIZE]; #ifndef __ASSEMBLY__ /* * The vmalloc area will always be on the topmost area of the kernel - * mapping. We reserve 96MB (31bit) / 1GB (64bit) for vmalloc, + * mapping. We reserve 96MB (31bit) / 128GB (64bit) for vmalloc, * which should be enough for any sane case. * By putting vmalloc at the top, we maximise the gap between physical * memory and vmalloc to catch misplaced memory accesses. As a side * effect, this also makes sure that 64 bit module code cannot be used * as system call address. */ - extern unsigned long VMALLOC_START; +extern unsigned long VMALLOC_END; +extern struct page *vmemmap; -#ifndef __s390x__ -#define VMALLOC_SIZE (96UL << 20) -#define VMALLOC_END 0x7e000000UL -#define VMEM_MAP_END 0x80000000UL -#else /* __s390x__ */ -#define VMALLOC_SIZE (1UL << 30) -#define VMALLOC_END 0x3e040000000UL -#define VMEM_MAP_END 0x40000000000UL -#endif /* __s390x__ */ - -/* - * VMEM_MAX_PHYS is the highest physical address that can be added to the 1:1 - * mapping. This needs to be calculated at compile time since the size of the - * VMEM_MAP is static but the size of struct page can change. - */ -#define VMEM_MAX_PAGES ((VMEM_MAP_END - VMALLOC_END) / sizeof(struct page)) -#define VMEM_MAX_PFN min(VMALLOC_START >> PAGE_SHIFT, VMEM_MAX_PAGES) -#define VMEM_MAX_PHYS ((VMEM_MAX_PFN << PAGE_SHIFT) & ~((16 << 20) - 1)) -#define vmemmap ((struct page *) VMALLOC_END) +#define VMEM_MAX_PHYS ((unsigned long) vmemmap) /* * A 31 bit pagetable entry of S390 has following format: @@ -878,7 +861,8 @@ static inline void ptep_invalidate(struc #define ptep_get_and_clear(__mm, __address, __ptep) \ ({ \ pte_t __pte = *(__ptep); \ - if (atomic_read(&(__mm)->mm_users) > 1 || \ + (__mm)->context.flush_mm = 1; \ + if (atomic_read(&(__mm)->context.attach_count) > 1 || \ (__mm) != current->active_mm) \ ptep_invalidate(__mm, __address, __ptep); \ else \ @@ -921,7 +905,8 @@ static inline pte_t ptep_get_and_clear_f ({ \ pte_t __pte = *(__ptep); \ if (pte_write(__pte)) { \ - if (atomic_read(&(__mm)->mm_users) > 1 || \ + (__mm)->context.flush_mm = 1; \ + if (atomic_read(&(__mm)->context.attach_count) > 1 || \ (__mm) != current->active_mm) \ ptep_invalidate(__mm, __addr, __ptep); \ set_pte_at(__mm, __addr, __ptep, pte_wrprotect(__pte)); \ @@ -1117,6 +1102,10 @@ static inline pte_t mk_swap_pte(unsigned ((pte_t) { ((((__off) & 0x7f) << 1) + (((__off) >> 7) << 12)) \ | _PAGE_TYPE_FILE }) +/* TODO: s390 cannot support io_remap_pfn_range... */ +#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ + remap_pfn_range(vma, vaddr, pfn, size, prot) + #endif /* !__ASSEMBLY__ */ #define kern_addr_valid(addr) (1) diff -uprN linux-2.6.32/arch/s390/include/asm/processor.h linux-2.6.32.ovz/arch/s390/include/asm/processor.h --- linux-2.6.32/arch/s390/include/asm/processor.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/processor.h 2015-03-10 13:24:05.000000000 -0700 @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef __KERNEL__ /* @@ -86,8 +87,19 @@ struct thread_struct { unsigned long ieee_instruction_pointer; /* pfault_wait is used to block the process on a pfault event */ unsigned long pfault_wait; +#ifndef __GENKSYMS__ + /* cpu runtime instrumentation */ + struct runtime_instr_cb *ri_cb; + int ri_signum; + unsigned long per_flags; /* Flags to control debug behavior */ +#ifdef CONFIG_64BIT + unsigned char trap_tdb[256]; /* Transaction abort diagnose block */ +#endif +#endif }; +#define PER_FLAG_NO_TE 1UL /* Flag to disable transactions. */ + typedef struct thread_struct thread_struct; /* @@ -150,11 +162,6 @@ extern int kernel_thread(int (*fn)(void */ extern unsigned long thread_saved_pc(struct task_struct *t); -/* - * Print register of task into buffer. Used in fs/proc/array.c. - */ -extern void task_show_regs(struct seq_file *m, struct task_struct *task); - extern void show_code(struct pt_regs *regs); unsigned long get_wchan(struct task_struct *p); diff -uprN linux-2.6.32/arch/s390/include/asm/ptrace.h linux-2.6.32.ovz/arch/s390/include/asm/ptrace.h --- linux-2.6.32/arch/s390/include/asm/ptrace.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/ptrace.h 2015-03-10 13:24:04.000000000 -0700 @@ -201,6 +201,9 @@ typedef union typedef struct { __u32 fpc; +#ifndef __GENKSYMS__ + __u32 pad; +#endif freg_t fprs[NUM_FPRS]; } s390_fp_regs; @@ -208,7 +211,6 @@ typedef struct #define FPC_FLAGS_MASK 0x00F80000 #define FPC_DXC_MASK 0x0000FF00 #define FPC_RM_MASK 0x00000003 -#define FPC_VALID_MASK 0xF8F8FF03 /* this typedef defines how a Program Status Word looks like */ typedef struct @@ -236,6 +238,7 @@ typedef struct #define PSW_MASK_ASC 0x0000C000UL #define PSW_MASK_CC 0x00003000UL #define PSW_MASK_PM 0x00000F00UL +#define PSW_MASK_RI 0x00000000UL #define PSW_ADDR_AMODE 0x80000000UL #define PSW_ADDR_INSN 0x7FFFFFFFUL @@ -261,6 +264,7 @@ typedef struct #define PSW_MASK_ASC 0x0000C00000000000UL #define PSW_MASK_CC 0x0000300000000000UL #define PSW_MASK_PM 0x00000F0000000000UL +#define PSW_MASK_RI 0x0000008000000000UL #define PSW_ADDR_AMODE 0x0000000000000000UL #define PSW_ADDR_INSN 0xFFFFFFFFFFFFFFFFUL @@ -290,7 +294,7 @@ extern long psw_user32_bits; is the condition code and the program mask bits. */ #define PSW_MASK_MERGE(CURRENT,NEW) \ (((CURRENT) & ~(PSW_MASK_CC|PSW_MASK_PM)) | \ - ((NEW) & (PSW_MASK_CC|PSW_MASK_PM))) + ((NEW) & (PSW_MASK_CC|PSW_MASK_PM|PSW_MASK_RI))) /* * The s390_regs structure is used to define the elf_gregset_t. @@ -341,7 +345,7 @@ typedef struct unsigned long cr[NUM_CR_WORDS]; } per_cr_words; -#define PER_EM_MASK 0xE8000000UL +#define PER_EM_MASK 0xEB000000UL typedef struct { @@ -357,9 +361,17 @@ typedef struct unsigned em_storage_alteration : 1; unsigned em_gpr_alt_unused : 1; unsigned em_store_real_address : 1; +#ifdef __GENKSYMS__ unsigned : 3; unsigned branch_addr_ctl : 1; unsigned : 1; +#else + unsigned : 1; + unsigned em_transaction_end : 1; + unsigned em_nullification : 1; + unsigned branch_addr_ctl : 1; + unsigned suspension_ctl : 1; +#endif unsigned storage_alt_space_ctl : 1; unsigned : 21; unsigned long starting_addr; @@ -436,6 +448,9 @@ typedef struct #define PTRACE_PEEKDATA_AREA 0x5003 #define PTRACE_POKETEXT_AREA 0x5004 #define PTRACE_POKEDATA_AREA 0x5005 +#define PTRACE_GET_LAST_BREAK 0x5006 +#define PTRACE_ENABLE_TE 0x5009 +#define PTRACE_DISABLE_TE 0x5010 /* * PT_PROT definition is loosely based on hppa bsd definition in @@ -496,8 +511,13 @@ extern void user_disable_single_step(str #define user_mode(regs) (((regs)->psw.mask & PSW_MASK_PSTATE) != 0) #define instruction_pointer(regs) ((regs)->psw.addr & PSW_ADDR_INSN) #define user_stack_pointer(regs)((regs)->gprs[15]) -#define regs_return_value(regs)((regs)->gprs[2]) #define profile_pc(regs) instruction_pointer(regs) + +static inline long regs_return_value(struct pt_regs *regs) +{ + return regs->gprs[2]; +} + extern void show_regs(struct pt_regs * regs); #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff -uprN linux-2.6.32/arch/s390/include/asm/qdio.h linux-2.6.32.ovz/arch/s390/include/asm/qdio.h --- linux-2.6.32/arch/s390/include/asm/qdio.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/qdio.h 2015-03-10 13:24:09.000000000 -0700 @@ -13,7 +13,8 @@ #include #include -#define QDIO_MAX_QUEUES_PER_IRQ 32 +/* only use 4 queues to save some cachelines */ +#define QDIO_MAX_QUEUES_PER_IRQ 4 #define QDIO_MAX_BUFFERS_PER_Q 128 #define QDIO_MAX_BUFFERS_MASK (QDIO_MAX_BUFFERS_PER_Q - 1) #define QDIO_MAX_ELEMENTS_PER_BUFFER 16 @@ -45,6 +46,8 @@ struct qdesfmt0 { u32 : 16; } __attribute__ ((packed)); +#define QDR_AC_MULTI_BUFFER_ENABLE 0x01 + /** * struct qdr - queue description record (QDR) * @qfmt: queue format @@ -83,6 +86,7 @@ struct qdr { #define QIB_AC_OUTBOUND_PCI_SUPPORTED 0x40 #define QIB_RFLAGS_ENABLE_QEBSM 0x80 +#define QIB_RFLAGS_ENABLE_DATA_DIV 0x02 /** * struct qib - queue information block (QIB) @@ -120,6 +124,25 @@ struct slibe { u64 parms; }; + +struct qaob { + u64 res0[6]; + u8 res1; + u8 res2; + u8 res3; + u8 aorc; + u8 flags; + u16 cbtbs; + u8 sb_count; + u64 sba[QDIO_MAX_ELEMENTS_PER_BUFFER]; + u16 dcount[QDIO_MAX_ELEMENTS_PER_BUFFER]; + u64 user0; + u64 res4[2]; + u64 user1; + u64 user2; +} __attribute__ ((packed, aligned(256))); + + /** * struct slib - storage list information block (SLIB) * @nsliba: next SLIB address (if any) @@ -137,110 +160,47 @@ struct slib { struct slibe slibe[QDIO_MAX_BUFFERS_PER_Q]; } __attribute__ ((packed, aligned(2048))); -/** - * struct sbal_flags - storage block address list flags - * @last: last entry - * @cont: contiguous storage - * @frag: fragmentation - */ -struct sbal_flags { - u8 : 1; - u8 last : 1; - u8 cont : 1; - u8 : 1; - u8 frag : 2; - u8 : 2; -} __attribute__ ((packed)); - -#define SBAL_FLAGS_FIRST_FRAG 0x04000000UL -#define SBAL_FLAGS_MIDDLE_FRAG 0x08000000UL -#define SBAL_FLAGS_LAST_FRAG 0x0c000000UL -#define SBAL_FLAGS_LAST_ENTRY 0x40000000UL -#define SBAL_FLAGS_CONTIGUOUS 0x20000000UL +#define SBAL_EFLAGS_LAST_ENTRY 0x40 +#define SBAL_EFLAGS_CONTIGUOUS 0x20 +#define SBAL_EFLAGS_FIRST_FRAG 0x04 +#define SBAL_EFLAGS_MIDDLE_FRAG 0x08 +#define SBAL_EFLAGS_LAST_FRAG 0x0c +#define SBAL_EFLAGS_MASK 0x6f -#define SBAL_FLAGS0_DATA_CONTINUATION 0x20UL +#define SBAL_SFLAGS0_PCI_REQ 0x40 +#define SBAL_SFLAGS0_DATA_CONTINUATION 0x20 /* Awesome OpenFCP extensions */ -#define SBAL_FLAGS0_TYPE_STATUS 0x00UL -#define SBAL_FLAGS0_TYPE_WRITE 0x08UL -#define SBAL_FLAGS0_TYPE_READ 0x10UL -#define SBAL_FLAGS0_TYPE_WRITE_READ 0x18UL -#define SBAL_FLAGS0_MORE_SBALS 0x04UL -#define SBAL_FLAGS0_COMMAND 0x02UL -#define SBAL_FLAGS0_LAST_SBAL 0x00UL -#define SBAL_FLAGS0_ONLY_SBAL SBAL_FLAGS0_COMMAND -#define SBAL_FLAGS0_MIDDLE_SBAL SBAL_FLAGS0_MORE_SBALS -#define SBAL_FLAGS0_FIRST_SBAL SBAL_FLAGS0_MORE_SBALS | SBAL_FLAGS0_COMMAND -#define SBAL_FLAGS0_PCI 0x40 - -/** - * struct sbal_sbalf_0 - sbal flags for sbale 0 - * @pci: PCI indicator - * @cont: data continuation - * @sbtype: storage-block type (FCP) - */ -struct sbal_sbalf_0 { - u8 : 1; - u8 pci : 1; - u8 cont : 1; - u8 sbtype : 2; - u8 : 3; -} __attribute__ ((packed)); - -/** - * struct sbal_sbalf_1 - sbal flags for sbale 1 - * @key: storage key - */ -struct sbal_sbalf_1 { - u8 : 4; - u8 key : 4; -} __attribute__ ((packed)); - -/** - * struct sbal_sbalf_14 - sbal flags for sbale 14 - * @erridx: error index - */ -struct sbal_sbalf_14 { - u8 : 4; - u8 erridx : 4; -} __attribute__ ((packed)); - -/** - * struct sbal_sbalf_15 - sbal flags for sbale 15 - * @reason: reason for error state - */ -struct sbal_sbalf_15 { - u8 reason; -} __attribute__ ((packed)); - -/** - * union sbal_sbalf - storage block address list flags - * @i0: sbalf0 - * @i1: sbalf1 - * @i14: sbalf14 - * @i15: sblaf15 - * @value: raw value - */ -union sbal_sbalf { - struct sbal_sbalf_0 i0; - struct sbal_sbalf_1 i1; - struct sbal_sbalf_14 i14; - struct sbal_sbalf_15 i15; - u8 value; -}; +#define SBAL_SFLAGS0_TYPE_STATUS 0x00 +#define SBAL_SFLAGS0_TYPE_WRITE 0x08 +#define SBAL_SFLAGS0_TYPE_READ 0x10 +#define SBAL_SFLAGS0_TYPE_WRITE_READ 0x18 +#define SBAL_SFLAGS0_MORE_SBALS 0x04 +#define SBAL_SFLAGS0_COMMAND 0x02 +#define SBAL_SFLAGS0_LAST_SBAL 0x00 +#define SBAL_SFLAGS0_ONLY_SBAL SBAL_SFLAGS0_COMMAND +#define SBAL_SFLAGS0_MIDDLE_SBAL SBAL_SFLAGS0_MORE_SBALS +#define SBAL_SFLAGS0_FIRST_SBAL (SBAL_SFLAGS0_MORE_SBALS | SBAL_SFLAGS0_COMMAND) /** * struct qdio_buffer_element - SBAL entry - * @flags: flags + * @eflags: SBAL entry flags + * @scount: SBAL count + * @sflags: whole SBAL flags * @length: length * @addr: address */ struct qdio_buffer_element { - u32 flags; + u8 eflags; + /* private: */ + u8 res1; + /* public: */ + u8 scount; + u8 sflags; u32 length; #ifdef CONFIG_32BIT /* private: */ - void *reserved; + void *res2; /* public: */ #endif void *addr; @@ -283,6 +243,42 @@ struct slsb { u8 val[QDIO_MAX_BUFFERS_PER_Q]; } __attribute__ ((packed, aligned(256))); +/** + * struct qdio_outbuf_state - SBAL related asynchronous operation information + * (for communication with upper layer programs) + * (only required for use with completion queues) + * @flags: flags indicating state of buffer + * @aob: pointer to QAOB used for the particular SBAL + * @user: pointer to upper layer program's state information related to SBAL + * (stored in user1 data of QAOB) + */ +struct qdio_outbuf_state { + u8 flags; + struct qaob *aob; + void *user; +}; + +#define QDIO_OUTBUF_STATE_FLAG_NONE 0x00 +#define QDIO_OUTBUF_STATE_FLAG_PENDING 0x01 + +#define CHSC_AC1_INITIATE_INPUTQ 0x80 + +/* qdio adapter-characteristics-1 flag */ +#define AC1_SIGA_INPUT_NEEDED 0x40 /* process input queues */ +#define AC1_SIGA_OUTPUT_NEEDED 0x20 /* process output queues */ +#define AC1_SIGA_SYNC_NEEDED 0x10 /* ask hypervisor to sync */ +#define AC1_AUTOMATIC_SYNC_ON_THININT 0x08 /* set by hypervisor */ +#define AC1_AUTOMATIC_SYNC_ON_OUT_PCI 0x04 /* set by hypervisor */ +#define AC1_SC_QEBSM_AVAILABLE 0x02 /* available for subchannel */ +#define AC1_SC_QEBSM_ENABLED 0x01 /* enabled for subchannel */ + +#define CHSC_AC3_FORMAT2_CQ_AVAILABLE 0x8000 + +#define CHSC_AC2_MULTI_BUFFER_AVAILABLE 0x0080 +#define CHSC_AC2_MULTI_BUFFER_ENABLED 0x0040 +#define CHSC_AC2_DATA_DIV_AVAILABLE 0x0010 +#define CHSC_AC2_DATA_DIV_ENABLED 0x0002 + struct qdio_ssqd_desc { u8 flags; u8:8; @@ -301,8 +297,7 @@ struct qdio_ssqd_desc { u64 sch_token; u8 mro; u8 mri; - u8:8; - u8 sbalic; + u16 qdioac3; u16:16; u8:8; u8 mmwc; @@ -336,33 +331,41 @@ typedef void qdio_handler_t(struct ccw_d * @adapter_name: name for the adapter * @qib_param_field_format: format for qib_parm_field * @qib_param_field: pointer to 128 bytes or NULL, if no param field + * @qib_rflags: rflags to set * @input_slib_elements: pointer to no_input_qs * 128 words of data or NULL * @output_slib_elements: pointer to no_output_qs * 128 words of data or NULL * @no_input_qs: number of input queues * @no_output_qs: number of output queues * @input_handler: handler to be called for input queues * @output_handler: handler to be called for output queues + * @queue_start_poll_array: polling handlers (one per input queue or NULL) * @int_parm: interruption parameter * @flags: initialization flags * @input_sbal_addr_array: address of no_input_qs * 128 pointers * @output_sbal_addr_array: address of no_output_qs * 128 pointers + * @output_sbal_state_array: no_output_qs * 128 state info (for CQ or NULL) */ struct qdio_initialize { struct ccw_device *cdev; unsigned char q_format; + unsigned char qdr_ac; unsigned char adapter_name[8]; unsigned int qib_param_field_format; unsigned char *qib_param_field; + unsigned char qib_rflags; unsigned long *input_slib_elements; unsigned long *output_slib_elements; unsigned int no_input_qs; unsigned int no_output_qs; qdio_handler_t *input_handler; qdio_handler_t *output_handler; + void (**queue_start_poll_array) (struct ccw_device *, int, + unsigned long); unsigned long int_parm; unsigned long flags; void **input_sbal_addr_array; void **output_sbal_addr_array; + struct qdio_outbuf_state *output_sbal_state_array; }; #define QDIO_STATE_INACTIVE 0x00000002 /* after qdio_cleanup */ @@ -374,14 +377,16 @@ struct qdio_initialize { #define QDIO_FLAG_SYNC_OUTPUT 0x02 #define QDIO_FLAG_PCI_OUT 0x10 -extern int qdio_initialize(struct qdio_initialize *); extern int qdio_allocate(struct qdio_initialize *); extern int qdio_establish(struct qdio_initialize *); extern int qdio_activate(struct ccw_device *); +extern void qdio_release_aob(struct qaob *); extern int do_QDIO(struct ccw_device *cdev, unsigned int callflags, int q_nr, unsigned int bufnr, unsigned int count); -extern int qdio_cleanup(struct ccw_device*, int); +extern int qdio_start_irq(struct ccw_device *, int); +extern int qdio_stop_irq(struct ccw_device *, int); +extern int qdio_get_next_buffers(struct ccw_device *, int, int *, int *); extern int qdio_shutdown(struct ccw_device*, int); extern int qdio_free(struct ccw_device *); extern int qdio_get_ssqd_desc(struct ccw_device *dev, struct qdio_ssqd_desc*); diff -uprN linux-2.6.32/arch/s390/include/asm/qeth.h linux-2.6.32.ovz/arch/s390/include/asm/qeth.h --- linux-2.6.32/arch/s390/include/asm/qeth.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/qeth.h 2015-03-10 13:23:03.000000000 -0700 @@ -28,39 +28,70 @@ struct qeth_arp_cache_entry { __u8 reserved2[32]; } __attribute__ ((packed)); +enum qeth_arp_ipaddrtype { + QETHARP_IP_ADDR_V4 = 1, + QETHARP_IP_ADDR_V6 = 2, +}; +struct qeth_arp_entrytype { + __u8 mac; + __u8 ip; +} __attribute__((packed)); + +#define QETH_QARP_MEDIASPECIFIC_BYTES 32 +#define QETH_QARP_MACADDRTYPE_BYTES 1 struct qeth_arp_qi_entry7 { - __u8 media_specific[32]; - __u8 macaddr_type; - __u8 ipaddr_type; + __u8 media_specific[QETH_QARP_MEDIASPECIFIC_BYTES]; + struct qeth_arp_entrytype type; __u8 macaddr[6]; __u8 ipaddr[4]; } __attribute__((packed)); +struct qeth_arp_qi_entry7_ipv6 { + __u8 media_specific[QETH_QARP_MEDIASPECIFIC_BYTES]; + struct qeth_arp_entrytype type; + __u8 macaddr[6]; + __u8 ipaddr[16]; +} __attribute__((packed)); + struct qeth_arp_qi_entry7_short { - __u8 macaddr_type; - __u8 ipaddr_type; + struct qeth_arp_entrytype type; __u8 macaddr[6]; __u8 ipaddr[4]; } __attribute__((packed)); +struct qeth_arp_qi_entry7_short_ipv6 { + struct qeth_arp_entrytype type; + __u8 macaddr[6]; + __u8 ipaddr[16]; +} __attribute__((packed)); + struct qeth_arp_qi_entry5 { - __u8 media_specific[32]; - __u8 macaddr_type; - __u8 ipaddr_type; + __u8 media_specific[QETH_QARP_MEDIASPECIFIC_BYTES]; + struct qeth_arp_entrytype type; __u8 ipaddr[4]; } __attribute__((packed)); +struct qeth_arp_qi_entry5_ipv6 { + __u8 media_specific[QETH_QARP_MEDIASPECIFIC_BYTES]; + struct qeth_arp_entrytype type; + __u8 ipaddr[16]; +} __attribute__((packed)); + struct qeth_arp_qi_entry5_short { - __u8 macaddr_type; - __u8 ipaddr_type; + struct qeth_arp_entrytype type; __u8 ipaddr[4]; } __attribute__((packed)); +struct qeth_arp_qi_entry5_short_ipv6 { + struct qeth_arp_entrytype type; + __u8 ipaddr[16]; +} __attribute__((packed)); /* * can be set by user if no "media specific information" is wanted * -> saves a lot of space in user space buffer */ #define QETH_QARP_STRIP_ENTRIES 0x8000 +#define QETH_QARP_WITH_IPV6 0x4000 #define QETH_QARP_REQUEST_MASK 0x00ff /* data sent to user space as result of query arp ioctl */ diff -uprN linux-2.6.32/arch/s390/include/asm/reset.h linux-2.6.32.ovz/arch/s390/include/asm/reset.h --- linux-2.6.32/arch/s390/include/asm/reset.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/reset.h 2015-03-10 13:23:04.000000000 -0700 @@ -17,5 +17,5 @@ struct reset_call { extern void register_reset_call(struct reset_call *reset); extern void unregister_reset_call(struct reset_call *reset); -extern void s390_reset_system(void); +extern void s390_reset_system(void (*func)(void *), void *data); #endif /* _ASM_S390_RESET_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/runtime_instr.h linux-2.6.32.ovz/arch/s390/include/asm/runtime_instr.h --- linux-2.6.32/arch/s390/include/asm/runtime_instr.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/runtime_instr.h 2015-03-10 13:23:48.000000000 -0700 @@ -0,0 +1,98 @@ +#ifndef _RUNTIME_INSTR_H +#define _RUNTIME_INSTR_H + +#define S390_RUNTIME_INSTR_START 0x1 +#define S390_RUNTIME_INSTR_STOP 0x2 + +struct runtime_instr_cb { + __u64 buf_current; + __u64 buf_origin; + __u64 buf_limit; + + __u32 valid : 1; + __u32 pstate : 1; + __u32 pstate_set_buf : 1; + __u32 home_space : 1; + __u32 altered : 1; + __u32 : 3; + __u32 pstate_sample : 1; + __u32 sstate_sample : 1; + __u32 pstate_collect : 1; + __u32 sstate_collect : 1; + __u32 : 1; + __u32 halted_int : 1; + __u32 int_requested : 1; + __u32 buffer_full_int : 1; + __u32 key : 4; + __u32 : 9; + __u32 rgs : 3; + + __u32 mode : 4; + __u32 next : 1; + __u32 mae : 1; + __u32 : 2; + __u32 call_type_br : 1; + __u32 return_type_br : 1; + __u32 other_type_br : 1; + __u32 bc_other_type : 1; + __u32 emit : 1; + __u32 tx_abort : 1; + __u32 : 2; + __u32 bp_xn : 1; + __u32 bp_xt : 1; + __u32 bp_ti : 1; + __u32 bp_ni : 1; + __u32 suppr_y : 1; + __u32 suppr_z : 1; + + __u32 dc_miss_extra : 1; + __u32 lat_lev_ignore : 1; + __u32 ic_lat_lev : 4; + __u32 dc_lat_lev : 4; + + __u64 reserved1; + __u64 scaling_factor; + __u64 rsic; + __u64 reserved2; +} __packed __aligned(8); + +extern struct runtime_instr_cb runtime_instr_empty_cb; + +static inline void load_runtime_instr_cb(struct runtime_instr_cb *cb) +{ + asm volatile(".insn rsy,0xeb0000000060,0,0,%0" /* LRIC */ + : : "Q" (*cb)); +} + +static inline void store_runtime_instr_cb(struct runtime_instr_cb *cb) +{ + asm volatile(".insn rsy,0xeb0000000061,0,0,%0" /* STRIC */ + : "=Q" (*cb) : : "cc"); +} + +static inline void save_ri_cb(struct runtime_instr_cb *cb_prev) +{ +#ifdef CONFIG_64BIT + if (cb_prev) + store_runtime_instr_cb(cb_prev); +#endif +} + +static inline void restore_ri_cb(struct runtime_instr_cb *cb_next, + struct runtime_instr_cb *cb_prev) +{ +#ifdef CONFIG_64BIT + if (cb_next) + load_runtime_instr_cb(cb_next); + else if (cb_prev) + load_runtime_instr_cb(&runtime_instr_empty_cb); +#endif +} + +#ifdef CONFIG_64BIT +extern void exit_thread_runtime_instr(void); +#else +static inline void exit_thread_runtime_instr(void) { } +#endif + +#endif /* _RUNTIME_INSTR_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/s390_ext.h linux-2.6.32.ovz/arch/s390/include/asm/s390_ext.h --- linux-2.6.32/arch/s390/include/asm/s390_ext.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/s390_ext.h 2015-03-10 13:23:48.000000000 -0700 @@ -29,4 +29,7 @@ int unregister_external_interrupt(__u16 int unregister_early_external_interrupt(__u16 code, ext_int_handler_t handler, ext_int_info_t *info); +void measurement_alert_subclass_register(void); +void measurement_alert_subclass_unregister(void); + #endif diff -uprN linux-2.6.32/arch/s390/include/asm/scsw.h linux-2.6.32.ovz/arch/s390/include/asm/scsw.h --- linux-2.6.32/arch/s390/include/asm/scsw.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/scsw.h 2015-03-10 13:23:44.000000000 -0700 @@ -1,7 +1,7 @@ /* * Helper functions for scsw access. * - * Copyright IBM Corp. 2008,2009 + * Copyright IBM Corp. 2008, 2012 * Author(s): Peter Oberparleiter */ @@ -9,7 +9,7 @@ #define _ASM_S390_SCSW_H_ #include -#include +#include #include /** @@ -100,14 +100,46 @@ struct tm_scsw { } __attribute__ ((packed)); /** + * struct eadm_scsw - subchannel status word for eadm subchannels + * @key: subchannel key + * @eswf: esw format + * @cc: deferred condition code + * @ectl: extended control + * @fctl: function control + * @actl: activity control + * @stctl: status control + * @aob: AOB address + * @dstat: device status + * @cstat: subchannel status + */ +struct eadm_scsw { + u32 key:4; + u32:1; + u32 eswf:1; + u32 cc:2; + u32:6; + u32 ectl:1; + u32:2; + u32 fctl:3; + u32 actl:7; + u32 stctl:5; + u32 aob; + u32 dstat:8; + u32 cstat:8; + u32:16; +} __packed; + +/** * union scsw - subchannel status word * @cmd: command-mode SCSW * @tm: transport-mode SCSW + * @eadm: eadm SCSW */ union scsw { struct cmd_scsw cmd; struct tm_scsw tm; -} __attribute__ ((packed)); + struct eadm_scsw eadm; +} __packed; #define SCSW_FCTL_CLEAR_FUNC 0x1 #define SCSW_FCTL_HALT_FUNC 0x2 diff -uprN linux-2.6.32/arch/s390/include/asm/setup.h linux-2.6.32.ovz/arch/s390/include/asm/setup.h --- linux-2.6.32/arch/s390/include/asm/setup.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/setup.h 2015-03-10 13:23:48.000000000 -0700 @@ -2,7 +2,7 @@ * include/asm-s390/setup.h * * S390 version - * Copyright IBM Corp. 1999,2006 + * Copyright IBM Corp. 1999,2010 */ #ifndef _ASM_S390_SETUP_H @@ -26,15 +26,21 @@ #define IPL_DEVICE (*(unsigned long *) (0x10404)) #define INITRD_START (*(unsigned long *) (0x1040C)) #define INITRD_SIZE (*(unsigned long *) (0x10414)) +#define OLDMEM_BASE (*(unsigned long *) (0x1041C)) +#define OLDMEM_SIZE (*(unsigned long *) (0x10424)) #else /* __s390x__ */ #define IPL_DEVICE (*(unsigned long *) (0x10400)) #define INITRD_START (*(unsigned long *) (0x10408)) #define INITRD_SIZE (*(unsigned long *) (0x10410)) +#define OLDMEM_BASE (*(unsigned long *) (0x10418)) +#define OLDMEM_SIZE (*(unsigned long *) (0x10420)) #endif /* __s390x__ */ #define COMMAND_LINE ((char *) (0x10480)) #define CHUNK_READ_WRITE 0 #define CHUNK_READ_ONLY 1 +#define CHUNK_OLDMEM 4 +#define CHUNK_CRASHK 5 struct mem_chunk { unsigned long addr; @@ -48,18 +54,15 @@ extern int memory_end_set; extern unsigned long memory_end; void detect_memory_layout(struct mem_chunk chunk[]); +void create_mem_hole(struct mem_chunk memory_chunk[], unsigned long addr, + unsigned long size, int type); -#ifdef CONFIG_S390_SWITCH_AMODE -extern unsigned int switch_amode; -#else -#define switch_amode (0) -#endif - -#ifdef CONFIG_S390_EXEC_PROTECT -extern unsigned int s390_noexec; -#else -#define s390_noexec (0) -#endif +#define PRIMARY_SPACE_MODE 0 +#define ACCESS_REGISTER_MODE 1 +#define SECONDARY_SPACE_MODE 2 +#define HOME_SPACE_MODE 3 + +extern unsigned int user_mode; /* * Machine features detected in head.S @@ -76,6 +79,8 @@ extern unsigned int s390_noexec; #define MACHINE_FLAG_KVM (1UL << 9) #define MACHINE_FLAG_HPAGE (1UL << 10) #define MACHINE_FLAG_PFMF (1UL << 11) +#define MACHINE_FLAG_SPP (1UL << 13) +#define MACHINE_FLAG_TE (1UL << 14) #define MACHINE_IS_VM (S390_lowcore.machine_flags & MACHINE_FLAG_VM) #define MACHINE_IS_KVM (S390_lowcore.machine_flags & MACHINE_FLAG_KVM) @@ -90,6 +95,8 @@ extern unsigned int s390_noexec; #define MACHINE_HAS_MVCOS (0) #define MACHINE_HAS_HPAGE (0) #define MACHINE_HAS_PFMF (0) +#define MACHINE_HAS_SPP (0) +#define MACHINE_HAS_TE (0) #else /* __s390x__ */ #define MACHINE_HAS_IEEE (1) #define MACHINE_HAS_CSP (1) @@ -99,9 +106,12 @@ extern unsigned int s390_noexec; #define MACHINE_HAS_MVCOS (S390_lowcore.machine_flags & MACHINE_FLAG_MVCOS) #define MACHINE_HAS_HPAGE (S390_lowcore.machine_flags & MACHINE_FLAG_HPAGE) #define MACHINE_HAS_PFMF (S390_lowcore.machine_flags & MACHINE_FLAG_PFMF) +#define MACHINE_HAS_SPP (S390_lowcore.machine_flags & MACHINE_FLAG_SPP) +#define MACHINE_HAS_TE (S390_lowcore.machine_flags & MACHINE_FLAG_TE) #endif /* __s390x__ */ #define ZFCPDUMP_HSA_SIZE (32UL<<20) +#define ZFCPDUMP_HSA_SIZE_MAX (64UL<<20) /* * Console mode. Override with conmode= @@ -130,10 +140,14 @@ extern char kernel_nss_name[]; #define IPL_DEVICE 0x10404 #define INITRD_START 0x1040C #define INITRD_SIZE 0x10414 +#define OLDMEM_BASE 0x1041C +#define OLDMEM_SIZE 0x10424 #else /* __s390x__ */ #define IPL_DEVICE 0x10400 #define INITRD_START 0x10408 #define INITRD_SIZE 0x10410 +#define OLDMEM_BASE 0x10418 +#define OLDMEM_SIZE 0x10420 #endif /* __s390x__ */ #define COMMAND_LINE 0x10480 diff -uprN linux-2.6.32/arch/s390/include/asm/sigcontext.h linux-2.6.32.ovz/arch/s390/include/asm/sigcontext.h --- linux-2.6.32/arch/s390/include/asm/sigcontext.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/sigcontext.h 2015-03-10 13:24:04.000000000 -0700 @@ -51,6 +51,9 @@ typedef struct typedef struct { unsigned int fpc; +#ifndef __GENKSYMS__ + unsigned int pad; +#endif double fprs[__NUM_FPRS]; } _s390_fp_regs; diff -uprN linux-2.6.32/arch/s390/include/asm/sigp.h linux-2.6.32.ovz/arch/s390/include/asm/sigp.h --- linux-2.6.32/arch/s390/include/asm/sigp.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/sigp.h 2015-03-10 13:23:04.000000000 -0700 @@ -15,11 +15,19 @@ #ifndef __SIGP__ #define __SIGP__ -#include -#include +#include /* get real cpu address from logical cpu number */ -extern volatile int __cpu_logical_map[]; +extern int __cpu_logical_map[]; + +static inline int cpu_logical_map(int cpu) +{ +#ifdef CONFIG_SMP + return __cpu_logical_map[cpu]; +#else + return stap(); +#endif +} typedef enum { @@ -62,6 +70,7 @@ typedef enum ec_schedule=0, ec_call_function, ec_call_function_single, + ec_stop_cpu, ec_bit_last } ec_bit_sig; @@ -79,7 +88,7 @@ signal_processor(__u16 cpu_addr, sigp_or " ipm %0\n" " srl %0,28\n" : "=d" (ccode) - : "d" (reg1), "d" (__cpu_logical_map[cpu_addr]), + : "d" (reg1), "d" (cpu_logical_map(cpu_addr)), "a" (order_code) : "cc" , "memory"); return ccode; } @@ -98,7 +107,7 @@ signal_processor_p(__u32 parameter, __u1 " ipm %0\n" " srl %0,28\n" : "=d" (ccode) - : "d" (reg1), "d" (__cpu_logical_map[cpu_addr]), + : "d" (reg1), "d" (cpu_logical_map(cpu_addr)), "a" (order_code) : "cc" , "memory"); return ccode; } @@ -118,10 +127,28 @@ signal_processor_ps(__u32 *statusptr, __ " ipm %0\n" " srl %0,28\n" : "=d" (ccode), "+d" (reg1) - : "d" (__cpu_logical_map[cpu_addr]), "a" (order_code) + : "d" (cpu_logical_map(cpu_addr)), "a" (order_code) : "cc" , "memory"); *statusptr = reg1; return ccode; } +/* + * Signal processor + */ +static inline int raw_sigp(u16 cpu, int order) +{ + register unsigned long reg1 asm ("1") = 0; + int ccode; + + asm volatile( + " sigp %1,%2,0(%3)\n" + " ipm %0\n" + " srl %0,28\n" + : "=d" (ccode) + : "d" (reg1), "d" (cpu), + "a" (order) : "cc" , "memory"); + return ccode; +} + #endif /* __SIGP__ */ diff -uprN linux-2.6.32/arch/s390/include/asm/smp.h linux-2.6.32.ovz/arch/s390/include/asm/smp.h --- linux-2.6.32/arch/s390/include/asm/smp.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/smp.h 2015-03-10 13:24:15.000000000 -0700 @@ -35,6 +35,7 @@ typedef struct extern void machine_restart_smp(char *); extern void machine_halt_smp(void); extern void machine_power_off_smp(void); +extern void smp_restart_with_online_cpu(void); #define NO_PROC_ID 0xFF /* No processor magic marker */ @@ -51,7 +52,6 @@ extern void machine_power_off_smp(void); #define PROC_CHANGE_PENALTY 20 /* Schedule penalty */ #define raw_smp_processor_id() (S390_lowcore.cpu_nr) -#define cpu_logical_map(cpu) (cpu) extern int __cpu_disable (void); extern void __cpu_die (unsigned int cpu); diff -uprN linux-2.6.32/arch/s390/include/asm/socket.h linux-2.6.32.ovz/arch/s390/include/asm/socket.h --- linux-2.6.32/arch/s390/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/socket.h 2015-03-10 13:24:07.000000000 -0700 @@ -30,7 +30,7 @@ #define SO_PRIORITY 12 #define SO_LINGER 13 #define SO_BSDCOMPAT 14 -/* To add :#define SO_REUSEPORT 15 */ +#define SO_REUSEPORT 15 #define SO_PASSCRED 16 #define SO_PEERCRED 17 #define SO_RCVLOWAT 18 @@ -68,4 +68,9 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 + +#define SO_BUSY_POLL 46 +#define SO_BPF_EXTENSIONS 48 + #endif /* _ASM_SOCKET_H */ diff -uprN linux-2.6.32/arch/s390/include/asm/sparsemem.h linux-2.6.32.ovz/arch/s390/include/asm/sparsemem.h --- linux-2.6.32/arch/s390/include/asm/sparsemem.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/sparsemem.h 2015-03-10 13:23:58.000000000 -0700 @@ -4,8 +4,13 @@ #ifdef CONFIG_64BIT #define SECTION_SIZE_BITS 28 +#ifdef __GENKSYMS__ #define MAX_PHYSADDR_BITS 42 #define MAX_PHYSMEM_BITS 42 +#else +#define MAX_PHYSADDR_BITS 46 +#define MAX_PHYSMEM_BITS 46 +#endif #else diff -uprN linux-2.6.32/arch/s390/include/asm/statfs.h linux-2.6.32.ovz/arch/s390/include/asm/statfs.h --- linux-2.6.32/arch/s390/include/asm/statfs.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/statfs.h 2015-03-10 13:22:28.000000000 -0700 @@ -33,7 +33,8 @@ struct statfs { __kernel_fsid_t f_fsid; int f_namelen; int f_frsize; - int f_spare[5]; + int f_flags; + int f_spare[4]; }; struct statfs64 { @@ -47,7 +48,8 @@ struct statfs64 { __kernel_fsid_t f_fsid; int f_namelen; int f_frsize; - int f_spare[5]; + int f_flags; + int f_spare[4]; }; struct compat_statfs64 { @@ -61,7 +63,8 @@ struct compat_statfs64 { __kernel_fsid_t f_fsid; __u32 f_namelen; __u32 f_frsize; - __u32 f_spare[5]; + __u32 f_flags; + __u32 f_spare[4]; }; #endif /* __s390x__ */ diff -uprN linux-2.6.32/arch/s390/include/asm/sysinfo.h linux-2.6.32.ovz/arch/s390/include/asm/sysinfo.h --- linux-2.6.32/arch/s390/include/asm/sysinfo.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/sysinfo.h 2015-03-10 13:22:07.000000000 -0700 @@ -15,7 +15,10 @@ #define __ASM_S390_SYSINFO_H struct sysinfo_1_1_1 { - char reserved_0[32]; + unsigned short :16; + unsigned char ccr; + unsigned char cai; + char reserved_0[28]; char manufacturer[16]; char type[4]; char reserved_1[12]; diff -uprN linux-2.6.32/arch/s390/include/asm/system.h linux-2.6.32.ovz/arch/s390/include/asm/system.h --- linux-2.6.32/arch/s390/include/asm/system.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/system.h 2015-03-10 13:24:14.000000000 -0700 @@ -20,59 +20,96 @@ struct task_struct; extern struct task_struct *__switch_to(void *, void *); +extern void update_per_regs(struct task_struct *task); -static inline void save_fp_regs(s390_fp_regs *fpregs) +static inline int test_fp_ctl(u32 fpc) { + u32 orig_fpc; + int rc; + + if (!MACHINE_HAS_IEEE) + return 0; + asm volatile( - " std 0,8(%1)\n" - " std 2,24(%1)\n" - " std 4,40(%1)\n" - " std 6,56(%1)" - : "=m" (*fpregs) : "a" (fpregs), "m" (*fpregs) : "memory"); + " efpc %1\n" + " sfpc %2\n" + "0: sfpc %1\n" + " la %0,0\n" + "1:\n" + EX_TABLE(0b,1b) + : "=d" (rc), "=d" (orig_fpc) + : "d" (fpc), "0" (-EINVAL)); + return rc; +} + +static inline void save_fp_ctl(u32 *fpc) +{ if (!MACHINE_HAS_IEEE) return; + asm volatile( - " stfpc 0(%1)\n" - " std 1,16(%1)\n" - " std 3,32(%1)\n" - " std 5,48(%1)\n" - " std 7,64(%1)\n" - " std 8,72(%1)\n" - " std 9,80(%1)\n" - " std 10,88(%1)\n" - " std 11,96(%1)\n" - " std 12,104(%1)\n" - " std 13,112(%1)\n" - " std 14,120(%1)\n" - " std 15,128(%1)\n" - : "=m" (*fpregs) : "a" (fpregs), "m" (*fpregs) : "memory"); + " stfpc %0\n" + : "+Q" (*fpc)); } -static inline void restore_fp_regs(s390_fp_regs *fpregs) +static inline int restore_fp_ctl(u32 *fpc) { + int rc; + + if (!MACHINE_HAS_IEEE) + return 0; + asm volatile( - " ld 0,8(%0)\n" - " ld 2,24(%0)\n" - " ld 4,40(%0)\n" - " ld 6,56(%0)" - : : "a" (fpregs), "m" (*fpregs)); + " lfpc %1\n" + "0: la %0,0\n" + "1:\n" + EX_TABLE(0b,1b) + : "=d" (rc) : "Q" (*fpc), "0" (-EINVAL)); + return rc; +} + +static inline void save_fp_regs(freg_t *fprs) +{ + asm volatile("std 0,%0" : "=Q" (fprs[0])); + asm volatile("std 2,%0" : "=Q" (fprs[2])); + asm volatile("std 4,%0" : "=Q" (fprs[4])); + asm volatile("std 6,%0" : "=Q" (fprs[6])); if (!MACHINE_HAS_IEEE) return; - asm volatile( - " lfpc 0(%0)\n" - " ld 1,16(%0)\n" - " ld 3,32(%0)\n" - " ld 5,48(%0)\n" - " ld 7,64(%0)\n" - " ld 8,72(%0)\n" - " ld 9,80(%0)\n" - " ld 10,88(%0)\n" - " ld 11,96(%0)\n" - " ld 12,104(%0)\n" - " ld 13,112(%0)\n" - " ld 14,120(%0)\n" - " ld 15,128(%0)\n" - : : "a" (fpregs), "m" (*fpregs)); + asm volatile("std 1,%0" : "=Q" (fprs[1])); + asm volatile("std 3,%0" : "=Q" (fprs[3])); + asm volatile("std 5,%0" : "=Q" (fprs[5])); + asm volatile("std 7,%0" : "=Q" (fprs[7])); + asm volatile("std 8,%0" : "=Q" (fprs[8])); + asm volatile("std 9,%0" : "=Q" (fprs[9])); + asm volatile("std 10,%0" : "=Q" (fprs[10])); + asm volatile("std 11,%0" : "=Q" (fprs[11])); + asm volatile("std 12,%0" : "=Q" (fprs[12])); + asm volatile("std 13,%0" : "=Q" (fprs[13])); + asm volatile("std 14,%0" : "=Q" (fprs[14])); + asm volatile("std 15,%0" : "=Q" (fprs[15])); +} + +static inline void restore_fp_regs(freg_t *fprs) +{ + asm volatile("ld 0,%0" : : "Q" (fprs[0])); + asm volatile("ld 2,%0" : : "Q" (fprs[2])); + asm volatile("ld 4,%0" : : "Q" (fprs[4])); + asm volatile("ld 6,%0" : : "Q" (fprs[6])); + if (!MACHINE_HAS_IEEE) + return; + asm volatile("ld 1,%0" : : "Q" (fprs[1])); + asm volatile("ld 3,%0" : : "Q" (fprs[3])); + asm volatile("ld 5,%0" : : "Q" (fprs[5])); + asm volatile("ld 7,%0" : : "Q" (fprs[7])); + asm volatile("ld 8,%0" : : "Q" (fprs[8])); + asm volatile("ld 9,%0" : : "Q" (fprs[9])); + asm volatile("ld 10,%0" : : "Q" (fprs[10])); + asm volatile("ld 11,%0" : : "Q" (fprs[11])); + asm volatile("ld 12,%0" : : "Q" (fprs[12])); + asm volatile("ld 13,%0" : : "Q" (fprs[13])); + asm volatile("ld 14,%0" : : "Q" (fprs[14])); + asm volatile("ld 15,%0" : : "Q" (fprs[15])); } static inline void save_access_regs(unsigned int *acrs) @@ -88,10 +125,15 @@ static inline void restore_access_regs(u #define switch_to(prev,next,last) do { \ if (prev == next) \ break; \ - save_fp_regs(&prev->thread.fp_regs); \ - restore_fp_regs(&next->thread.fp_regs); \ + save_fp_ctl(&prev->thread.fp_regs.fpc); \ + save_fp_regs(prev->thread.fp_regs.fprs); \ + restore_fp_ctl(&next->thread.fp_regs.fpc); \ + restore_fp_regs(next->thread.fp_regs.fprs); \ save_access_regs(&prev->thread.acrs[0]); \ + save_ri_cb(prev->thread.ri_cb); \ restore_access_regs(&next->thread.acrs[0]); \ + restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb); \ + update_per_regs(next); \ prev = __switch_to(prev,next); \ } while (0) @@ -110,6 +152,10 @@ extern void pfault_fini(void); #endif /* CONFIG_PFAULT */ extern void cmma_init(void); +extern int memcpy_real(void *, void *, size_t); +extern void copy_to_absolute_zero(void *dest, void *src, size_t count); +extern int copy_to_user_real(void __user *dest, void *src, size_t count); +extern int copy_from_user_real(void *dest, void __user *src, size_t count); #define finish_arch_switch(prev) do { \ set_fs(current->thread.mm_segment); \ @@ -456,11 +502,6 @@ extern void (*_machine_power_off)(void); #define arch_align_stack(x) (x) -#ifdef CONFIG_TRACE_IRQFLAGS -extern psw_t sysc_restore_trace_psw; -extern psw_t io_restore_trace_psw; -#endif - static inline int tprot(unsigned long addr) { int rc = -EFAULT; diff -uprN linux-2.6.32/arch/s390/include/asm/thread_info.h linux-2.6.32.ovz/arch/s390/include/asm/thread_info.h --- linux-2.6.32/arch/s390/include/asm/thread_info.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/thread_info.h 2015-03-10 13:23:03.000000000 -0700 @@ -31,6 +31,7 @@ #define ASYNC_SIZE (PAGE_SIZE << ASYNC_ORDER) #ifndef __ASSEMBLY__ +#include #include #include #include @@ -50,6 +51,10 @@ struct thread_info { struct restart_block restart_block; __u64 user_timer; __u64 system_timer; +#ifndef __GENKSYMS__ + unsigned long last_break; /* last breaking-event-address. */ + struct list_head list; /* pfault task list */ +#endif }; /* @@ -117,6 +122,12 @@ static inline struct thread_info *curren #define _TIF_31BIT (1<> 9; + * + * In order to avoid an overflow with the multiplication we can rewrite this. + * With a split todval == 2^32 * th + tl (th upper 32 bits, tl lower 32 bits) + * we end up with + * + * ns = ((2^32 * th + tl) * 125 ) >> 9; + * -> ns = (2^23 * th * 125) + ((tl * 125) >> 9); + * + */ +static inline unsigned long long tod_to_ns(unsigned long long todval) +{ + unsigned long long ns; + + ns = ((todval >> 32) << 23) * 125; + ns += ((todval & 0xffffffff) * 125) >> 9; + return ns; +} + #endif diff -uprN linux-2.6.32/arch/s390/include/asm/tlbflush.h linux-2.6.32.ovz/arch/s390/include/asm/tlbflush.h --- linux-2.6.32/arch/s390/include/asm/tlbflush.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/tlbflush.h 2015-03-10 13:23:56.000000000 -0700 @@ -73,8 +73,6 @@ static inline void __tlb_flush_idte(unsi static inline void __tlb_flush_mm(struct mm_struct * mm) { - if (unlikely(cpumask_empty(mm_cpumask(mm)))) - return; /* * If the machine has IDTE we prefer to do a per mm flush * on all cpus instead of doing a local flush if the mm @@ -94,8 +92,12 @@ static inline void __tlb_flush_mm(struct static inline void __tlb_flush_mm_cond(struct mm_struct * mm) { - if (atomic_read(&mm->mm_users) <= 1 && mm == current->active_mm) + spin_lock(&mm->page_table_lock); + if (mm->context.flush_mm) { __tlb_flush_mm(mm); + mm->context.flush_mm = 0; + } + spin_unlock(&mm->page_table_lock); } /* diff -uprN linux-2.6.32/arch/s390/include/asm/tlb.h linux-2.6.32.ovz/arch/s390/include/asm/tlb.h --- linux-2.6.32/arch/s390/include/asm/tlb.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/tlb.h 2015-03-10 13:21:44.000000000 -0700 @@ -50,8 +50,7 @@ static inline struct mmu_gather *tlb_gat struct mmu_gather *tlb = &get_cpu_var(mmu_gathers); tlb->mm = mm; - tlb->fullmm = full_mm_flush || (num_online_cpus() == 1) || - (atomic_read(&mm->mm_users) <= 1 && mm == current->active_mm); + tlb->fullmm = full_mm_flush; tlb->nr_ptes = 0; tlb->nr_pxds = TLB_NR_PTRS; if (tlb->fullmm) diff -uprN linux-2.6.32/arch/s390/include/asm/topology.h linux-2.6.32.ovz/arch/s390/include/asm/topology.h --- linux-2.6.32/arch/s390/include/asm/topology.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/topology.h 2015-03-10 13:22:07.000000000 -0700 @@ -3,13 +3,32 @@ #include -#define mc_capable() (1) - -const struct cpumask *cpu_coregroup_mask(unsigned int cpu); - +extern unsigned char cpu_core_id[NR_CPUS]; extern cpumask_t cpu_core_map[NR_CPUS]; +static inline const struct cpumask *cpu_coregroup_mask(unsigned int cpu) +{ + return &cpu_core_map[cpu]; +} + +#define topology_core_id(cpu) (cpu_core_id[cpu]) #define topology_core_cpumask(cpu) (&cpu_core_map[cpu]) +#define mc_capable() (1) + +#ifdef CONFIG_SCHED_BOOK + +extern unsigned char cpu_book_id[NR_CPUS]; +extern cpumask_t cpu_book_map[NR_CPUS]; + +static inline const struct cpumask *cpu_book_mask(unsigned int cpu) +{ + return &cpu_book_map[cpu]; +} + +#define topology_book_id(cpu) (cpu_book_id[cpu]) +#define topology_book_cpumask(cpu) (&cpu_book_map[cpu]) + +#endif /* CONFIG_SCHED_BOOK */ int topology_set_cpu_management(int fc); void topology_schedule_update(void); @@ -28,7 +47,7 @@ static inline void s390_init_cpu_topolog }; #endif -#define SD_MC_INIT SD_CPU_INIT +#define SD_BOOK_INIT SD_CPU_INIT #include diff -uprN linux-2.6.32/arch/s390/include/asm/trace_clock.h linux-2.6.32.ovz/arch/s390/include/asm/trace_clock.h --- linux-2.6.32/arch/s390/include/asm/trace_clock.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/trace_clock.h 2015-03-10 13:23:58.000000000 -0700 @@ -0,0 +1 @@ +#include diff -uprN linux-2.6.32/arch/s390/include/asm/types.h linux-2.6.32.ovz/arch/s390/include/asm/types.h --- linux-2.6.32/arch/s390/include/asm/types.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/types.h 2015-03-10 13:23:18.000000000 -0700 @@ -31,12 +31,6 @@ typedef __signed__ long saddr_t; #ifndef __ASSEMBLY__ typedef u64 dma64_addr_t; -#ifdef __s390x__ -/* DMA addresses come in 32-bit and 64-bit flavours. */ -typedef u64 dma_addr_t; -#else -typedef u32 dma_addr_t; -#endif #ifndef __s390x__ typedef union { diff -uprN linux-2.6.32/arch/s390/include/asm/uaccess.h linux-2.6.32.ovz/arch/s390/include/asm/uaccess.h --- linux-2.6.32/arch/s390/include/asm/uaccess.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/uaccess.h 2015-03-10 13:20:59.000000000 -0700 @@ -93,6 +93,8 @@ extern struct uaccess_ops uaccess_mvcos; extern struct uaccess_ops uaccess_mvcos_switch; extern struct uaccess_ops uaccess_pt; +extern int __handle_fault(unsigned long, unsigned long, int); + static inline int __put_user_fn(size_t size, void __user *ptr, void *x) { size = uaccess.copy_to_user_small(size, ptr, x); diff -uprN linux-2.6.32/arch/s390/include/asm/unistd.h linux-2.6.32.ovz/arch/s390/include/asm/unistd.h --- linux-2.6.32/arch/s390/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/unistd.h 2015-03-10 13:23:54.000000000 -0700 @@ -269,7 +269,17 @@ #define __NR_pwritev 329 #define __NR_rt_tgsigqueueinfo 330 #define __NR_perf_event_open 331 -#define NR_syscalls 332 +/* #define __NR_fanotify_init 332 */ +/* #define __NR_fanotify_mark 333 */ +/* #define __NR_prlimit64 334 */ +/* #define __NR_name_to_handle_at 335 */ +/* #define __NR_open_by_handle_at 336 */ +/* #define __NR_clock_adjtime 337 */ +#define __NR_syncfs 338 +/* whole due to s390_runtime_instr */ +#define __NR_s390_runtime_instr 342 +#define __NR_setns 343 +#define NR_syscalls 344 /* * There are some system calls that are not present on 64 bit, some @@ -376,6 +386,9 @@ #define __IGNORE_migrate_pages #define __IGNORE_move_pages +/* Ignore system calls that are also reachable via sys_socket */ +#define __IGNORE_recvmmsg + #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_SYS_ALARM diff -uprN linux-2.6.32/arch/s390/include/asm/vdso.h linux-2.6.32.ovz/arch/s390/include/asm/vdso.h --- linux-2.6.32/arch/s390/include/asm/vdso.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/vdso.h 2015-03-10 13:21:22.000000000 -0700 @@ -7,7 +7,7 @@ #define VDSO32_LBASE 0 #define VDSO64_LBASE 0 -#define VDSO_VERSION_STRING LINUX_2.6.26 +#define VDSO_VERSION_STRING LINUX_2.6.29 #ifndef __ASSEMBLY__ @@ -29,6 +29,7 @@ struct vdso_data { __u32 tz_minuteswest; /* Minutes west of Greenwich 0x30 */ __u32 tz_dsttime; /* Type of dst correction 0x34 */ __u32 ectg_available; + __u32 ntp_mult; /* NTP adjusted multiplier 0x3C */ }; struct vdso_per_cpu_data { diff -uprN linux-2.6.32/arch/s390/include/asm/zcrypt.h linux-2.6.32.ovz/arch/s390/include/asm/zcrypt.h --- linux-2.6.32/arch/s390/include/asm/zcrypt.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/include/asm/zcrypt.h 2015-03-10 13:24:08.000000000 -0700 @@ -154,6 +154,67 @@ struct ica_xcRB { unsigned short priority_window; unsigned int status; } __attribute__((packed)); + +/** + * struct ep11_cprb - EP11 connectivity programming request block + * @cprb_len: CPRB header length [0x0020] + * @cprb_ver_id: CPRB version id. [0x04] + * @pad_000: Alignment pad bytes + * @flags: Admin cmd [0x80] or functional cmd [0x00] + * @func_id: Function id / subtype [0x5434] + * @source_id: Source id [originator id] + * @target_id: Target id [usage/ctrl domain id] + * @ret_code: Return code + * @reserved1: Reserved + * @reserved2: Reserved + * @payload_len: Payload length + */ +struct ep11_cprb { + uint16_t cprb_len; + unsigned char cprb_ver_id; + unsigned char pad_000[2]; + unsigned char flags; + unsigned char func_id[2]; + uint32_t source_id; + uint32_t target_id; + uint32_t ret_code; + uint32_t reserved1; + uint32_t reserved2; + uint32_t payload_len; +} __attribute__((packed)); + +/** + * struct ep11_target_dev - EP11 target device list + * @ap_id: AP device id + * @dom_id: Usage domain id + */ +struct ep11_target_dev { + uint16_t ap_id; + uint16_t dom_id; +}; + +/** + * struct ep11_urb - EP11 user request block + * @targets_num: Number of target adapters + * @targets: Addr to target adapter list + * @weight: Level of request priority + * @req_no: Request id/number + * @req_len: Request length + * @req: Addr to request block + * @resp_len: Response length + * @resp: Addr to response block + */ +struct ep11_urb { + uint16_t targets_num; + uint64_t targets; + uint64_t weight; + uint64_t req_no; + uint64_t req_len; + uint64_t req; + uint64_t resp_len; + uint64_t resp; +} __attribute__((packed)); + #define AUTOSELECT ((unsigned int)0xFFFFFFFF) #define ZCRYPT_IOCTL_MAGIC 'z' @@ -183,6 +244,9 @@ struct ica_xcRB { * ZSECSENDCPRB * Send an arbitrary CPRB to a crypto card. * + * ZSENDEP11CPRB + * Send an arbitrary EP11 CPRB to an EP11 coprocessor crypto card. + * * Z90STAT_STATUS_MASK * Return an 64 element array of unsigned chars for the status of * all devices. @@ -256,6 +320,7 @@ struct ica_xcRB { #define ICARSAMODEXPO _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x05, 0) #define ICARSACRT _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x06, 0) #define ZSECSENDCPRB _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x81, 0) +#define ZSENDEP11CPRB _IOC(_IOC_READ|_IOC_WRITE, ZCRYPT_IOCTL_MAGIC, 0x04, 0) /* New status calls */ #define Z90STAT_TOTALCOUNT _IOR(ZCRYPT_IOCTL_MAGIC, 0x40, int) diff -uprN linux-2.6.32/arch/s390/Kconfig linux-2.6.32.ovz/arch/s390/Kconfig --- linux-2.6.32/arch/s390/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/Kconfig 2015-03-10 13:23:44.000000000 -0700 @@ -40,9 +40,6 @@ config ARCH_HAS_ILOG2_U64 config GENERIC_HWEIGHT def_bool y -config GENERIC_TIME - def_bool y - config GENERIC_TIME_VSYSCALL def_bool y @@ -60,6 +57,9 @@ config NO_IOMEM config NO_DMA def_bool y +config ARCH_DMA_ADDR_T_64BIT + def_bool 64BIT + config GENERIC_LOCKBREAK bool default y @@ -87,14 +87,16 @@ config S390 select HAVE_SYSCALL_TRACEPOINTS select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_DEFAULT_NO_SPIN_MUTEXES + select HAVE_ARCH_MUTEX_CPU_RELAX select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES select HAVE_KVM if 64BIT select HAVE_ARCH_TRACEHOOK select INIT_ALL_POSSIBLE + select HAVE_IRQ_WORK select HAVE_PERF_EVENTS + select ARCH_HAVE_NMI_SAFE_CMPXCHG config SCHED_OMIT_FRAME_POINTER bool @@ -165,6 +167,13 @@ config HOTPLUG_CPU can be controlled through /sys/devices/system/cpu/cpu#. Say N if you want to disable CPU hotplug. +config SCHED_BOOK + bool "Book scheduler support" + depends on SMP + help + Book scheduler support improves the CPU scheduler's decision making + when dealing with machines that have several books. + config MATHEMU bool "IEEE FPU emulation" depends on MARCH_G5 @@ -192,23 +201,8 @@ config AUDIT_ARCH bool default y -config S390_SWITCH_AMODE - bool "Switch kernel/user addressing modes" - help - This option allows to switch the addressing modes of kernel and user - space. The kernel parameter switch_amode=on will enable this feature, - default is disabled. Enabling this (via kernel parameter) on machines - earlier than IBM System z9-109 EC/BC will reduce system performance. - - Note that this option will also be selected by selecting the execute - protection option below. Enabling the execute protection via the - noexec kernel parameter will also switch the addressing modes, - independent of the switch_amode kernel parameter. - - config S390_EXEC_PROTECT bool "Data execute protection" - select S390_SWITCH_AMODE help This option allows to enable a buffer overflow protection for user space programs and it also selects the addressing mode option above. @@ -390,6 +384,24 @@ config CHSC_SCH If unsure, say N. +config SCM_BUS + def_bool y + depends on 64BIT + prompt "SCM bus driver" + help + Bus driver for Storage Class Memory. + +config EADM_SCH + def_tristate m + prompt "Support for EADM subchannels" + depends on SCM_BUS + help + This driver allows usage of EADM subchannels. EADM subchannels act + as a communication vehicle for SCM increments. + + To compile this driver as a module, choose M here: the + module will be called eadm_sch. + comment "Misc" config IPL @@ -557,6 +569,30 @@ config KEXEC current kernel, and to start another kernel. It is like a reboot but is independent of hardware/microcode support. +config KEXEC_AUTO_RESERVE + bool "automatically reserve memory for kexec kernel" + depends on KEXEC + default y + ---help--- + Automatically reserve memory for a kexec kernel, so that you don't + need to specify numbers for the "crashkernel=X@Y" boot option, + instead you can use "crashkernel=auto". This option works for + systems that have at least 4 GiB memory. Then 128 MiB plus a small + amount of additional memory that grows linearly with the 1st kernel + memory size is reserved. When "crashkernel=auto" is used for + systems with less than 4 GiB memory, no crashkernel memory will be + reserved. + +config CRASH_DUMP + bool "kernel crash dumps" + depends on 64BIT + help + Generate crash dump after being started by kexec. + Crash dump kernels are loaded in the main kernel with kexec-tools + into a specially reserved region and then later executed after + a crash by kdump/kexec. + For more details see Documentation/kdump/kdump.txt + config ZFCPDUMP bool "zfcpdump support" select SMP @@ -618,6 +654,11 @@ source "arch/s390/Kconfig.debug" source "security/Kconfig" +config KEYS_COMPAT + bool + depends on COMPAT && KEYS + default y + source "crypto/Kconfig" source "lib/Kconfig" diff -uprN linux-2.6.32/arch/s390/Kconfig.debug linux-2.6.32.ovz/arch/s390/Kconfig.debug --- linux-2.6.32/arch/s390/Kconfig.debug 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/Kconfig.debug 2015-03-10 13:22:07.000000000 -0700 @@ -6,4 +6,17 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" +config STRICT_DEVMEM + def_bool y + prompt "Filter access to /dev/mem" + ---help--- + This option restricts access to /dev/mem. If this option is + disabled, you allow userspace access to all memory, including + kernel and userspace memory. Accidental memory access is likely + to be disastrous. + Memory access is required for experts who want to debug the kernel. + + If you are unsure, say Y. + + endmenu diff -uprN linux-2.6.32/arch/s390/kernel/asm-offsets.c linux-2.6.32.ovz/arch/s390/kernel/asm-offsets.c --- linux-2.6.32/arch/s390/kernel/asm-offsets.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/asm-offsets.c 2015-03-10 13:23:48.000000000 -0700 @@ -30,6 +30,7 @@ int main(void) DEFINE(__TI_precount, offsetof(struct thread_info, preempt_count)); DEFINE(__TI_user_timer, offsetof(struct thread_info, user_timer)); DEFINE(__TI_system_timer, offsetof(struct thread_info, system_timer)); + DEFINE(__TI_last_break, offsetof(struct thread_info, last_break)); BLANK(); DEFINE(__PT_ARGS, offsetof(struct pt_regs, args)); DEFINE(__PT_PSW, offsetof(struct pt_regs, psw)); @@ -52,6 +53,7 @@ int main(void) DEFINE(__VDSO_WTOM_NSEC, offsetof(struct vdso_data, wtom_clock_nsec)); DEFINE(__VDSO_TIMEZONE, offsetof(struct vdso_data, tz_minuteswest)); DEFINE(__VDSO_ECTG_OK, offsetof(struct vdso_data, ectg_available)); + DEFINE(__VDSO_NTP_MULT, offsetof(struct vdso_data, ntp_mult)); DEFINE(__VDSO_ECTG_BASE, offsetof(struct vdso_per_cpu_data, ectg_timer_base)); DEFINE(__VDSO_ECTG_USER, @@ -65,5 +67,9 @@ int main(void) DEFINE(__SIGP_RESTART, sigp_restart); DEFINE(__SIGP_SENSE, sigp_sense); DEFINE(__SIGP_INITIAL_CPU_RESET, sigp_initial_cpu_reset); + DEFINE(__LC_RST_OLD_PSW, offsetof(struct _lowcore, restart_old_psw)); + /* constants for transactional execution */ + DEFINE(__LC_PGM_TDB, offsetof(struct _lowcore, pgm_tdb)); + DEFINE(__THREAD_trap_tdb, offsetof(struct task_struct, thread.trap_tdb)); return 0; } diff -uprN linux-2.6.32/arch/s390/kernel/base.S linux-2.6.32.ovz/arch/s390/kernel/base.S --- linux-2.6.32/arch/s390/kernel/base.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/base.S 2015-03-10 13:23:17.000000000 -0700 @@ -75,6 +75,61 @@ s390_base_pgm_handler_fn: .quad 0 .previous +# +# Calls diag 308 subcode 1 and continues execution +# +# The following conditions must be ensured before calling this function: +# * Prefix register = 0 +# * Lowcore protection is disabled +# + .globl do_reset_diag308 +do_reset_diag308: + larl %r4,.Lctlregs # Save control registers + stctg %c0,%c15,0(%r4) + larl %r4,.Lfpctl # Floating point control register + stfpc 0(%r4) + larl %r4,.Lcontinue_psw # Save PSW flags + epsw %r2,%r3 + stm %r2,%r3,0(%r4) + larl %r4,.Lrestart_psw # Setup restart PSW at absolute 0 + lghi %r3,0 + lg %r4,0(%r4) # Save PSW + sturg %r4,%r3 # Use sturg, because of large pages + lghi %r1,1 + diag %r1,%r1,0x308 +.Lrestart_part2: + lhi %r0,0 # Load r0 with zero + lhi %r1,2 # Use mode 2 = ESAME (dump) + sigp %r1,%r0,0x12 # Switch to ESAME mode + sam64 # Switch to 64 bit addressing mode + larl %r4,.Lctlregs # Restore control registers + lctlg %c0,%c15,0(%r4) + larl %r4,.Lfpctl # Restore floating point ctl register + lfpc 0(%r4) + larl %r4,.Lcontinue_psw # Restore PSW flags + lpswe 0(%r4) +.Lcontinue: + br %r14 +.align 16 +.Lrestart_psw: + .long 0x00080000,0x80000000 + .Lrestart_part2 + + .section .data..nosave,"aw",@progbits +.align 8 +.Lcontinue_psw: + .quad 0,.Lcontinue + .previous + + .section .bss +.align 8 +.Lctlregs: + .rept 16 + .quad 0 + .endr +.Lfpctl: + .long 0 + .previous + #else /* CONFIG_64BIT */ .globl s390_base_mcck_handler diff -uprN linux-2.6.32/arch/s390/kernel/compat_linux.c linux-2.6.32.ovz/arch/s390/kernel/compat_linux.c --- linux-2.6.32/arch/s390/kernel/compat_linux.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/compat_linux.c 2015-03-10 13:23:54.000000000 -0700 @@ -31,14 +31,8 @@ #include #include #include -#include #include #include -#include -#include -#include -#include -#include #include #include #include @@ -446,14 +440,14 @@ asmlinkage long sys32_execve(char __user compat_uptr_t __user *envp) { struct pt_regs *regs = task_pt_regs(current); - char *filename; + struct filename *filename; long rc; filename = getname(name); rc = PTR_ERR(filename); if (IS_ERR(filename)) return rc; - rc = compat_do_execve(filename, argv, envp, regs); + rc = compat_do_execve(filename->name, argv, envp, regs); if (rc) goto out; current->thread.fp_regs.fpc=0; @@ -683,38 +677,6 @@ struct mmap_arg_struct_emu31 { u32 offset; }; -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - struct file * file = NULL; - unsigned long error = -EBADF; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - if (!IS_ERR((void *) error) && error + len >= 0x80000000ULL) { - /* Result is out of bounds. */ - do_munmap(current->mm, addr, len); - error = -ENOMEM; - } - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - - asmlinkage unsigned long old32_mmap(struct mmap_arg_struct_emu31 __user *arg) { @@ -728,7 +690,8 @@ old32_mmap(struct mmap_arg_struct_emu31 if (a.offset & ~PAGE_MASK) goto out; - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); out: return error; } @@ -741,7 +704,7 @@ sys32_mmap2(struct mmap_arg_struct_emu31 if (copy_from_user(&a, arg, sizeof(a))) goto out; - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); out: return error; } diff -uprN linux-2.6.32/arch/s390/kernel/compat_linux.h linux-2.6.32.ovz/arch/s390/kernel/compat_linux.h --- linux-2.6.32/arch/s390/kernel/compat_linux.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/compat_linux.h 2015-03-10 13:24:04.000000000 -0700 @@ -4,10 +4,6 @@ #include #include #include -#include -#include -#include -#include /* Macro that masks the high order bit of an 32 bit pointer and converts it*/ /* to a 64 bit pointer */ @@ -106,6 +102,9 @@ typedef union typedef struct { unsigned int fpc; +#ifndef __GENKSYMS__ + unsigned int pad; +#endif freg_t32 fprs[__NUM_FPRS]; } _s390_fp_regs32; diff -uprN linux-2.6.32/arch/s390/kernel/compat_signal.c linux-2.6.32.ovz/arch/s390/kernel/compat_signal.c --- linux-2.6.32/arch/s390/kernel/compat_signal.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/compat_signal.c 2015-03-10 13:24:04.000000000 -0700 @@ -297,51 +297,55 @@ sys32_sigaltstack(const stack_t32 __user static int save_sigregs32(struct pt_regs *regs, _sigregs32 __user *sregs) { - _s390_regs_common32 regs32; - int err, i; + _sigregs32 user_sregs; + int i; - regs32.psw.mask = PSW32_MASK_MERGE(psw32_user_bits, - (__u32)(regs->psw.mask >> 32)); - regs32.psw.addr = PSW32_ADDR_AMODE31 | (__u32) regs->psw.addr; + user_sregs.regs.psw.mask = PSW32_MASK_MERGE(psw32_user_bits, + (__u32)(regs->psw.mask >> 32)); + user_sregs.regs.psw.addr = PSW32_ADDR_AMODE31 | (__u32) regs->psw.addr; for (i = 0; i < NUM_GPRS; i++) - regs32.gprs[i] = (__u32) regs->gprs[i]; + user_sregs.regs.gprs[i] = (__u32) regs->gprs[i]; save_access_regs(current->thread.acrs); - memcpy(regs32.acrs, current->thread.acrs, sizeof(regs32.acrs)); - err = __copy_to_user(&sregs->regs, ®s32, sizeof(regs32)); - if (err) - return err; - save_fp_regs(¤t->thread.fp_regs); - /* s390_fp_regs and _s390_fp_regs32 are the same ! */ - return __copy_to_user(&sregs->fpregs, ¤t->thread.fp_regs, - sizeof(_s390_fp_regs32)); + memcpy(&user_sregs.regs.acrs, current->thread.acrs, + sizeof(user_sregs.regs.acrs)); + save_fp_ctl(¤t->thread.fp_regs.fpc); + save_fp_regs(current->thread.fp_regs.fprs); + memcpy(&user_sregs.fpregs, ¤t->thread.fp_regs, + sizeof(user_sregs.fpregs)); + if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs32))) + return -EFAULT; + return 0; } static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) { - _s390_regs_common32 regs32; - int err, i; + _sigregs32 user_sregs; + int i; /* Alwys make any pending restarted system call return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; - err = __copy_from_user(®s32, &sregs->regs, sizeof(regs32)); - if (err) - return err; + if (__copy_from_user(&user_sregs, &sregs->regs, sizeof(user_sregs))) + return -EFAULT; + + /* Loading the floating-point-control word can fail. Do that first. */ + if (restore_fp_ctl(&user_sregs.fpregs.fpc)) + return -EINVAL; + + /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ regs->psw.mask = PSW_MASK_MERGE(regs->psw.mask, - (__u64)regs32.psw.mask << 32); - regs->psw.addr = (__u64)(regs32.psw.addr & PSW32_ADDR_INSN); + (__u64) user_sregs.regs.psw.mask << 32); + regs->psw.addr = (__u64)(user_sregs.regs.psw.addr & PSW32_ADDR_INSN); for (i = 0; i < NUM_GPRS; i++) - regs->gprs[i] = (__u64) regs32.gprs[i]; - memcpy(current->thread.acrs, regs32.acrs, sizeof(current->thread.acrs)); + regs->gprs[i] = (__u64) user_sregs.regs.gprs[i]; + memcpy(¤t->thread.acrs, user_sregs.regs.acrs, + sizeof(current->thread.acrs)); restore_access_regs(current->thread.acrs); - err = __copy_from_user(¤t->thread.fp_regs, &sregs->fpregs, - sizeof(_s390_fp_regs32)); - current->thread.fp_regs.fpc &= FPC_VALID_MASK; - if (err) - return err; + memcpy(¤t->thread.fp_regs, &user_sregs.fpregs, + sizeof(current->thread.fp_regs)); - restore_fp_regs(¤t->thread.fp_regs); + restore_fp_regs(current->thread.fp_regs.fprs); regs->svcnr = 0; /* disable syscall checks */ return 0; } @@ -353,18 +357,18 @@ static int save_sigregs_gprs_high(struct for (i = 0; i < NUM_GPRS; i++) gprs_high[i] = regs->gprs[i] >> 32; - - return __copy_to_user(uregs, &gprs_high, sizeof(gprs_high)); + if (__copy_to_user(uregs, &gprs_high, sizeof(gprs_high))) + return -EFAULT; + return 0; } static int restore_sigregs_gprs_high(struct pt_regs *regs, __u32 __user *uregs) { __u32 gprs_high[NUM_GPRS]; - int err, i; + int i; - err = __copy_from_user(&gprs_high, uregs, sizeof(gprs_high)); - if (err) - return err; + if (__copy_from_user(&gprs_high, uregs, sizeof(gprs_high))) + return -EFAULT; for (i = 0; i < NUM_GPRS; i++) *(__u32 *)®s->gprs[i] = gprs_high[i]; return 0; diff -uprN linux-2.6.32/arch/s390/kernel/compat_wrapper.S linux-2.6.32.ovz/arch/s390/kernel/compat_wrapper.S --- linux-2.6.32/arch/s390/kernel/compat_wrapper.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/compat_wrapper.S 2015-03-10 13:23:54.000000000 -0700 @@ -785,7 +785,7 @@ sys32_getresuid16_wrapper: sys32_poll_wrapper: llgtr %r2,%r2 # struct pollfd * llgfr %r3,%r3 # unsigned int - lgfr %r4,%r4 # long + lgfr %r4,%r4 # int jg sys_poll # branch to system call .globl compat_sys_nfsservctl_wrapper @@ -1855,3 +1855,20 @@ sys32_execve_wrapper: llgtr %r3,%r3 # compat_uptr_t * llgtr %r4,%r4 # compat_uptr_t * jg sys32_execve # branch to system call + + .globl sys_syncfs_wrapper +sys_syncfs_wrapper: + lgfr %r2,%r2 # int + jg sys_syncfs + + .globl sys_setns_wrapper +sys_setns_wrapper: + lgfr %r2,%r2 # int + lgfr %r3,%r3 # int + jg sys_setns + + .globl sys_s390_runtime_instr_wrapper +sys_s390_runtime_instr_wrapper: + lgfr %r2,%r2 # int + lgfr %r3,%r3 # int + jg sys_s390_runtime_instr diff -uprN linux-2.6.32/arch/s390/kernel/crash_dump.c linux-2.6.32.ovz/arch/s390/kernel/crash_dump.c --- linux-2.6.32/arch/s390/kernel/crash_dump.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/crash_dump.c 2015-03-10 13:23:04.000000000 -0700 @@ -0,0 +1,459 @@ +/* + * S390 kdump implementation + * + * Copyright IBM Corp. 2011 + * Author(s): Michael Holzheu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PTR_ADD(x, y) (((char *) (x)) + ((unsigned long) (y))) +#define PTR_SUB(x, y) (((char *) (x)) - ((unsigned long) (y))) +#define PTR_DIFF(x, y) ((unsigned long)(((char *) (x)) - ((unsigned long) (y)))) + +/* + * stores the physical address of elf header of crash image + * + * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence put + * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. + */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + +/* + * stores the size of elf header of crash image + */ +unsigned long long elfcorehdr_size; + +/* + * elfcorehdr= specifies the location of elf core header stored by the crashed + * kernel. This option will be passed by kexec loader to the capture kernel. + * + * Syntax: elfcorehdr=[size[KMG]@]offset[KMG] + */ +static int __init setup_elfcorehdr(char *arg) +{ + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + if (*end == '@') { + elfcorehdr_size = elfcorehdr_addr; + elfcorehdr_addr = memparse(end + 1, &end); + } + return end > arg ? 0 : -EINVAL; +} +early_param("elfcorehdr", setup_elfcorehdr); + +/* + * Copy one page from "oldmem" + * + * For the kdump reserved memory this functions performs a swap operation: + * - [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] is mapped to [0 - OLDMEM_SIZE]. + * - [0 - OLDMEM_SIZE] is mapped to [OLDMEM_BASE - OLDMEM_BASE + OLDMEM_SIZE] + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, int userbuf) +{ + unsigned long src; + + if (!csize) + return 0; + + src = (pfn << PAGE_SHIFT) + offset; + if (src < OLDMEM_SIZE) + src += OLDMEM_BASE; + else if (src > OLDMEM_BASE && + src < OLDMEM_BASE + OLDMEM_SIZE) + src -= OLDMEM_BASE; + if (userbuf) + copy_to_user_real((void __user *) buf, (void *) src, csize); + else + memcpy_real(buf, (void *) src, csize); + return csize; +} + +/* + * Copy memory from old kernel + */ +static int copy_from_oldmem(void *dest, void *src, size_t count) +{ + unsigned long copied = 0; + int rc; + + if ((unsigned long) src < OLDMEM_SIZE) { + copied = min(count, OLDMEM_SIZE - (unsigned long) src); + rc = memcpy_real(dest, src + OLDMEM_BASE, copied); + if (rc) + return rc; + } + return memcpy_real(dest + copied, src + copied, count - copied); +} + +/* + * Alloc memory and panic in case of ENOMEM + */ +static void *kzalloc_panic(int len) +{ + void *rc; + + rc = kzalloc(len, GFP_KERNEL); + if (!rc) + panic("s390 kdump kzalloc (%d) failed", len); + return rc; +} + +/* + * Get memory layout and create hole for oldmem + */ +static struct mem_chunk *get_memory_layout(void) +{ + struct mem_chunk *chunk_array; + + chunk_array = kzalloc_panic(MEMORY_CHUNKS * sizeof(struct mem_chunk)); + detect_memory_layout(chunk_array); + create_mem_hole(chunk_array, OLDMEM_BASE, OLDMEM_SIZE, CHUNK_CRASHK); + return chunk_array; +} + +/* + * Initialize ELF note + */ +static void *nt_init(void *buf, Elf64_Word type, void *desc, int d_len, + const char *name) +{ + Elf64_Nhdr *note; + u64 len; + + note = (Elf64_Nhdr *)buf; + note->n_namesz = strlen(name) + 1; + note->n_descsz = d_len; + note->n_type = type; + len = sizeof(Elf64_Nhdr); + + memcpy(buf + len, name, note->n_namesz); + len = roundup(len + note->n_namesz, 4); + + memcpy(buf + len, desc, note->n_descsz); + len = roundup(len + note->n_descsz, 4); + + return PTR_ADD(buf, len); +} + +/* + * Initialize prstatus note + */ +static void *nt_prstatus(void *ptr, struct save_area_s390x *sa) +{ + struct elf_prstatus nt_prstatus; + static int cpu_nr = 1; + + memset(&nt_prstatus, 0, sizeof(nt_prstatus)); + memcpy(&nt_prstatus.pr_reg.gprs, sa->gp_regs, sizeof(sa->gp_regs)); + memcpy(&nt_prstatus.pr_reg.psw, sa->psw, sizeof(sa->psw)); + memcpy(&nt_prstatus.pr_reg.acrs, sa->acc_regs, sizeof(sa->acc_regs)); + nt_prstatus.pr_pid = cpu_nr; + cpu_nr++; + + return nt_init(ptr, NT_PRSTATUS, &nt_prstatus, sizeof(nt_prstatus), + "CORE"); +} + +/* + * Initialize fpregset (floating point) note + */ +static void *nt_fpregset(void *ptr, struct save_area_s390x *sa) +{ + elf_fpregset_t nt_fpregset; + + memset(&nt_fpregset, 0, sizeof(nt_fpregset)); + memcpy(&nt_fpregset.fpc, &sa->fp_ctrl_reg, sizeof(sa->fp_ctrl_reg)); + memcpy(&nt_fpregset.fprs, &sa->fp_regs, sizeof(sa->fp_regs)); + + return nt_init(ptr, NT_PRFPREG, &nt_fpregset, sizeof(nt_fpregset), + "CORE"); +} + +/* + * Initialize timer note + */ +static void *nt_s390_timer(void *ptr, struct save_area_s390x *sa) +{ + return nt_init(ptr, NT_S390_TIMER, &sa->timer, sizeof(sa->timer), + KEXEC_CORE_NOTE_NAME); +} + +/* + * Initialize TOD clock comparator note + */ +static void *nt_s390_tod_cmp(void *ptr, struct save_area_s390x *sa) +{ + return nt_init(ptr, NT_S390_TODCMP, &sa->clk_cmp, + sizeof(sa->clk_cmp), KEXEC_CORE_NOTE_NAME); +} + +/* + * Initialize TOD programmable register note + */ +static void *nt_s390_tod_preg(void *ptr, struct save_area_s390x *sa) +{ + return nt_init(ptr, NT_S390_TODPREG, &sa->tod_reg, + sizeof(sa->tod_reg), KEXEC_CORE_NOTE_NAME); +} + +/* + * Initialize control register note + */ +static void *nt_s390_ctrs(void *ptr, struct save_area_s390x *sa) +{ + return nt_init(ptr, NT_S390_CTRS, &sa->ctrl_regs, + sizeof(sa->ctrl_regs), KEXEC_CORE_NOTE_NAME); +} + +/* + * Initialize prefix register note + */ +static void *nt_s390_prefix(void *ptr, struct save_area_s390x *sa) +{ + return nt_init(ptr, NT_S390_PREFIX, &sa->pref_reg, + sizeof(sa->pref_reg), KEXEC_CORE_NOTE_NAME); +} + +/* + * Fill ELF notes for one CPU with save area registers + */ +void *fill_cpu_elf_notes(void *ptr, struct save_area_s390x *sa) +{ + ptr = nt_prstatus(ptr, sa); + ptr = nt_fpregset(ptr, sa); + ptr = nt_s390_timer(ptr, sa); + ptr = nt_s390_tod_cmp(ptr, sa); + ptr = nt_s390_tod_preg(ptr, sa); + ptr = nt_s390_ctrs(ptr, sa); + ptr = nt_s390_prefix(ptr, sa); + return ptr; +} + +/* + * Initialize prpsinfo note (new kernel) + */ +static void *nt_prpsinfo(void *ptr) +{ + struct elf_prpsinfo prpsinfo; + + memset(&prpsinfo, 0, sizeof(prpsinfo)); + prpsinfo.pr_sname = 'R'; + strcpy(prpsinfo.pr_fname, "vmlinux"); + return nt_init(ptr, NT_PRPSINFO, &prpsinfo, sizeof(prpsinfo), + KEXEC_CORE_NOTE_NAME); +} + +/* + * Initialize vmcoreinfo note (new kernel) + */ +static void *nt_vmcoreinfo(void *ptr) +{ + char nt_name[11], *vmcoreinfo; + Elf64_Nhdr note; + void *addr; + + if (copy_from_oldmem(&addr, &S390_lowcore.vmcore_info, sizeof(addr))) + return ptr; + memset(nt_name, 0, sizeof(nt_name)); + if (copy_from_oldmem(¬e, addr, sizeof(note))) + return ptr; + if (copy_from_oldmem(nt_name, addr + sizeof(note), sizeof(nt_name) - 1)) + return ptr; + if (strcmp(nt_name, "VMCOREINFO") != 0) + return ptr; + vmcoreinfo = kzalloc_panic(note.n_descsz + 1); + if (copy_from_oldmem(vmcoreinfo, addr + 24, note.n_descsz)) + return ptr; + vmcoreinfo[note.n_descsz + 1] = 0; + return nt_init(ptr, 0, vmcoreinfo, note.n_descsz, "VMCOREINFO"); +} + +/* + * Initialize ELF header (new kernel) + */ +static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt) +{ + memset(ehdr, 0, sizeof(*ehdr)); + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2MSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); + ehdr->e_type = ET_CORE; + ehdr->e_machine = EM_S390; + ehdr->e_version = EV_CURRENT; + ehdr->e_phoff = sizeof(Elf64_Ehdr); + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_phentsize = sizeof(Elf64_Phdr); + ehdr->e_phnum = mem_chunk_cnt + 1; + return ehdr + 1; +} + +/* + * Return CPU count for ELF header (new kernel) + */ +static int get_cpu_cnt(void) +{ + int i, cpus = 0; + + for (i = 0; zfcpdump_save_areas[i]; i++) { + if (zfcpdump_save_areas[i]->s390x.pref_reg == 0) + continue; + cpus++; + } + return cpus; +} + +/* + * Return memory chunk count for ELF header (new kernel) + */ +static int get_mem_chunk_cnt(void) +{ + struct mem_chunk *chunk_array, *mem_chunk; + int i, cnt = 0; + + chunk_array = get_memory_layout(); + for (i = 0; i < MEMORY_CHUNKS; i++) { + mem_chunk = &chunk_array[i]; + if (chunk_array[i].type != CHUNK_READ_WRITE && + chunk_array[i].type != CHUNK_READ_ONLY) + continue; + if (mem_chunk->size == 0) + continue; + cnt++; + } + kfree(chunk_array); + return cnt; +} + +/* + * Relocate pointer in order to allow vmcore code access the data + */ +static inline unsigned long relocate(unsigned long addr) +{ + return OLDMEM_BASE + addr; +} + +/* + * Initialize ELF loads (new kernel) + */ +static int loads_init(Elf64_Phdr *phdr, u64 loads_offset) +{ + struct mem_chunk *chunk_array, *mem_chunk; + int i; + + chunk_array = get_memory_layout(); + for (i = 0; i < MEMORY_CHUNKS; i++) { + mem_chunk = &chunk_array[i]; + if (mem_chunk->size == 0) + break; + if (chunk_array[i].type != CHUNK_READ_WRITE && + chunk_array[i].type != CHUNK_READ_ONLY) + continue; + else + phdr->p_filesz = mem_chunk->size; + phdr->p_type = PT_LOAD; + phdr->p_offset = mem_chunk->addr; + phdr->p_vaddr = mem_chunk->addr; + phdr->p_paddr = mem_chunk->addr; + phdr->p_memsz = mem_chunk->size; + phdr->p_flags = PF_R | PF_W | PF_X; + phdr->p_align = PAGE_SIZE; + phdr++; + } + kfree(chunk_array); + return i; +} + +/* + * Initialize notes (new kernel) + */ +static void *notes_init(Elf64_Phdr *phdr, void *ptr, u64 notes_offset) +{ + struct save_area_s390x *sa; + void *ptr_start = ptr; + int i; + + ptr = nt_prpsinfo(ptr); + + for (i = 0; zfcpdump_save_areas[i]; i++) { + sa = &zfcpdump_save_areas[i]->s390x; + if (sa->pref_reg == 0) + continue; + ptr = fill_cpu_elf_notes(ptr, sa); + } + ptr = nt_vmcoreinfo(ptr); + memset(phdr, 0, sizeof(*phdr)); + phdr->p_type = PT_NOTE; + phdr->p_offset = relocate(notes_offset); + phdr->p_filesz = (unsigned long) PTR_SUB(ptr, ptr_start); + phdr->p_memsz = phdr->p_filesz; + return ptr; +} + +/* + * Create ELF core header (new kernel) + */ +static void s390_elf_corehdr_create(char **elfcorebuf, size_t *elfcorebuf_sz) +{ + Elf64_Phdr *phdr_notes, *phdr_loads; + int mem_chunk_cnt; + void *ptr, *hdr; + u32 alloc_size; + u64 hdr_off; + + mem_chunk_cnt = get_mem_chunk_cnt(); + + alloc_size = 0x1000 + get_cpu_cnt() * 0x300 + + mem_chunk_cnt * sizeof(Elf64_Phdr); + hdr = kzalloc_panic(alloc_size); + /* Init elf header */ + ptr = ehdr_init(hdr, mem_chunk_cnt); + /* Init program headers */ + phdr_notes = ptr; + ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr)); + phdr_loads = ptr; + ptr = PTR_ADD(ptr, sizeof(Elf64_Phdr) * mem_chunk_cnt); + /* Init notes */ + hdr_off = PTR_DIFF(ptr, hdr); + ptr = notes_init(phdr_notes, ptr, ((unsigned long) hdr) + hdr_off); + /* Init loads */ + hdr_off = PTR_DIFF(ptr, hdr); + loads_init(phdr_loads, ((unsigned long) hdr) + hdr_off); + *elfcorebuf_sz = hdr_off; + *elfcorebuf = (void *) relocate((unsigned long) hdr); + BUG_ON(*elfcorebuf_sz > alloc_size); +} + +/* + * Create kdump ELF core header in new kernel, if it has not been passed via + * the "elfcorehdr" kernel parameter + */ +static int setup_kdump_elfcorehdr(void) +{ + size_t elfcorebuf_sz; + char *elfcorebuf; + + if (!OLDMEM_BASE || is_kdump_kernel()) + return -EINVAL; + s390_elf_corehdr_create(&elfcorebuf, &elfcorebuf_sz); + elfcorehdr_addr = (unsigned long long) elfcorebuf; + elfcorehdr_size = elfcorebuf_sz; + return 0; +} + +subsys_initcall(setup_kdump_elfcorehdr); diff -uprN linux-2.6.32/arch/s390/kernel/debug.c linux-2.6.32.ovz/arch/s390/kernel/debug.c --- linux-2.6.32/arch/s390/kernel/debug.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/debug.c 2015-03-10 13:23:23.000000000 -0700 @@ -2,8 +2,8 @@ * arch/s390/kernel/debug.c * S/390 debug facility * - * Copyright (C) 1999, 2000 IBM Deutschland Entwicklung GmbH, - * IBM Corporation + * Copyright IBM Corp. 1999, 2012 + * * Author(s): Michael Holzheu (holzheu@de.ibm.com), * Holger Smolinski (Holger.Smolinski@de.ibm.com) * @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -166,6 +167,7 @@ static debug_info_t *debug_area_last = N static DEFINE_MUTEX(debug_mutex); static int initialized; +static int debug_critical; static const struct file_operations debug_file_ops = { .owner = THIS_MODULE, @@ -934,6 +936,11 @@ debug_stop_all(void) } +void debug_set_critical(void) +{ + debug_critical = 1; +} + /* * debug_event_common: * - write debug entry with given size @@ -947,7 +954,11 @@ debug_event_common(debug_info_t * id, in if (!debug_active || !id->areas) return NULL; - spin_lock_irqsave(&id->lock, flags); + if (debug_critical) { + if (!spin_trylock_irqsave(&id->lock, flags)) + return NULL; + } else + spin_lock_irqsave(&id->lock, flags); active = get_active_entry(id); memset(DEBUG_DATA(active), 0, id->buf_size); memcpy(DEBUG_DATA(active), buf, min(len, id->buf_size)); @@ -970,7 +981,11 @@ debug_entry_t if (!debug_active || !id->areas) return NULL; - spin_lock_irqsave(&id->lock, flags); + if (debug_critical) { + if (!spin_trylock_irqsave(&id->lock, flags)) + return NULL; + } else + spin_lock_irqsave(&id->lock, flags); active = get_active_entry(id); memset(DEBUG_DATA(active), 0, id->buf_size); memcpy(DEBUG_DATA(active), buf, min(len, id->buf_size)); @@ -1015,7 +1030,11 @@ debug_sprintf_event(debug_info_t* id, in return NULL; numargs=debug_count_numargs(string); - spin_lock_irqsave(&id->lock, flags); + if (debug_critical) { + if (!spin_trylock_irqsave(&id->lock, flags)) + return NULL; + } else + spin_lock_irqsave(&id->lock, flags); active = get_active_entry(id); curr_event=(debug_sprintf_entry_t *) DEBUG_DATA(active); va_start(ap,string); @@ -1049,7 +1068,11 @@ debug_sprintf_exception(debug_info_t* id numargs=debug_count_numargs(string); - spin_lock_irqsave(&id->lock, flags); + if (debug_critical) { + if (!spin_trylock_irqsave(&id->lock, flags)) + return NULL; + } else + spin_lock_irqsave(&id->lock, flags); active = get_active_entry(id); curr_event=(debug_sprintf_entry_t *)DEBUG_DATA(active); va_start(ap,string); @@ -1183,7 +1206,7 @@ debug_get_uint(char *buf) { int rc; - for(; isspace(*buf); buf++); + buf = skip_spaces(buf); rc = simple_strtoul(buf, &buf, 10); if(*buf){ rc = -EINVAL; @@ -1430,10 +1453,10 @@ debug_hex_ascii_format_fn(debug_info_t * rc += sprintf(out_buf + rc, "| "); for (i = 0; i < id->buf_size; i++) { unsigned char c = in_buf[i]; - if (!isprint(c)) - rc += sprintf(out_buf + rc, "."); - else + if (isascii(c) && isprint(c)) rc += sprintf(out_buf + rc, "%c", c); + else + rc += sprintf(out_buf + rc, "."); } rc += sprintf(out_buf + rc, "\n"); return rc; diff -uprN linux-2.6.32/arch/s390/kernel/diag.c linux-2.6.32.ovz/arch/s390/kernel/diag.c --- linux-2.6.32/arch/s390/kernel/diag.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/diag.c 2015-03-10 13:22:27.000000000 -0700 @@ -13,18 +13,11 @@ */ void diag10(unsigned long addr) { - if (addr >= 0x7ff00000) - return; asm volatile( -#ifdef CONFIG_64BIT - " sam31\n" - " diag %0,%0,0x10\n" - "0: sam64\n" -#else - " diag %0,%0,0x10\n" - "0:\n" -#endif - EX_TABLE(0b, 0b) + "0: diag %0,%0,0x10\n" + "1:\n" + EX_TABLE(0b, 1b) + EX_TABLE(1b, 1b) : : "a" (addr)); } EXPORT_SYMBOL(diag10); diff -uprN linux-2.6.32/arch/s390/kernel/dis.c linux-2.6.32.ovz/arch/s390/kernel/dis.c --- linux-2.6.32/arch/s390/kernel/dis.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/dis.c 2015-03-10 13:23:48.000000000 -0700 @@ -90,6 +90,7 @@ enum { U8_16, /* 8 bit unsigned value starting at 16 */ I16_16, /* 16 bit signed value starting at 16 */ U16_16, /* 16 bit unsigned value starting at 16 */ + U16_32, /* 32 bit unsigned value starting at 16 */ J16_16, /* PC relative jump offset at 16 */ J32_16, /* PC relative long offset at 16 */ I32_16, /* 32 bit signed value starting at 16 */ @@ -116,7 +117,8 @@ enum { INSTR_RSY_RURD, INSTR_RS_AARD, INSTR_RS_CCRD, INSTR_RS_R0RD, INSTR_RS_RRRD, INSTR_RS_RURD, INSTR_RXE_FRRD, INSTR_RXE_RRRD, INSTR_RXF_FRRDF, INSTR_RXY_FRRD, INSTR_RXY_RRRD, INSTR_RX_FRRD, - INSTR_RX_RRRD, INSTR_RX_URRD, INSTR_SIY_URD, INSTR_SI_URD, + INSTR_RX_RRRD, INSTR_RX_URRD, INSTR_SIL_RDU, + INSTR_SIY_URD, INSTR_SI_URD, INSTR_SSE_RDRD, INSTR_SSF_RRDRD, INSTR_SS_L0RDRD, INSTR_SS_LIRDRD, INSTR_SS_LLRDRD, INSTR_SS_RRRDRD, INSTR_SS_RRRDRD2, INSTR_SS_RRRDRD3, INSTR_S_00, INSTR_S_RD, @@ -174,6 +176,7 @@ static const struct operand operands[] = [U8_16] = { 8, 16, 0 }, [I16_16] = { 16, 16, OPERAND_SIGNED }, [U16_16] = { 16, 16, 0 }, + [U16_32] = { 16, 32, 0 }, [J16_16] = { 16, 16, OPERAND_PCREL }, [J32_16] = { 32, 16, OPERAND_PCREL }, [I32_16] = { 32, 16, OPERAND_SIGNED }, @@ -240,6 +243,7 @@ static const unsigned char formats[][7] [INSTR_RX_FRRD] = { 0xff, F_8,D_20,X_12,B_16,0,0 }, /* e.g. ae */ [INSTR_RX_RRRD] = { 0xff, R_8,D_20,X_12,B_16,0,0 }, /* e.g. l */ [INSTR_RX_URRD] = { 0xff, U4_8,D_20,X_12,B_16,0,0 }, /* e.g. bc */ + [INSTR_SIL_RDU] = { 0xff, D_20,B_16,U16_32,0,0,0 }, /* e.g. tbegin*/ [INSTR_SI_URD] = { 0xff, D_20,B_16,U8_8,0,0,0 }, /* e.g. cli */ [INSTR_SIY_URD] = { 0xff, D20_20,B_16,U8_8,0,0,0 }, /* e.g. tmy */ [INSTR_SSE_RDRD] = { 0xff, D_20,B_16,D_36,B_32,0,0 }, /* e.g. mvsdk */ @@ -261,6 +265,18 @@ static const unsigned char formats[][7] /* e.g. mvcos */ }; +enum { + LONG_INSN_TABORT, + LONG_INSN_TBEGIN, + LONG_INSN_TBEGINC, +}; + +static char *long_insn_name[] = { + [LONG_INSN_TABORT] = "tabort", + [LONG_INSN_TBEGIN] = "tbegin", + [LONG_INSN_TBEGINC] = "tbeginc", +}; + static struct insn opcode[] = { #ifdef CONFIG_64BIT { "lmd", 0xef, INSTR_SS_RRRDRD3 }, @@ -519,6 +535,9 @@ static struct insn opcode_b2[] = { { "cutfu", 0xa7, INSTR_RRF_M0RR }, { "stfle", 0xb0, INSTR_S_RD }, { "lpswe", 0xb2, INSTR_S_RD }, + { "etndg", 0xec, INSTR_RRE_R0 }, + { { 0, LONG_INSN_TABORT }, 0xfc, INSTR_S_RD }, + { "tend", 0xf8, INSTR_S_RD }, #endif { "stidp", 0x02, INSTR_S_RD }, { "sck", 0x04, INSTR_S_RD }, @@ -917,6 +936,7 @@ static struct insn opcode_e3[] = { { "llgh", 0x91, INSTR_RXY_RRRD }, { "llc", 0x94, INSTR_RXY_RRRD }, { "llh", 0x95, INSTR_RXY_RRRD }, + { "ntstg", 0x25, INSTR_RXY_RRRD }, #endif { "lrv", 0x1e, INSTR_RXY_RRRD }, { "lrvh", 0x1f, INSTR_RXY_RRRD }, @@ -931,6 +951,8 @@ static struct insn opcode_e3[] = { static struct insn opcode_e5[] = { #ifdef CONFIG_64BIT { "strag", 0x02, INSTR_SSE_RDRD }, + { { 0, LONG_INSN_TBEGIN }, 0x60, INSTR_SIL_RDU }, + { { 0, LONG_INSN_TBEGINC }, 0x61, INSTR_SIL_RDU }, #endif { "lasp", 0x00, INSTR_SSE_RDRD }, { "tprot", 0x01, INSTR_SSE_RDRD }, @@ -1166,7 +1188,11 @@ static int print_insn(char *buffer, unsi ptr = buffer; insn = find_insn(code); if (insn) { - ptr += sprintf(ptr, "%.5s\t", insn->name); + if (insn->name[0] == '\0') + ptr += sprintf(ptr, "%s\t", + long_insn_name[(int) insn->name[1]]); + else + ptr += sprintf(ptr, "%.5s\t", insn->name); /* Extract the operands. */ separator = 0; for (ops = formats[insn->format] + 1, i = 0; diff -uprN linux-2.6.32/arch/s390/kernel/early.c linux-2.6.32.ovz/arch/s390/kernel/early.c --- linux-2.6.32/arch/s390/kernel/early.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/early.c 2015-03-10 13:23:48.000000000 -0700 @@ -82,7 +82,8 @@ asm( " lm 6,15,24(15)\n" #endif " br 14\n" - " .size savesys_ipl_nss, .-savesys_ipl_nss\n"); + " .size savesys_ipl_nss, .-savesys_ipl_nss\n" + " .previous\n"); static __initdata char upper_command_line[COMMAND_LINE_SIZE]; @@ -227,7 +228,7 @@ static noinline __init void detect_machi S390_lowcore.machine_flags |= MACHINE_FLAG_VM; } -static __init void early_pgm_check_handler(void) +static void early_pgm_check_handler(void) { unsigned long addr; const struct exception_table_entry *fixup; @@ -352,6 +353,7 @@ static __init void detect_machine_facili { #ifdef CONFIG_64BIT unsigned int facilities; + unsigned long long facility_bits; facilities = stfl(); if (facilities & (1 << 28)) @@ -360,23 +362,28 @@ static __init void detect_machine_facili S390_lowcore.machine_flags |= MACHINE_FLAG_PFMF; if (facilities & (1 << 4)) S390_lowcore.machine_flags |= MACHINE_FLAG_MVCOS; + if ((stfle(&facility_bits, 1) > 0) && + (facility_bits & (1ULL << (63 - 40)))) + S390_lowcore.machine_flags |= MACHINE_FLAG_SPP; #endif } static __init void rescue_initrd(void) { #ifdef CONFIG_BLK_DEV_INITRD + unsigned long min_initrd_addr = (unsigned long) _end + (4UL << 20); /* - * Move the initrd right behind the bss section in case it starts - * within the bss section. So we don't overwrite it when the bss - * section gets cleared. + * Just like in case of IPL from VM reader we make sure there is a + * gap of 4MB between end of kernel and start of initrd. + * That way we can also be sure that saving an NSS will succeed, + * which however only requires different segments. */ if (!INITRD_START || !INITRD_SIZE) return; - if (INITRD_START >= (unsigned long) __bss_stop) + if (INITRD_START >= min_initrd_addr) return; - memmove(__bss_stop, (void *) INITRD_START, INITRD_SIZE); - INITRD_START = (unsigned long) __bss_stop; + memmove((void *) min_initrd_addr, (void *) INITRD_START, INITRD_SIZE); + INITRD_START = min_initrd_addr; #endif } @@ -412,6 +419,20 @@ static void __init setup_boot_command_li append_to_cmdline(append_ipl_scpdata); } +static __init void setup_transactional_execution(void) +{ +#ifdef CONFIG_64BIT + unsigned long long facility_bits[2]; + + if (stfle(facility_bits, 2) <= 1) + return; + if (!(facility_bits[0] & (1ULL << (63 - 50))) || + !(facility_bits[1] & (1ULL << (127 - 73)))) + return; + S390_lowcore.machine_flags |= MACHINE_FLAG_TE; +#endif +} + /* * Save ipl parameters, clear bss memory, initialize storage keys @@ -439,6 +460,7 @@ void __init startup_init(void) detect_diag44(); detect_machine_facilities(); setup_hpage(); + setup_transactional_execution(); sclp_facilities_detect(); detect_memory_layout(memory_chunk); #ifdef CONFIG_DYNAMIC_FTRACE diff -uprN linux-2.6.32/arch/s390/kernel/entry64.S linux-2.6.32.ovz/arch/s390/kernel/entry64.S --- linux-2.6.32/arch/s390/kernel/entry64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/entry64.S 2015-03-10 13:23:48.000000000 -0700 @@ -2,7 +2,7 @@ * arch/s390/kernel/entry64.S * S390 low-level entry points. * - * Copyright (C) IBM Corp. 1999,2006 + * Copyright (C) IBM Corp. 1999,2010 * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), * Hartmut Penner (hp@de.ibm.com), * Denis Joseph Barrow (djbarrow@de.ibm.com,barrow_dj@yahoo.com), @@ -61,30 +61,45 @@ _TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | #define BASED(name) name-system_call(%r13) + .macro HANDLE_SIE_INTERCEPT +#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE) + lg %r3,__LC_SIE_HOOK + ltgr %r3,%r3 + jz 0f + basr %r14,%r3 + 0: +#endif + .endm + #ifdef CONFIG_TRACE_IRQFLAGS .macro TRACE_IRQS_ON - basr %r2,%r0 - brasl %r14,trace_hardirqs_on_caller + basr %r2,%r0 + brasl %r14,trace_hardirqs_on_caller .endm .macro TRACE_IRQS_OFF - basr %r2,%r0 - brasl %r14,trace_hardirqs_off_caller + basr %r2,%r0 + brasl %r14,trace_hardirqs_off_caller .endm - .macro TRACE_IRQS_CHECK - basr %r2,%r0 + .macro TRACE_IRQS_CHECK_ON tm SP_PSW(%r15),0x03 # irqs enabled? jz 0f - brasl %r14,trace_hardirqs_on_caller - j 1f -0: brasl %r14,trace_hardirqs_off_caller -1: + TRACE_IRQS_ON +0: + .endm + + .macro TRACE_IRQS_CHECK_OFF + tm SP_PSW(%r15),0x03 # irqs enabled? + jz 0f + TRACE_IRQS_OFF +0: .endm #else #define TRACE_IRQS_ON #define TRACE_IRQS_OFF -#define TRACE_IRQS_CHECK +#define TRACE_IRQS_CHECK_ON +#define TRACE_IRQS_CHECK_OFF #endif #ifdef CONFIG_LOCKDEP @@ -113,31 +128,35 @@ _TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | * R15 - kernel stack pointer */ - .macro SAVE_ALL_BASE savearea - stmg %r12,%r15,\savearea - larl %r13,system_call - .endm - .macro SAVE_ALL_SVC psworg,savearea - la %r12,\psworg + stmg %r11,%r15,\savearea lg %r15,__LC_KERNEL_STACK # problem state -> load ksp + aghi %r15,-SP_SIZE # make room for registers & psw + lg %r11,__LC_LAST_BREAK .endm - .macro SAVE_ALL_SYNC psworg,savearea - la %r12,\psworg + .macro SAVE_ALL_PGM psworg,savearea + stmg %r11,%r15,\savearea tm \psworg+1,0x01 # test problem state bit - jz 2f # skip stack setup save - lg %r15,__LC_KERNEL_STACK # problem state -> load ksp #ifdef CONFIG_CHECK_STACK - j 3f -2: tml %r15,STACK_SIZE - CONFIG_STACK_GUARD - jz stack_overflow -3: + jnz 1f + tml %r15,STACK_SIZE - CONFIG_STACK_GUARD + jnz 2f + la %r12,\psworg + j stack_overflow +#else + jz 2f #endif -2: +1: lg %r15,__LC_KERNEL_STACK # problem state -> load ksp +2: aghi %r15,-SP_SIZE # make room for registers & psw + larl %r13,system_call + lg %r11,__LC_LAST_BREAK .endm .macro SAVE_ALL_ASYNC psworg,savearea + stmg %r11,%r15,\savearea + larl %r13,system_call + lg %r11,__LC_LAST_BREAK la %r12,\psworg tm \psworg+1,0x01 # test problem state bit jnz 1f # from user -> load kernel stack @@ -151,27 +170,23 @@ _TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | 0: lg %r14,__LC_ASYNC_STACK # are we already on the async. stack ? slgr %r14,%r15 srag %r14,%r14,STACK_SHIFT - jz 2f -1: lg %r15,__LC_ASYNC_STACK # load async stack #ifdef CONFIG_CHECK_STACK - j 3f -2: tml %r15,STACK_SIZE - CONFIG_STACK_GUARD - jz stack_overflow -3: + jnz 1f + tml %r15,STACK_SIZE - CONFIG_STACK_GUARD + jnz 2f + j stack_overflow +#else + jz 2f #endif -2: +1: lg %r15,__LC_ASYNC_STACK # load async stack +2: aghi %r15,-SP_SIZE # make room for registers & psw .endm - .macro CREATE_STACK_FRAME psworg,savearea - aghi %r15,-SP_SIZE # make room for registers & psw - mvc SP_PSW(16,%r15),0(%r12) # move user PSW to stack + .macro CREATE_STACK_FRAME savearea + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stg %r2,SP_ORIG_R2(%r15) # store original content of gpr 2 - icm %r12,3,__LC_SVC_ILC - stmg %r0,%r11,SP_R0(%r15) # store gprs %r0-%r11 to kernel stack - st %r12,SP_SVCNR(%r15) - mvc SP_R12(32,%r15),\savearea # move %r12-%r15 to stack - la %r12,0 - stg %r12,__SF_BACKCHAIN(%r15) + mvc SP_R11(40,%r15),\savearea # move %r11-%r15 to stack + stmg %r0,%r10,SP_R0(%r15) # store gprs %r0-%r10 to kernel stack .endm .macro RESTORE_ALL psworg,sync @@ -187,6 +202,13 @@ _TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | lpswe \psworg # back to caller .endm + .macro LAST_BREAK + srag %r10,%r11,23 + jz 0f + stg %r11,__TI_last_break(%r12) +0: + .endm + /* * Scheduler resume function, called by switch_to * gpr2 = (task_struct *) prev @@ -196,13 +218,6 @@ _TIF_SYSCALL = (_TIF_SYSCALL_TRACE>>8 | */ .globl __switch_to __switch_to: - tm __THREAD_per+4(%r3),0xe8 # is the new process using per ? - jz __switch_to_noper # if not we're fine - stctg %c9,%c11,__SF_EMPTY(%r15)# We are using per stuff - clc __THREAD_per(24,%r3),__SF_EMPTY(%r15) - je __switch_to_noper # we got away without bashing TLB's - lctlg %c9,%c11,__THREAD_per(%r3) # Nope we didn't -__switch_to_noper: lg %r4,__THREAD_info(%r2) # get thread_info of prev tm __TI_flags+7(%r4),_TIF_MCCK_PENDING # machine check pending? jz __switch_to_no_mcck @@ -216,6 +231,7 @@ __switch_to_no_mcck: lmg %r6,%r15,__SF_GPRS(%r15)# load __switch_to registers of next task stg %r3,__LC_CURRENT # __LC_CURRENT = current task struct lctl %c4,%c4,__TASK_pid(%r3) # load pid to control reg. 4 + mvc __LC_CURRENT_PID+4(4,%r0),__TASK_pid(%r3) # store pid of next lg %r3,__THREAD_info(%r3) # load thread_info from task struct stg %r3,__LC_THREAD_INFO aghi %r3,STACK_SIZE @@ -232,143 +248,130 @@ __critical_start: system_call: stpt __LC_SYNC_ENTER_TIMER sysc_saveall: - SAVE_ALL_BASE __LC_SAVE_AREA SAVE_ALL_SVC __LC_SVC_OLD_PSW,__LC_SAVE_AREA - CREATE_STACK_FRAME __LC_SVC_OLD_PSW,__LC_SAVE_AREA - llgh %r7,__LC_SVC_INT_CODE # get svc number from lowcore + CREATE_STACK_FRAME __LC_SAVE_AREA + mvc SP_PSW(16,%r15),__LC_SVC_OLD_PSW + mvc SP_SVCNR(2,%r15),__LC_SVC_INT_CODE + mvc SP_ILC(2,%r15),__LC_SVC_ILC + stg %r7,SP_ARGS(%r15) + lg %r12,__LC_THREAD_INFO # load pointer to thread_info struct sysc_vtime: UPDATE_VTIME __LC_EXIT_TIMER,__LC_SYNC_ENTER_TIMER,__LC_USER_TIMER sysc_stime: UPDATE_VTIME __LC_LAST_UPDATE_TIMER,__LC_EXIT_TIMER,__LC_SYSTEM_TIMER sysc_update: mvc __LC_LAST_UPDATE_TIMER(8),__LC_SYNC_ENTER_TIMER + LAST_BREAK sysc_do_svc: - lg %r9,__LC_THREAD_INFO # load pointer to thread_info struct - ltgr %r7,%r7 # test for svc 0 + llgh %r7,SP_SVCNR(%r15) + slag %r7,%r7,2 # shift and test for svc 0 jnz sysc_nr_ok # svc 0: system call number in %r1 - cl %r1,BASED(.Lnr_syscalls) + llgfr %r1,%r1 # clear high word in r1 + cghi %r1,NR_syscalls jnl sysc_nr_ok - lgfr %r7,%r1 # clear high word in r1 + sth %r1,SP_SVCNR(%r15) + slag %r7,%r1,2 # shift and test for svc 0 sysc_nr_ok: - mvc SP_ARGS(8,%r15),SP_R7(%r15) -sysc_do_restart: - sth %r7,SP_SVCNR(%r15) - sllg %r7,%r7,2 # svc number * 4 larl %r10,sys_call_table #ifdef CONFIG_COMPAT - tm __TI_flags+5(%r9),(_TIF_31BIT>>16) # running in 31 bit mode ? + tm __TI_flags+5(%r12),(_TIF_31BIT>>16) # running in 31 bit mode ? jno sysc_noemu larl %r10,sys_call_table_emu # use 31 bit emulation system calls sysc_noemu: #endif - tm __TI_flags+6(%r9),_TIF_SYSCALL + tm __TI_flags+6(%r12),_TIF_SYSCALL lgf %r8,0(%r7,%r10) # load address of system call routine jnz sysc_tracesys basr %r14,%r8 # call sys_xxxx stg %r2,SP_R2(%r15) # store return value (change R2 on stack) sysc_return: - tm __TI_flags+7(%r9),_TIF_WORK_SVC + LOCKDEP_SYS_EXIT +sysc_tif: + tm __TI_flags+7(%r12),_TIF_WORK_SVC jnz sysc_work # there is work to do (signals etc.) sysc_restore: -#ifdef CONFIG_TRACE_IRQFLAGS - larl %r1,sysc_restore_trace_psw - lpswe 0(%r1) -sysc_restore_trace: - TRACE_IRQS_CHECK - LOCKDEP_SYS_EXIT -#endif -sysc_leave: RESTORE_ALL __LC_RETURN_PSW,1 sysc_done: -#ifdef CONFIG_TRACE_IRQFLAGS - .section .data,"aw",@progbits - .align 8 - .globl sysc_restore_trace_psw -sysc_restore_trace_psw: - .quad 0, sysc_restore_trace - .previous -#endif - -# -# recheck if there is more work to do # -sysc_work_loop: - tm __TI_flags+7(%r9),_TIF_WORK_SVC - jz sysc_restore # there is no work to do -# -# One of the work bits is on. Find out which one. +# There is work to do, but first we need to check if we return to userspace. # sysc_work: tm SP_PSW+1(%r15),0x01 # returning to user ? jno sysc_restore - tm __TI_flags+7(%r9),_TIF_MCCK_PENDING + +# +# One of the work bits is on. Find out which one. +# +sysc_work_tif: + tm __TI_flags+7(%r12),_TIF_MCCK_PENDING jo sysc_mcck_pending - tm __TI_flags+7(%r9),_TIF_NEED_RESCHED + tm __TI_flags+7(%r12),_TIF_NEED_RESCHED jo sysc_reschedule - tm __TI_flags+7(%r9),_TIF_SIGPENDING - jnz sysc_sigpending - tm __TI_flags+7(%r9),_TIF_NOTIFY_RESUME - jnz sysc_notify_resume - tm __TI_flags+7(%r9),_TIF_RESTART_SVC + tm __TI_flags+7(%r12),_TIF_SIGPENDING + jo sysc_sigpending + tm __TI_flags+7(%r12),_TIF_NOTIFY_RESUME + jo sysc_notify_resume + tm __TI_flags+7(%r12),_TIF_RESTART_SVC jo sysc_restart - tm __TI_flags+7(%r9),_TIF_SINGLE_STEP + tm __TI_flags+7(%r12),_TIF_SINGLE_STEP jo sysc_singlestep - j sysc_restore -sysc_work_done: + j sysc_return # beware of critical section cleanup # # _TIF_NEED_RESCHED is set, call schedule # sysc_reschedule: - larl %r14,sysc_work_loop - jg schedule # return point is sysc_return + larl %r14,sysc_return + jg schedule # return point is sysc_return # # _TIF_MCCK_PENDING is set, call handler # sysc_mcck_pending: - larl %r14,sysc_work_loop + larl %r14,sysc_return jg s390_handle_mcck # TIF bit will be cleared by handler # # _TIF_SIGPENDING is set, call do_signal # sysc_sigpending: - ni __TI_flags+7(%r9),255-_TIF_SINGLE_STEP # clear TIF_SINGLE_STEP + ni __TI_flags+7(%r12),255-_TIF_SINGLE_STEP # clear TIF_SINGLE_STEP la %r2,SP_PTREGS(%r15) # load pt_regs brasl %r14,do_signal # call do_signal - tm __TI_flags+7(%r9),_TIF_RESTART_SVC + tm __TI_flags+7(%r12),_TIF_RESTART_SVC jo sysc_restart - tm __TI_flags+7(%r9),_TIF_SINGLE_STEP + tm __TI_flags+7(%r12),_TIF_SINGLE_STEP jo sysc_singlestep - j sysc_work_loop + j sysc_return # # _TIF_NOTIFY_RESUME is set, call do_notify_resume # sysc_notify_resume: la %r2,SP_PTREGS(%r15) # load pt_regs - larl %r14,sysc_work_loop + larl %r14,sysc_return jg do_notify_resume # call do_notify_resume # # _TIF_RESTART_SVC is set, set up registers and restart svc # sysc_restart: - ni __TI_flags+7(%r9),255-_TIF_RESTART_SVC # clear TIF_RESTART_SVC + ni __TI_flags+7(%r12),255-_TIF_RESTART_SVC # clear TIF_RESTART_SVC lg %r7,SP_R2(%r15) # load new svc number mvc SP_R2(8,%r15),SP_ORIG_R2(%r15) # restore first argument lmg %r2,%r6,SP_R2(%r15) # load svc arguments - j sysc_do_restart # restart svc + sth %r7,SP_SVCNR(%r15) + slag %r7,%r7,2 + j sysc_nr_ok # restart svc # # _TIF_SINGLE_STEP is set, call do_single_step # sysc_singlestep: - ni __TI_flags+7(%r9),255-_TIF_SINGLE_STEP # clear TIF_SINGLE_STEP + ni __TI_flags+7(%r12),255-_TIF_SINGLE_STEP # clear TIF_SINGLE_STEP xc SP_SVCNR(2,%r15),SP_SVCNR(%r15) # clear svc number la %r2,SP_PTREGS(%r15) # address of register-save area larl %r14,sysc_return # load adr. of system return @@ -381,8 +384,8 @@ sysc_singlestep: sysc_tracesys: la %r2,SP_PTREGS(%r15) # load pt_regs la %r3,0 - srl %r7,2 - stg %r7,SP_R2(%r15) + llgh %r0,SP_SVCNR(%r15) + stg %r0,SP_R2(%r15) brasl %r14,do_syscall_trace_enter lghi %r0,NR_syscalls clgr %r0,%r2 @@ -395,7 +398,7 @@ sysc_tracego: basr %r14,%r8 # call sys_xxx stg %r2,SP_R2(%r15) # store return value sysc_tracenogo: - tm __TI_flags+6(%r9),_TIF_SYSCALL + tm __TI_flags+6(%r12),_TIF_SYSCALL jz sysc_return la %r2,SP_PTREGS(%r15) # load pt_regs larl %r14,sysc_return # return point is sysc_return @@ -407,7 +410,7 @@ sysc_tracenogo: .globl ret_from_fork ret_from_fork: lg %r13,__LC_SVC_NEW_PSW+8 - lg %r9,__LC_THREAD_INFO # load pointer to thread_info struct + lg %r12,__LC_THREAD_INFO # load pointer to thread_info struct tm SP_PSW+1(%r15),0x01 # forking a kernel thread ? jo 0f stg %r15,SP_R15(%r15) # store stack pointer for new kthread @@ -437,12 +440,14 @@ kernel_execve: br %r14 # execve succeeded. 0: stnsm __SF_EMPTY(%r15),0xfc # disable interrupts +# TRACE_IRQS_OFF lg %r15,__LC_KERNEL_STACK # load ksp aghi %r15,-SP_SIZE # make room for registers & psw lg %r13,__LC_SVC_NEW_PSW+8 - lg %r9,__LC_THREAD_INFO mvc SP_PTREGS(__PT_SIZE,%r15),0(%r12) # copy pt_regs + lg %r12,__LC_THREAD_INFO xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) +# TRACE_IRQS_ON stosm __SF_EMPTY(%r15),0x03 # reenable interrupts brasl %r14,execve_tail j sysc_return @@ -467,20 +472,28 @@ pgm_check_handler: * for LPSW?). */ stpt __LC_SYNC_ENTER_TIMER - SAVE_ALL_BASE __LC_SAVE_AREA tm __LC_PGM_INT_CODE+1,0x80 # check whether we got a per exception jnz pgm_per # got per exception -> special case - SAVE_ALL_SYNC __LC_PGM_OLD_PSW,__LC_SAVE_AREA - CREATE_STACK_FRAME __LC_PGM_OLD_PSW,__LC_SAVE_AREA + SAVE_ALL_PGM __LC_PGM_OLD_PSW,__LC_SAVE_AREA + CREATE_STACK_FRAME __LC_SAVE_AREA + xc SP_SVCNR(4,%r15),SP_SVCNR(%r15) + mvc SP_PSW(16,%r15),__LC_PGM_OLD_PSW + lg %r12,__LC_THREAD_INFO # load pointer to thread_info struct tm SP_PSW+1(%r15),0x01 # interrupting from user ? jz pgm_no_vtime UPDATE_VTIME __LC_EXIT_TIMER,__LC_SYNC_ENTER_TIMER,__LC_USER_TIMER UPDATE_VTIME __LC_LAST_UPDATE_TIMER,__LC_EXIT_TIMER,__LC_SYSTEM_TIMER mvc __LC_LAST_UPDATE_TIMER(8),__LC_SYNC_ENTER_TIMER + LAST_BREAK + lg %r14,__TI_task(%r12) + lghi %r13,__LC_PGM_TDB + tm __LC_PGM_ILC+2,0x02 # check for transaction abort + jz pgm_no_vtime + mvc __THREAD_trap_tdb(256,%r14),0(%r13) pgm_no_vtime: - lg %r9,__LC_THREAD_INFO # load pointer to thread_info struct - mvc SP_ARGS(8,%r15),__LC_LAST_BREAK - TRACE_IRQS_OFF + HANDLE_SIE_INTERCEPT + TRACE_IRQS_CHECK_OFF + stg %r11,SP_ARGS(%r15) lgf %r3,__LC_PGM_ILC # load program interruption code lghi %r8,0x7f ngr %r8,%r3 @@ -489,8 +502,10 @@ pgm_do_call: larl %r1,pgm_check_table lg %r1,0(%r8,%r1) # load address of handler routine la %r2,SP_PTREGS(%r15) # address of register-save area - larl %r14,sysc_return - br %r1 # branch to interrupt-handler + basr %r14,%r1 # branch to interrupt-handler +pgm_exit: + TRACE_IRQS_CHECK_ON + j sysc_return # # handle per exception @@ -502,54 +517,74 @@ pgm_per: clc __LC_PGM_OLD_PSW(16),__LC_SVC_NEW_PSW je pgm_svcper # no interesting special case, ignore PER event - lmg %r12,%r15,__LC_SAVE_AREA lpswe __LC_PGM_OLD_PSW # # Normal per exception # pgm_per_std: - SAVE_ALL_SYNC __LC_PGM_OLD_PSW,__LC_SAVE_AREA - CREATE_STACK_FRAME __LC_PGM_OLD_PSW,__LC_SAVE_AREA + SAVE_ALL_PGM __LC_PGM_OLD_PSW,__LC_SAVE_AREA + CREATE_STACK_FRAME __LC_SAVE_AREA + mvc SP_PSW(16,%r15),__LC_PGM_OLD_PSW + lg %r12,__LC_THREAD_INFO # load pointer to thread_info struct tm SP_PSW+1(%r15),0x01 # interrupting from user ? jz pgm_no_vtime2 UPDATE_VTIME __LC_EXIT_TIMER,__LC_SYNC_ENTER_TIMER,__LC_USER_TIMER UPDATE_VTIME __LC_LAST_UPDATE_TIMER,__LC_EXIT_TIMER,__LC_SYSTEM_TIMER mvc __LC_LAST_UPDATE_TIMER(8),__LC_SYNC_ENTER_TIMER + LAST_BREAK + lg %r14,__TI_task(%r12) + lghi %r13,__LC_PGM_TDB + tm __LC_PGM_ILC+2,0x02 # check for transaction abort + jz pgm_no_vtime2 + mvc __THREAD_trap_tdb(256,%r14),0(%r13) pgm_no_vtime2: - lg %r9,__LC_THREAD_INFO # load pointer to thread_info struct - TRACE_IRQS_OFF - lg %r1,__TI_task(%r9) + HANDLE_SIE_INTERCEPT + TRACE_IRQS_CHECK_OFF + lg %r1,__TI_task(%r12) tm SP_PSW+1(%r15),0x01 # kernel per event ? jz kernel_per mvc __THREAD_per+__PER_atmid(2,%r1),__LC_PER_ATMID mvc __THREAD_per+__PER_address(8,%r1),__LC_PER_ADDRESS mvc __THREAD_per+__PER_access_id(1,%r1),__LC_PER_ACCESS_ID - oi __TI_flags+7(%r9),_TIF_SINGLE_STEP # set TIF_SINGLE_STEP + oi __TI_flags+7(%r12),_TIF_SINGLE_STEP # set TIF_SINGLE_STEP lgf %r3,__LC_PGM_ILC # load program interruption code lghi %r8,0x7f ngr %r8,%r3 # clear per-event-bit and ilc - je sysc_return - j pgm_do_call + je pgm_exit2 + sll %r8,3 + larl %r1,pgm_check_table + lg %r1,0(%r8,%r1) # load address of handler routine + la %r2,SP_PTREGS(%r15) # address of register-save area + basr %r14,%r1 # branch to interrupt-handler +pgm_exit2: + TRACE_IRQS_ON + stosm __SF_EMPTY(%r15),0x03 # reenable interrupts + j sysc_return # # it was a single stepped SVC that is causing all the trouble # pgm_svcper: - SAVE_ALL_SYNC __LC_SVC_OLD_PSW,__LC_SAVE_AREA - CREATE_STACK_FRAME __LC_SVC_OLD_PSW,__LC_SAVE_AREA + SAVE_ALL_PGM __LC_SVC_OLD_PSW,__LC_SAVE_AREA + CREATE_STACK_FRAME __LC_SAVE_AREA + mvc SP_PSW(16,%r15),__LC_SVC_OLD_PSW + mvc SP_SVCNR(2,%r15),__LC_SVC_INT_CODE + mvc SP_ILC(2,%r15),__LC_SVC_ILC + lg %r12,__LC_THREAD_INFO # load pointer to thread_info struct UPDATE_VTIME __LC_EXIT_TIMER,__LC_SYNC_ENTER_TIMER,__LC_USER_TIMER UPDATE_VTIME __LC_LAST_UPDATE_TIMER,__LC_EXIT_TIMER,__LC_SYSTEM_TIMER mvc __LC_LAST_UPDATE_TIMER(8),__LC_SYNC_ENTER_TIMER - llgh %r7,__LC_SVC_INT_CODE # get svc number from lowcore - lg %r9,__LC_THREAD_INFO # load pointer to thread_info struct - lg %r8,__TI_task(%r9) + LAST_BREAK + TRACE_IRQS_OFF + lg %r8,__TI_task(%r12) mvc __THREAD_per+__PER_atmid(2,%r8),__LC_PER_ATMID mvc __THREAD_per+__PER_address(8,%r8),__LC_PER_ADDRESS mvc __THREAD_per+__PER_access_id(1,%r8),__LC_PER_ACCESS_ID - oi __TI_flags+7(%r9),_TIF_SINGLE_STEP # set TIF_SINGLE_STEP + oi __TI_flags+7(%r12),_TIF_SINGLE_STEP # set TIF_SINGLE_STEP TRACE_IRQS_ON stosm __SF_EMPTY(%r15),0x03 # reenable interrupts + lmg %r2,%r6,SP_R2(%r15) # load svc arguments j sysc_do_svc # @@ -558,8 +593,8 @@ pgm_svcper: kernel_per: xc SP_SVCNR(2,%r15),SP_SVCNR(%r15) # clear svc number la %r2,SP_PTREGS(%r15) # address of register-save area - larl %r14,sysc_restore # load adr. of system ret, no work - jg do_single_step # branch to do_single_step + brasl %r14,do_single_step + j pgm_exit /* * IO interrupt handler routine @@ -568,162 +603,133 @@ kernel_per: io_int_handler: stck __LC_INT_CLOCK stpt __LC_ASYNC_ENTER_TIMER - SAVE_ALL_BASE __LC_SAVE_AREA+32 - SAVE_ALL_ASYNC __LC_IO_OLD_PSW,__LC_SAVE_AREA+32 - CREATE_STACK_FRAME __LC_IO_OLD_PSW,__LC_SAVE_AREA+32 + SAVE_ALL_ASYNC __LC_IO_OLD_PSW,__LC_SAVE_AREA+40 + CREATE_STACK_FRAME __LC_SAVE_AREA+40 + mvc SP_PSW(16,%r15),0(%r12) # move user PSW to stack + lg %r12,__LC_THREAD_INFO # load pointer to thread_info struct tm SP_PSW+1(%r15),0x01 # interrupting from user ? jz io_no_vtime UPDATE_VTIME __LC_EXIT_TIMER,__LC_ASYNC_ENTER_TIMER,__LC_USER_TIMER UPDATE_VTIME __LC_LAST_UPDATE_TIMER,__LC_EXIT_TIMER,__LC_SYSTEM_TIMER mvc __LC_LAST_UPDATE_TIMER(8),__LC_ASYNC_ENTER_TIMER + LAST_BREAK io_no_vtime: - lg %r9,__LC_THREAD_INFO # load pointer to thread_info struct + HANDLE_SIE_INTERCEPT TRACE_IRQS_OFF la %r2,SP_PTREGS(%r15) # address of register-save area brasl %r14,do_IRQ # call standard irq handler io_return: - tm __TI_flags+7(%r9),_TIF_WORK_INT + LOCKDEP_SYS_EXIT + TRACE_IRQS_ON +io_tif: + tm __TI_flags+7(%r12),_TIF_WORK_INT jnz io_work # there is work to do (signals etc.) io_restore: -#ifdef CONFIG_TRACE_IRQFLAGS - larl %r1,io_restore_trace_psw - lpswe 0(%r1) -io_restore_trace: - TRACE_IRQS_CHECK - LOCKDEP_SYS_EXIT -#endif -io_leave: RESTORE_ALL __LC_RETURN_PSW,0 io_done: -#ifdef CONFIG_TRACE_IRQFLAGS - .section .data,"aw",@progbits - .align 8 - .globl io_restore_trace_psw -io_restore_trace_psw: - .quad 0, io_restore_trace - .previous -#endif - # -# There is work todo, we need to check if we return to userspace, then -# check, if we are in SIE, if yes leave it +# There is work todo, find out in which context we have been interrupted: +# 1) if we return to user space we can do all _TIF_WORK_INT work +# 2) if we return to kernel code and kvm is enabled check if we need to +# modify the psw to leave SIE +# 3) if we return to kernel code and preemptive scheduling is enabled check +# the preemption counter and if it is zero call preempt_schedule_irq +# Before any work can be done, a switch to the kernel stack is required. # io_work: tm SP_PSW+1(%r15),0x01 # returning to user ? -#ifndef CONFIG_PREEMPT -#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE) - jnz io_work_user # yes -> no need to check for SIE - la %r1, BASED(sie_opcode) # we return to kernel here - lg %r2, SP_PSW+8(%r15) - clc 0(2,%r1), 0(%r2) # is current instruction = SIE? - jne io_restore # no-> return to kernel - lg %r1, SP_PSW+8(%r15) # yes-> add 4 bytes to leave SIE - aghi %r1, 4 - stg %r1, SP_PSW+8(%r15) - j io_restore # return to kernel -#else - jno io_restore # no-> skip resched & signal -#endif -#else - jnz io_work_user # yes -> do resched & signal -#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE) - la %r1, BASED(sie_opcode) - lg %r2, SP_PSW+8(%r15) - clc 0(2,%r1), 0(%r2) # is current instruction = SIE? - jne 0f # no -> leave PSW alone - lg %r1, SP_PSW+8(%r15) # yes-> add 4 bytes to leave SIE - aghi %r1, 4 - stg %r1, SP_PSW+8(%r15) -0: -#endif + jo io_work_user # yes -> do resched & signal +#ifdef CONFIG_PREEMPT # check for preemptive scheduling - icm %r0,15,__TI_precount(%r9) + icm %r0,15,__TI_precount(%r12) jnz io_restore # preemption is disabled + tm __TI_flags+7(%r12),_TIF_NEED_RESCHED + jno io_restore # switch to kernel stack lg %r1,SP_R15(%r15) aghi %r1,-SP_SIZE mvc SP_PTREGS(__PT_SIZE,%r1),SP_PTREGS(%r15) xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) # clear back chain lgr %r15,%r1 -io_resume_loop: - tm __TI_flags+7(%r9),_TIF_NEED_RESCHED - jno io_restore - larl %r14,io_resume_loop - jg preempt_schedule_irq + # TRACE_IRQS_ON already done at io_return, call + # TRACE_IRQS_OFF to keep things symmetrical + TRACE_IRQS_OFF + brasl %r14,preempt_schedule_irq + j io_return +#else + j io_restore #endif +# +# Need to do work before returning to userspace, switch to kernel stack +# io_work_user: lg %r1,__LC_KERNEL_STACK aghi %r1,-SP_SIZE mvc SP_PTREGS(__PT_SIZE,%r1),SP_PTREGS(%r15) xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) # clear back chain lgr %r15,%r1 + # # One of the work bits is on. Find out which one. -# Checked are: _TIF_SIGPENDING, _TIF_RESTORE_SIGPENDING, _TIF_NEED_RESCHED +# Checked are: _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_NEED_RESCHED # and _TIF_MCCK_PENDING # -io_work_loop: - tm __TI_flags+7(%r9),_TIF_MCCK_PENDING +io_work_tif: + tm __TI_flags+7(%r12),_TIF_MCCK_PENDING jo io_mcck_pending - tm __TI_flags+7(%r9),_TIF_NEED_RESCHED + tm __TI_flags+7(%r12),_TIF_NEED_RESCHED jo io_reschedule - tm __TI_flags+7(%r9),_TIF_SIGPENDING - jnz io_sigpending - tm __TI_flags+7(%r9),_TIF_NOTIFY_RESUME - jnz io_notify_resume - j io_restore -io_work_done: - -#if defined(CONFIG_KVM) || defined(CONFIG_KVM_MODULE) -sie_opcode: - .long 0xb2140000 -#endif + tm __TI_flags+7(%r12),_TIF_SIGPENDING + jo io_sigpending + tm __TI_flags+7(%r12),_TIF_NOTIFY_RESUME + jo io_notify_resume + j io_return # beware of critical section cleanup # # _TIF_MCCK_PENDING is set, call handler # io_mcck_pending: + # TRACE_IRQS_ON already done at io_return brasl %r14,s390_handle_mcck # TIF bit will be cleared by handler - j io_work_loop + TRACE_IRQS_OFF + j io_return # # _TIF_NEED_RESCHED is set, call schedule # io_reschedule: - TRACE_IRQS_ON + # TRACE_IRQS_ON already done at io_return stosm __SF_EMPTY(%r15),0x03 # reenable interrupts brasl %r14,schedule # call scheduler stnsm __SF_EMPTY(%r15),0xfc # disable I/O and ext. interrupts TRACE_IRQS_OFF - tm __TI_flags+7(%r9),_TIF_WORK_INT - jz io_restore # there is no work to do - j io_work_loop + j io_return # # _TIF_SIGPENDING or is set, call do_signal # io_sigpending: - TRACE_IRQS_ON + # TRACE_IRQS_ON already done at io_return stosm __SF_EMPTY(%r15),0x03 # reenable interrupts la %r2,SP_PTREGS(%r15) # load pt_regs brasl %r14,do_signal # call do_signal stnsm __SF_EMPTY(%r15),0xfc # disable I/O and ext. interrupts TRACE_IRQS_OFF - j io_work_loop + j io_return # # _TIF_NOTIFY_RESUME or is set, call do_notify_resume # io_notify_resume: - TRACE_IRQS_ON + # TRACE_IRQS_ON already done at io_return stosm __SF_EMPTY(%r15),0x03 # reenable interrupts la %r2,SP_PTREGS(%r15) # load pt_regs brasl %r14,do_notify_resume # call do_notify_resume stnsm __SF_EMPTY(%r15),0xfc # disable I/O and ext. interrupts TRACE_IRQS_OFF - j io_work_loop + j io_return /* * External interrupt handler routine @@ -732,16 +738,18 @@ io_notify_resume: ext_int_handler: stck __LC_INT_CLOCK stpt __LC_ASYNC_ENTER_TIMER - SAVE_ALL_BASE __LC_SAVE_AREA+32 - SAVE_ALL_ASYNC __LC_EXT_OLD_PSW,__LC_SAVE_AREA+32 - CREATE_STACK_FRAME __LC_EXT_OLD_PSW,__LC_SAVE_AREA+32 + SAVE_ALL_ASYNC __LC_EXT_OLD_PSW,__LC_SAVE_AREA+40 + CREATE_STACK_FRAME __LC_SAVE_AREA+40 + mvc SP_PSW(16,%r15),0(%r12) # move user PSW to stack + lg %r12,__LC_THREAD_INFO # load pointer to thread_info struct tm SP_PSW+1(%r15),0x01 # interrupting from user ? jz ext_no_vtime UPDATE_VTIME __LC_EXIT_TIMER,__LC_ASYNC_ENTER_TIMER,__LC_USER_TIMER UPDATE_VTIME __LC_LAST_UPDATE_TIMER,__LC_EXIT_TIMER,__LC_SYSTEM_TIMER mvc __LC_LAST_UPDATE_TIMER(8),__LC_ASYNC_ENTER_TIMER + LAST_BREAK ext_no_vtime: - lg %r9,__LC_THREAD_INFO # load pointer to thread_info struct + HANDLE_SIE_INTERCEPT TRACE_IRQS_OFF la %r2,SP_PTREGS(%r15) # address of register-save area llgh %r3,__LC_EXT_INT_CODE # get interruption code @@ -759,12 +767,14 @@ mcck_int_handler: la %r1,4095 # revalidate r1 spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # revalidate cpu timer lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# revalidate gprs - SAVE_ALL_BASE __LC_SAVE_AREA+64 + stmg %r11,%r15,__LC_SAVE_AREA+80 + larl %r13,system_call + lg %r11,__LC_LAST_BREAK la %r12,__LC_MCK_OLD_PSW tm __LC_MCCK_CODE,0x80 # system damage? jo mcck_int_main # yes -> rest of mcck code invalid la %r14,4095 - mvc __LC_SAVE_AREA+104(8),__LC_ASYNC_ENTER_TIMER + mvc __LC_SAVE_AREA+120(8),__LC_ASYNC_ENTER_TIMER mvc __LC_ASYNC_ENTER_TIMER(8),__LC_CPU_TIMER_SAVE_AREA-4095(%r14) tm __LC_MCCK_CODE+5,0x02 # stored cpu timer value valid? jo 1f @@ -795,7 +805,10 @@ mcck_int_main: srag %r14,%r14,PAGE_SHIFT jz 0f lg %r15,__LC_PANIC_STACK # load panic stack -0: CREATE_STACK_FRAME __LC_MCK_OLD_PSW,__LC_SAVE_AREA+64 +0: aghi %r15,-SP_SIZE # make room for registers & psw + CREATE_STACK_FRAME __LC_SAVE_AREA+80 + mvc SP_PSW(16,%r15),0(%r12) + lg %r12,__LC_THREAD_INFO # load pointer to thread_info struct tm __LC_MCCK_CODE+2,0x08 # mwp of old psw valid? jno mcck_no_vtime # no -> no timer update tm SP_PSW+1(%r15),0x01 # interrupting from user ? @@ -803,8 +816,8 @@ mcck_int_main: UPDATE_VTIME __LC_EXIT_TIMER,__LC_ASYNC_ENTER_TIMER,__LC_USER_TIMER UPDATE_VTIME __LC_LAST_UPDATE_TIMER,__LC_EXIT_TIMER,__LC_SYSTEM_TIMER mvc __LC_LAST_UPDATE_TIMER(8),__LC_ASYNC_ENTER_TIMER + LAST_BREAK mcck_no_vtime: - lg %r9,__LC_THREAD_INFO # load pointer to thread_info struct la %r2,SP_PTREGS(%r15) # load pt_regs brasl %r14,s390_do_machine_check tm SP_PSW+1(%r15),0x01 # returning to user ? @@ -815,8 +828,9 @@ mcck_no_vtime: xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) # clear back chain lgr %r15,%r1 stosm __SF_EMPTY(%r15),0x04 # turn dat on - tm __TI_flags+7(%r9),_TIF_MCCK_PENDING + tm __TI_flags+7(%r12),_TIF_MCCK_PENDING jno mcck_return + HANDLE_SIE_INTERCEPT TRACE_IRQS_OFF brasl %r14,s390_handle_mcck TRACE_IRQS_ON @@ -824,11 +838,12 @@ mcck_return: mvc __LC_RETURN_MCCK_PSW(16),SP_PSW(%r15) # move return PSW ni __LC_RETURN_MCCK_PSW+1,0xfd # clear wait state bit lmg %r0,%r15,SP_R0(%r15) # load gprs 0-15 - mvc __LC_ASYNC_ENTER_TIMER(8),__LC_SAVE_AREA+104 + mvc __LC_ASYNC_ENTER_TIMER(8),__LC_SAVE_AREA+120 tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ? jno 0f stpt __LC_EXIT_TIMER 0: lpswe __LC_RETURN_MCCK_PSW # back to caller +mcck_done: /* * Restart interruption handler, kick starter for additional CPUs @@ -874,6 +889,28 @@ restart_crash: restart_go: #endif +# +# PSW restart interrupt handler +# + .globl psw_restart_int_handler +psw_restart_int_handler: + stg %r15,__LC_SAVE_AREA+120(%r0) # save r15 + larl %r15,restart_stack # load restart stack + lg %r15,0(%r15) + aghi %r15,-SP_SIZE # make room for pt_regs + stmg %r0,%r14,SP_R0(%r15) # store gprs %r0-%r14 to stack + mvc SP_R15(8,%r15),__LC_SAVE_AREA+120(%r0)# store saved %r15 to stack + mvc SP_PSW(16,%r15),__LC_RST_OLD_PSW(%r0)# store restart old psw + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) # set backchain to 0 + brasl %r14,do_restart + + larl %r14,restart_psw_crash # load disabled wait PSW if + lpswe 0(%r14) # do_restart returns + .align 8 +restart_psw_crash: + .quad 0x0002000080000000,0x0000000000000000 + restart_psw_crash + + #ifdef CONFIG_CHECK_STACK /* * The synchronous or the asynchronous stack overflowed. We are dead. @@ -884,14 +921,14 @@ stack_overflow: lg %r15,__LC_PANIC_STACK # change to panic stack aghi %r15,-SP_SIZE mvc SP_PSW(16,%r15),0(%r12) # move user PSW to stack - stmg %r0,%r11,SP_R0(%r15) # store gprs %r0-%r11 to kernel stack + stmg %r0,%r10,SP_R0(%r15) # store gprs %r0-%r10 to kernel stack la %r1,__LC_SAVE_AREA chi %r12,__LC_SVC_OLD_PSW je 0f chi %r12,__LC_PGM_OLD_PSW je 0f - la %r1,__LC_SAVE_AREA+32 -0: mvc SP_R12(32,%r15),0(%r1) # move %r12-%r15 to stack + la %r1,__LC_SAVE_AREA+40 +0: mvc SP_R11(40,%r15),0(%r1) # move %r11-%r15 to stack mvc SP_ARGS(8,%r15),__LC_LAST_BREAK xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) # clear back chain la %r2,SP_PTREGS(%r15) # load pt_regs @@ -900,18 +937,14 @@ stack_overflow: cleanup_table_system_call: .quad system_call, sysc_do_svc -cleanup_table_sysc_return: - .quad sysc_return, sysc_leave -cleanup_table_sysc_leave: - .quad sysc_leave, sysc_done -cleanup_table_sysc_work_loop: - .quad sysc_work_loop, sysc_work_done -cleanup_table_io_return: - .quad io_return, io_leave -cleanup_table_io_leave: - .quad io_leave, io_done -cleanup_table_io_work_loop: - .quad io_work_loop, io_work_done +cleanup_table_sysc_tif: + .quad sysc_tif, sysc_restore +cleanup_table_sysc_restore: + .quad sysc_restore, sysc_done +cleanup_table_io_tif: + .quad io_tif, io_restore +cleanup_table_io_restore: + .quad io_restore, io_done cleanup_critical: clc 8(8,%r12),BASED(cleanup_table_system_call) @@ -919,61 +952,52 @@ cleanup_critical: clc 8(8,%r12),BASED(cleanup_table_system_call+8) jl cleanup_system_call 0: - clc 8(8,%r12),BASED(cleanup_table_sysc_return) - jl 0f - clc 8(8,%r12),BASED(cleanup_table_sysc_return+8) - jl cleanup_sysc_return -0: - clc 8(8,%r12),BASED(cleanup_table_sysc_leave) + clc 8(8,%r12),BASED(cleanup_table_sysc_tif) jl 0f - clc 8(8,%r12),BASED(cleanup_table_sysc_leave+8) - jl cleanup_sysc_leave + clc 8(8,%r12),BASED(cleanup_table_sysc_tif+8) + jl cleanup_sysc_tif 0: - clc 8(8,%r12),BASED(cleanup_table_sysc_work_loop) + clc 8(8,%r12),BASED(cleanup_table_sysc_restore) jl 0f - clc 8(8,%r12),BASED(cleanup_table_sysc_work_loop+8) - jl cleanup_sysc_return + clc 8(8,%r12),BASED(cleanup_table_sysc_restore+8) + jl cleanup_sysc_restore 0: - clc 8(8,%r12),BASED(cleanup_table_io_return) + clc 8(8,%r12),BASED(cleanup_table_io_tif) jl 0f - clc 8(8,%r12),BASED(cleanup_table_io_return+8) - jl cleanup_io_return + clc 8(8,%r12),BASED(cleanup_table_io_tif+8) + jl cleanup_io_tif 0: - clc 8(8,%r12),BASED(cleanup_table_io_leave) + clc 8(8,%r12),BASED(cleanup_table_io_restore) jl 0f - clc 8(8,%r12),BASED(cleanup_table_io_leave+8) - jl cleanup_io_leave -0: - clc 8(8,%r12),BASED(cleanup_table_io_work_loop) - jl 0f - clc 8(8,%r12),BASED(cleanup_table_io_work_loop+8) - jl cleanup_io_return + clc 8(8,%r12),BASED(cleanup_table_io_restore+8) + jl cleanup_io_restore 0: br %r14 cleanup_system_call: mvc __LC_RETURN_PSW(16),0(%r12) - cghi %r12,__LC_MCK_OLD_PSW - je 0f - la %r12,__LC_SAVE_AREA+32 - j 1f -0: la %r12,__LC_SAVE_AREA+64 -1: clc __LC_RETURN_PSW+8(8),BASED(cleanup_system_call_insn+8) jh 0f mvc __LC_SYNC_ENTER_TIMER(8),__LC_ASYNC_ENTER_TIMER +0: cghi %r12,__LC_MCK_OLD_PSW + la %r12,__LC_SAVE_AREA+80 + je 0f + la %r12,__LC_SAVE_AREA+40 0: clc __LC_RETURN_PSW+8(8),BASED(cleanup_system_call_insn+16) jhe cleanup_vtime clc __LC_RETURN_PSW+8(8),BASED(cleanup_system_call_insn) jh 0f - mvc __LC_SAVE_AREA(32),0(%r12) -0: stg %r13,8(%r12) - stg %r12,__LC_SAVE_AREA+96 # argh - SAVE_ALL_SYNC __LC_SVC_OLD_PSW,__LC_SAVE_AREA - CREATE_STACK_FRAME __LC_SVC_OLD_PSW,__LC_SAVE_AREA - lg %r12,__LC_SAVE_AREA+96 # argh - stg %r15,24(%r12) - llgh %r7,__LC_SVC_INT_CODE + mvc __LC_SAVE_AREA(40),0(%r12) +0: lg %r15,__LC_KERNEL_STACK # problem state -> load ksp + aghi %r15,-SP_SIZE # make room for registers & psw + stg %r15,32(%r12) + stg %r11,0(%r12) + CREATE_STACK_FRAME __LC_SAVE_AREA + mvc SP_PSW(16,%r15),__LC_SVC_OLD_PSW + mvc SP_SVCNR(2,%r15),__LC_SVC_INT_CODE + mvc SP_ILC(2,%r15),__LC_SVC_ILC + stg %r7,SP_ARGS(%r15) + mvc 8(8,%r12),__LC_THREAD_INFO cleanup_vtime: clc __LC_RETURN_PSW+8(8),BASED(cleanup_system_call_insn+24) jhe cleanup_stime @@ -984,7 +1008,11 @@ cleanup_stime: UPDATE_VTIME __LC_LAST_UPDATE_TIMER,__LC_EXIT_TIMER,__LC_SYSTEM_TIMER cleanup_update: mvc __LC_LAST_UPDATE_TIMER(8),__LC_SYNC_ENTER_TIMER - mvc __LC_RETURN_PSW+8(8),BASED(cleanup_table_system_call+8) + srag %r12,%r11,23 + lg %r12,__LC_THREAD_INFO + jz 0f + stg %r11,__TI_last_break(%r12) +0: mvc __LC_RETURN_PSW+8(8),BASED(cleanup_table_system_call+8) la %r12,__LC_RETURN_PSW br %r14 cleanup_system_call_insn: @@ -994,55 +1022,51 @@ cleanup_system_call_insn: .quad sysc_stime .quad sysc_update -cleanup_sysc_return: +cleanup_sysc_tif: mvc __LC_RETURN_PSW(8),0(%r12) - mvc __LC_RETURN_PSW+8(8),BASED(cleanup_table_sysc_return) + mvc __LC_RETURN_PSW+8(8),BASED(cleanup_table_sysc_tif) la %r12,__LC_RETURN_PSW br %r14 -cleanup_sysc_leave: - clc 8(8,%r12),BASED(cleanup_sysc_leave_insn) - je 3f - clc 8(8,%r12),BASED(cleanup_sysc_leave_insn+8) +cleanup_sysc_restore: + clc 8(8,%r12),BASED(cleanup_sysc_restore_insn) + je 2f + clc 8(8,%r12),BASED(cleanup_sysc_restore_insn+8) jhe 0f mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER 0: mvc __LC_RETURN_PSW(16),SP_PSW(%r15) cghi %r12,__LC_MCK_OLD_PSW - jne 1f - mvc __LC_SAVE_AREA+64(32),SP_R12(%r15) - j 2f -1: mvc __LC_SAVE_AREA+32(32),SP_R12(%r15) -2: lmg %r0,%r11,SP_R0(%r15) + la %r12,__LC_SAVE_AREA+80 + je 1f + la %r12,__LC_SAVE_AREA+40 +1: mvc 0(40,%r12),SP_R11(%r15) + lmg %r0,%r10,SP_R0(%r15) lg %r15,SP_R15(%r15) -3: la %r12,__LC_RETURN_PSW +2: la %r12,__LC_RETURN_PSW br %r14 -cleanup_sysc_leave_insn: +cleanup_sysc_restore_insn: .quad sysc_done - 4 .quad sysc_done - 16 -cleanup_io_return: +cleanup_io_tif: mvc __LC_RETURN_PSW(8),0(%r12) - mvc __LC_RETURN_PSW+8(8),BASED(cleanup_table_io_work_loop) + mvc __LC_RETURN_PSW+8(8),BASED(cleanup_table_io_tif) la %r12,__LC_RETURN_PSW br %r14 -cleanup_io_leave: - clc 8(8,%r12),BASED(cleanup_io_leave_insn) - je 3f - clc 8(8,%r12),BASED(cleanup_io_leave_insn+8) +cleanup_io_restore: + clc 8(8,%r12),BASED(cleanup_io_restore_insn) + je 1f + clc 8(8,%r12),BASED(cleanup_io_restore_insn+8) jhe 0f mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER 0: mvc __LC_RETURN_PSW(16),SP_PSW(%r15) - cghi %r12,__LC_MCK_OLD_PSW - jne 1f - mvc __LC_SAVE_AREA+64(32),SP_R12(%r15) - j 2f -1: mvc __LC_SAVE_AREA+32(32),SP_R12(%r15) -2: lmg %r0,%r11,SP_R0(%r15) + mvc __LC_SAVE_AREA+80(40),SP_R11(%r15) + lmg %r0,%r10,SP_R0(%r15) lg %r15,SP_R15(%r15) -3: la %r12,__LC_RETURN_PSW +1: la %r12,__LC_RETURN_PSW br %r14 -cleanup_io_leave_insn: +cleanup_io_restore_insn: .quad io_done - 4 .quad io_done - 16 @@ -1050,13 +1074,6 @@ cleanup_io_leave_insn: * Integer constants */ .align 4 -.Lconst: -.Lnr_syscalls: .long NR_syscalls -.L0x0130: .short 0x130 -.L0x0140: .short 0x140 -.L0x0150: .short 0x150 -.L0x0160: .short 0x160 -.L0x0170: .short 0x170 .Lcritical_start: .quad __critical_start .Lcritical_end: diff -uprN linux-2.6.32/arch/s390/kernel/entry.S linux-2.6.32.ovz/arch/s390/kernel/entry.S --- linux-2.6.32/arch/s390/kernel/entry.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/entry.S 2015-03-10 13:23:48.000000000 -0700 @@ -74,21 +74,24 @@ STACK_SIZE = 1 << STACK_SHIFT basr %r14,%r1 .endm - .macro TRACE_IRQS_CHECK - basr %r2,%r0 + .macro TRACE_IRQS_CHECK_ON tm SP_PSW(%r15),0x03 # irqs enabled? - jz 0f - l %r1,BASED(.Ltrace_irq_on_caller) - basr %r14,%r1 - j 1f -0: l %r1,BASED(.Ltrace_irq_off_caller) - basr %r14,%r1 -1: + bz BASED(0f) + TRACE_IRQS_ON +0: + .endm + + .macro TRACE_IRQS_CHECK_OFF + tm SP_PSW(%r15),0x03 # irqs enabled? + bz BASED(0f) + TRACE_IRQS_OFF +0: .endm #else #define TRACE_IRQS_ON #define TRACE_IRQS_OFF -#define TRACE_IRQS_CHECK +#define TRACE_IRQS_CHECK_ON +#define TRACE_IRQS_CHECK_OFF #endif #ifdef CONFIG_LOCKDEP @@ -207,13 +210,6 @@ STACK_SIZE = 1 << STACK_SHIFT __switch_to: basr %r1,0 __switch_to_base: - tm __THREAD_per(%r3),0xe8 # new process is using per ? - bz __switch_to_noper-__switch_to_base(%r1) # if not we're fine - stctl %c9,%c11,__SF_EMPTY(%r15) # We are using per stuff - clc __THREAD_per(12,%r3),__SF_EMPTY(%r15) - be __switch_to_noper-__switch_to_base(%r1) # we got away w/o bashing TLB's - lctl %c9,%c11,__THREAD_per(%r3) # Nope we didn't -__switch_to_noper: l %r4,__THREAD_info(%r2) # get thread_info of prev tm __TI_flags+3(%r4),_TIF_MCCK_PENDING # machine check pending? bz __switch_to_no_mcck-__switch_to_base(%r1) @@ -227,6 +223,7 @@ __switch_to_no_mcck: lm %r6,%r15,__SF_GPRS(%r15)# load __switch_to registers of next task st %r3,__LC_CURRENT # __LC_CURRENT = current task struct lctl %c4,%c4,__TASK_pid(%r3) # load pid to control reg. 4 + mvc __LC_CURRENT_PID(4,%r0),__TASK_pid(%r3) # store pid of next l %r3,__THREAD_info(%r3) # load thread_info from task struct st %r3,__LC_THREAD_INFO ahi %r3,STACK_SIZE @@ -274,66 +271,45 @@ sysc_do_restart: st %r2,SP_R2(%r15) # store return value (change R2 on stack) sysc_return: + LOCKDEP_SYS_EXIT +sysc_tif: tm __TI_flags+3(%r9),_TIF_WORK_SVC bnz BASED(sysc_work) # there is work to do (signals etc.) sysc_restore: -#ifdef CONFIG_TRACE_IRQFLAGS - la %r1,BASED(sysc_restore_trace_psw_addr) - l %r1,0(%r1) - lpsw 0(%r1) -sysc_restore_trace: - TRACE_IRQS_CHECK - LOCKDEP_SYS_EXIT -#endif -sysc_leave: RESTORE_ALL __LC_RETURN_PSW,1 sysc_done: -#ifdef CONFIG_TRACE_IRQFLAGS -sysc_restore_trace_psw_addr: - .long sysc_restore_trace_psw - - .section .data,"aw",@progbits - .align 8 - .globl sysc_restore_trace_psw -sysc_restore_trace_psw: - .long 0, sysc_restore_trace + 0x80000000 - .previous -#endif - -# -# recheck if there is more work to do # -sysc_work_loop: - tm __TI_flags+3(%r9),_TIF_WORK_SVC - bz BASED(sysc_restore) # there is no work to do -# -# One of the work bits is on. Find out which one. +# There is work to do, but first we need to check if we return to userspace. # sysc_work: tm SP_PSW+1(%r15),0x01 # returning to user ? bno BASED(sysc_restore) + +# +# One of the work bits is on. Find out which one. +# +sysc_work_tif: tm __TI_flags+3(%r9),_TIF_MCCK_PENDING bo BASED(sysc_mcck_pending) tm __TI_flags+3(%r9),_TIF_NEED_RESCHED bo BASED(sysc_reschedule) tm __TI_flags+3(%r9),_TIF_SIGPENDING - bnz BASED(sysc_sigpending) + bo BASED(sysc_sigpending) tm __TI_flags+3(%r9),_TIF_NOTIFY_RESUME - bnz BASED(sysc_notify_resume) + bo BASED(sysc_notify_resume) tm __TI_flags+3(%r9),_TIF_RESTART_SVC bo BASED(sysc_restart) tm __TI_flags+3(%r9),_TIF_SINGLE_STEP bo BASED(sysc_singlestep) - b BASED(sysc_restore) -sysc_work_done: + b BASED(sysc_return) # beware of critical section cleanup # # _TIF_NEED_RESCHED is set, call schedule # sysc_reschedule: l %r1,BASED(.Lschedule) - la %r14,BASED(sysc_work_loop) + la %r14,BASED(sysc_return) br %r1 # call scheduler # @@ -341,7 +317,7 @@ sysc_reschedule: # sysc_mcck_pending: l %r1,BASED(.Ls390_handle_mcck) - la %r14,BASED(sysc_work_loop) + la %r14,BASED(sysc_return) br %r1 # TIF bit will be cleared by handler # @@ -356,7 +332,7 @@ sysc_sigpending: bo BASED(sysc_restart) tm __TI_flags+3(%r9),_TIF_SINGLE_STEP bo BASED(sysc_singlestep) - b BASED(sysc_work_loop) + b BASED(sysc_return) # # _TIF_NOTIFY_RESUME is set, call do_notify_resume @@ -364,7 +340,7 @@ sysc_sigpending: sysc_notify_resume: la %r2,SP_PTREGS(%r15) # load pt_regs l %r1,BASED(.Ldo_notify_resume) - la %r14,BASED(sysc_work_loop) + la %r14,BASED(sysc_return) br %r1 # call do_notify_resume @@ -459,11 +435,13 @@ kernel_execve: br %r14 # execve succeeded. 0: stnsm __SF_EMPTY(%r15),0xfc # disable interrupts + TRACE_IRQS_OFF l %r15,__LC_KERNEL_STACK # load ksp s %r15,BASED(.Lc_spsize) # make room for registers & psw l %r9,__LC_THREAD_INFO mvc SP_PTREGS(__PT_SIZE,%r15),0(%r12) # copy pt_regs xc __SF_BACKCHAIN(4,%r15),__SF_BACKCHAIN(%r15) + TRACE_IRQS_ON stosm __SF_EMPTY(%r15),0x03 # reenable interrupts l %r1,BASED(.Lexecve_tail) basr %r14,%r1 @@ -500,8 +478,8 @@ pgm_check_handler: UPDATE_VTIME __LC_LAST_UPDATE_TIMER,__LC_EXIT_TIMER,__LC_SYSTEM_TIMER mvc __LC_LAST_UPDATE_TIMER(8),__LC_SYNC_ENTER_TIMER pgm_no_vtime: + TRACE_IRQS_CHECK_OFF l %r9,__LC_THREAD_INFO # load pointer to thread_info struct - TRACE_IRQS_OFF l %r3,__LC_PGM_ILC # load program interruption code la %r8,0x7f nr %r8,%r3 @@ -510,8 +488,10 @@ pgm_do_call: sll %r8,2 l %r7,0(%r8,%r7) # load address of handler routine la %r2,SP_PTREGS(%r15) # address of register-save area - la %r14,BASED(sysc_return) - br %r7 # branch to interrupt-handler + basr %r14,%r7 # branch to interrupt-handler +pgm_exit: + TRACE_IRQS_CHECK_ON + b BASED(sysc_return) # # handle per exception @@ -538,20 +518,28 @@ pgm_per_std: UPDATE_VTIME __LC_LAST_UPDATE_TIMER,__LC_EXIT_TIMER,__LC_SYSTEM_TIMER mvc __LC_LAST_UPDATE_TIMER(8),__LC_SYNC_ENTER_TIMER pgm_no_vtime2: + TRACE_IRQS_CHECK_OFF l %r9,__LC_THREAD_INFO # load pointer to thread_info struct - TRACE_IRQS_OFF l %r1,__TI_task(%r9) + tm SP_PSW+1(%r15),0x01 # kernel per event ? + bz BASED(kernel_per) mvc __THREAD_per+__PER_atmid(2,%r1),__LC_PER_ATMID mvc __THREAD_per+__PER_address(4,%r1),__LC_PER_ADDRESS mvc __THREAD_per+__PER_access_id(1,%r1),__LC_PER_ACCESS_ID oi __TI_flags+3(%r9),_TIF_SINGLE_STEP # set TIF_SINGLE_STEP - tm SP_PSW+1(%r15),0x01 # kernel per event ? - bz BASED(kernel_per) l %r3,__LC_PGM_ILC # load program interruption code la %r8,0x7f nr %r8,%r3 # clear per-event-bit and ilc - be BASED(sysc_return) # only per or per+check ? - b BASED(pgm_do_call) + be BASED(pgm_exit2) # only per or per+check ? + l %r7,BASED(.Ljump_table) + sll %r8,2 + l %r7,0(%r8,%r7) # load address of handler routine + la %r2,SP_PTREGS(%r15) # address of register-save area + basr %r14,%r7 # branch to interrupt-handler +pgm_exit2: + TRACE_IRQS_ON + stosm __SF_EMPTY(%r15),0x03 # reenable interrupts + b BASED(sysc_return) # # it was a single stepped SVC that is causing all the trouble @@ -572,6 +560,7 @@ pgm_svcper: oi __TI_flags+3(%r9),_TIF_SINGLE_STEP # set TIF_SINGLE_STEP TRACE_IRQS_ON stosm __SF_EMPTY(%r15),0x03 # reenable interrupts + lm %r2,%r6,SP_R2(%r15) # load svc arguments b BASED(sysc_do_svc) # @@ -582,8 +571,8 @@ kernel_per: mvi SP_SVCNR+1(%r15),0xff la %r2,SP_PTREGS(%r15) # address of register-save area l %r1,BASED(.Lhandle_per) # load adr. of per handler - la %r14,BASED(sysc_restore)# load adr. of system return - br %r1 # branch to do_single_step + basr %r14,%r1 # branch to do_single_step + b BASED(pgm_exit) /* * IO interrupt handler routine @@ -602,134 +591,126 @@ io_int_handler: UPDATE_VTIME __LC_LAST_UPDATE_TIMER,__LC_EXIT_TIMER,__LC_SYSTEM_TIMER mvc __LC_LAST_UPDATE_TIMER(8),__LC_ASYNC_ENTER_TIMER io_no_vtime: - l %r9,__LC_THREAD_INFO # load pointer to thread_info struct TRACE_IRQS_OFF + l %r9,__LC_THREAD_INFO # load pointer to thread_info struct l %r1,BASED(.Ldo_IRQ) # load address of do_IRQ la %r2,SP_PTREGS(%r15) # address of register-save area basr %r14,%r1 # branch to standard irq handler io_return: + LOCKDEP_SYS_EXIT + TRACE_IRQS_ON +io_tif: tm __TI_flags+3(%r9),_TIF_WORK_INT bnz BASED(io_work) # there is work to do (signals etc.) io_restore: -#ifdef CONFIG_TRACE_IRQFLAGS - la %r1,BASED(io_restore_trace_psw_addr) - l %r1,0(%r1) - lpsw 0(%r1) -io_restore_trace: - TRACE_IRQS_CHECK - LOCKDEP_SYS_EXIT -#endif -io_leave: RESTORE_ALL __LC_RETURN_PSW,0 io_done: -#ifdef CONFIG_TRACE_IRQFLAGS -io_restore_trace_psw_addr: - .long io_restore_trace_psw - - .section .data,"aw",@progbits - .align 8 - .globl io_restore_trace_psw -io_restore_trace_psw: - .long 0, io_restore_trace + 0x80000000 - .previous -#endif - # -# switch to kernel stack, then check the TIF bits +# There is work todo, find out in which context we have been interrupted: +# 1) if we return to user space we can do all _TIF_WORK_INT work +# 2) if we return to kernel code and preemptive scheduling is enabled check +# the preemption counter and if it is zero call preempt_schedule_irq +# Before any work can be done, a switch to the kernel stack is required. # io_work: tm SP_PSW+1(%r15),0x01 # returning to user ? -#ifndef CONFIG_PREEMPT - bno BASED(io_restore) # no-> skip resched & signal -#else - bnz BASED(io_work_user) # no -> check for preemptive scheduling + bo BASED(io_work_user) # yes -> do resched & signal +#ifdef CONFIG_PREEMPT # check for preemptive scheduling icm %r0,15,__TI_precount(%r9) bnz BASED(io_restore) # preemption disabled + tm __TI_flags+3(%r9),_TIF_NEED_RESCHED + bno BASED(io_restore) + # switch to kernel stack l %r1,SP_R15(%r15) s %r1,BASED(.Lc_spsize) mvc SP_PTREGS(__PT_SIZE,%r1),SP_PTREGS(%r15) xc __SF_BACKCHAIN(4,%r1),__SF_BACKCHAIN(%r1) # clear back chain lr %r15,%r1 -io_resume_loop: - tm __TI_flags+3(%r9),_TIF_NEED_RESCHED - bno BASED(io_restore) + # TRACE_IRQS_ON already done at io_return, call + # TRACE_IRQS_OFF to keep things symmetrical + TRACE_IRQS_OFF l %r1,BASED(.Lpreempt_schedule_irq) - la %r14,BASED(io_resume_loop) - br %r1 # call schedule + basr %r14,%r1 # call preempt_schedule_irq + b BASED(io_return) +#else + b BASED(io_restore) #endif +# +# Need to do work before returning to userspace, switch to kernel stack +# io_work_user: l %r1,__LC_KERNEL_STACK s %r1,BASED(.Lc_spsize) mvc SP_PTREGS(__PT_SIZE,%r1),SP_PTREGS(%r15) xc __SF_BACKCHAIN(4,%r1),__SF_BACKCHAIN(%r1) # clear back chain lr %r15,%r1 + # # One of the work bits is on. Find out which one. -# Checked are: _TIF_SIGPENDING, _TIF_NEED_RESCHED +# Checked are: _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_NEED_RESCHED # and _TIF_MCCK_PENDING # -io_work_loop: +io_work_tif: tm __TI_flags+3(%r9),_TIF_MCCK_PENDING bo BASED(io_mcck_pending) tm __TI_flags+3(%r9),_TIF_NEED_RESCHED bo BASED(io_reschedule) tm __TI_flags+3(%r9),_TIF_SIGPENDING - bnz BASED(io_sigpending) + bo BASED(io_sigpending) tm __TI_flags+3(%r9),_TIF_NOTIFY_RESUME - bnz BASED(io_notify_resume) - b BASED(io_restore) -io_work_done: + bo BASED(io_notify_resume) + b BASED(io_return) # beware of critical section cleanup # # _TIF_MCCK_PENDING is set, call handler # io_mcck_pending: + # TRACE_IRQS_ON already done at io_return l %r1,BASED(.Ls390_handle_mcck) basr %r14,%r1 # TIF bit will be cleared by handler - b BASED(io_work_loop) + TRACE_IRQS_OFF + b BASED(io_return) # # _TIF_NEED_RESCHED is set, call schedule # io_reschedule: - TRACE_IRQS_ON + # TRACE_IRQS_ON already done at io_return l %r1,BASED(.Lschedule) stosm __SF_EMPTY(%r15),0x03 # reenable interrupts basr %r14,%r1 # call scheduler stnsm __SF_EMPTY(%r15),0xfc # disable I/O and ext. interrupts TRACE_IRQS_OFF - tm __TI_flags+3(%r9),_TIF_WORK_INT - bz BASED(io_restore) # there is no work to do - b BASED(io_work_loop) + b BASED(io_return) # # _TIF_SIGPENDING is set, call do_signal # io_sigpending: - TRACE_IRQS_ON + # TRACE_IRQS_ON already done at io_return stosm __SF_EMPTY(%r15),0x03 # reenable interrupts la %r2,SP_PTREGS(%r15) # load pt_regs l %r1,BASED(.Ldo_signal) basr %r14,%r1 # call do_signal stnsm __SF_EMPTY(%r15),0xfc # disable I/O and ext. interrupts TRACE_IRQS_OFF - b BASED(io_work_loop) + b BASED(io_return) # # _TIF_SIGPENDING is set, call do_signal # io_notify_resume: - TRACE_IRQS_ON + # TRACE_IRQS_ON already done at io_return stosm __SF_EMPTY(%r15),0x03 # reenable interrupts la %r2,SP_PTREGS(%r15) # load pt_regs l %r1,BASED(.Ldo_notify_resume) basr %r14,%r1 # call do_signal stnsm __SF_EMPTY(%r15),0xfc # disable I/O and ext. interrupts TRACE_IRQS_OFF - b BASED(io_work_loop) + b BASED(io_return) /* * External interrupt handler routine @@ -890,6 +871,36 @@ restart_crash: restart_go: #endif +# +# PSW restart interrupt handler +# + .globl psw_restart_int_handler +psw_restart_int_handler: + st %r15,__LC_SAVE_AREA+48(%r0) # save r15 + basr %r15,0 +0: l %r15,.Lrestart_stack-0b(%r15) # load restart stack + l %r15,0(%r15) + ahi %r15,-SP_SIZE # make room for pt_regs + stm %r0,%r14,SP_R0(%r15) # store gprs %r0-%r14 to stack + mvc SP_R15(4,%r15),__LC_SAVE_AREA+48(%r0)# store saved %r15 to stack + mvc SP_PSW(8,%r15),__LC_RST_OLD_PSW(%r0) # store restart old psw + xc __SF_BACKCHAIN(4,%r15),__SF_BACKCHAIN(%r15) # set backchain to 0 + basr %r14,0 +1: l %r14,.Ldo_restart-1b(%r14) + basr %r14,%r14 + + basr %r14,0 # load disabled wait PSW if +2: lpsw restart_psw_crash-2b(%r14) # do_restart returns + .align 4 +.Ldo_restart: + .long do_restart +.Lrestart_stack: + .long restart_stack + .align 8 +restart_psw_crash: + .long 0x000a0000,0x00000000 + restart_psw_crash + + #ifdef CONFIG_CHECK_STACK /* * The synchronous or the asynchronous stack overflowed. We are dead. @@ -917,18 +928,14 @@ stack_overflow: cleanup_table_system_call: .long system_call + 0x80000000, sysc_do_svc + 0x80000000 -cleanup_table_sysc_return: - .long sysc_return + 0x80000000, sysc_leave + 0x80000000 -cleanup_table_sysc_leave: - .long sysc_leave + 0x80000000, sysc_done + 0x80000000 -cleanup_table_sysc_work_loop: - .long sysc_work_loop + 0x80000000, sysc_work_done + 0x80000000 -cleanup_table_io_return: - .long io_return + 0x80000000, io_leave + 0x80000000 -cleanup_table_io_leave: - .long io_leave + 0x80000000, io_done + 0x80000000 -cleanup_table_io_work_loop: - .long io_work_loop + 0x80000000, io_work_done + 0x80000000 +cleanup_table_sysc_tif: + .long sysc_tif + 0x80000000, sysc_restore + 0x80000000 +cleanup_table_sysc_restore: + .long sysc_restore + 0x80000000, sysc_done + 0x80000000 +cleanup_table_io_tif: + .long io_tif + 0x80000000, io_restore + 0x80000000 +cleanup_table_io_restore: + .long io_restore + 0x80000000, io_done + 0x80000000 cleanup_critical: clc 4(4,%r12),BASED(cleanup_table_system_call) @@ -936,35 +943,25 @@ cleanup_critical: clc 4(4,%r12),BASED(cleanup_table_system_call+4) bl BASED(cleanup_system_call) 0: - clc 4(4,%r12),BASED(cleanup_table_sysc_return) - bl BASED(0f) - clc 4(4,%r12),BASED(cleanup_table_sysc_return+4) - bl BASED(cleanup_sysc_return) -0: - clc 4(4,%r12),BASED(cleanup_table_sysc_leave) - bl BASED(0f) - clc 4(4,%r12),BASED(cleanup_table_sysc_leave+4) - bl BASED(cleanup_sysc_leave) -0: - clc 4(4,%r12),BASED(cleanup_table_sysc_work_loop) + clc 4(4,%r12),BASED(cleanup_table_sysc_tif) bl BASED(0f) - clc 4(4,%r12),BASED(cleanup_table_sysc_work_loop+4) - bl BASED(cleanup_sysc_return) + clc 4(4,%r12),BASED(cleanup_table_sysc_tif+4) + bl BASED(cleanup_sysc_tif) 0: - clc 4(4,%r12),BASED(cleanup_table_io_return) + clc 4(4,%r12),BASED(cleanup_table_sysc_restore) bl BASED(0f) - clc 4(4,%r12),BASED(cleanup_table_io_return+4) - bl BASED(cleanup_io_return) + clc 4(4,%r12),BASED(cleanup_table_sysc_restore+4) + bl BASED(cleanup_sysc_restore) 0: - clc 4(4,%r12),BASED(cleanup_table_io_leave) + clc 4(4,%r12),BASED(cleanup_table_io_tif) bl BASED(0f) - clc 4(4,%r12),BASED(cleanup_table_io_leave+4) - bl BASED(cleanup_io_leave) + clc 4(4,%r12),BASED(cleanup_table_io_tif+4) + bl BASED(cleanup_io_tif) 0: - clc 4(4,%r12),BASED(cleanup_table_io_work_loop) + clc 4(4,%r12),BASED(cleanup_table_io_restore) bl BASED(0f) - clc 4(4,%r12),BASED(cleanup_table_io_work_loop+4) - bl BASED(cleanup_io_return) + clc 4(4,%r12),BASED(cleanup_table_io_restore+4) + bl BASED(cleanup_io_restore) 0: br %r14 @@ -1011,17 +1008,17 @@ cleanup_system_call_insn: .long sysc_stime + 0x80000000 .long sysc_update + 0x80000000 -cleanup_sysc_return: +cleanup_sysc_tif: mvc __LC_RETURN_PSW(4),0(%r12) - mvc __LC_RETURN_PSW+4(4),BASED(cleanup_table_sysc_return) + mvc __LC_RETURN_PSW+4(4),BASED(cleanup_table_sysc_tif) la %r12,__LC_RETURN_PSW br %r14 -cleanup_sysc_leave: - clc 4(4,%r12),BASED(cleanup_sysc_leave_insn) +cleanup_sysc_restore: + clc 4(4,%r12),BASED(cleanup_sysc_restore_insn) be BASED(2f) mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER - clc 4(4,%r12),BASED(cleanup_sysc_leave_insn+4) + clc 4(4,%r12),BASED(cleanup_sysc_restore_insn+4) be BASED(2f) mvc __LC_RETURN_PSW(8),SP_PSW(%r15) c %r12,BASED(.Lmck_old_psw) @@ -1033,21 +1030,21 @@ cleanup_sysc_leave: l %r15,SP_R15(%r15) 2: la %r12,__LC_RETURN_PSW br %r14 -cleanup_sysc_leave_insn: +cleanup_sysc_restore_insn: .long sysc_done - 4 + 0x80000000 .long sysc_done - 8 + 0x80000000 -cleanup_io_return: +cleanup_io_tif: mvc __LC_RETURN_PSW(4),0(%r12) - mvc __LC_RETURN_PSW+4(4),BASED(cleanup_table_io_work_loop) + mvc __LC_RETURN_PSW+4(4),BASED(cleanup_table_io_tif) la %r12,__LC_RETURN_PSW br %r14 -cleanup_io_leave: - clc 4(4,%r12),BASED(cleanup_io_leave_insn) +cleanup_io_restore: + clc 4(4,%r12),BASED(cleanup_io_restore_insn) be BASED(2f) mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER - clc 4(4,%r12),BASED(cleanup_io_leave_insn+4) + clc 4(4,%r12),BASED(cleanup_io_restore_insn+4) be BASED(2f) mvc __LC_RETURN_PSW(8),SP_PSW(%r15) c %r12,BASED(.Lmck_old_psw) @@ -1059,7 +1056,7 @@ cleanup_io_leave: l %r15,SP_R15(%r15) 2: la %r12,__LC_RETURN_PSW br %r14 -cleanup_io_leave_insn: +cleanup_io_restore_insn: .long io_done - 4 + 0x80000000 .long io_done - 8 + 0x80000000 diff -uprN linux-2.6.32/arch/s390/kernel/ftrace.c linux-2.6.32.ovz/arch/s390/kernel/ftrace.c --- linux-2.6.32/arch/s390/kernel/ftrace.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/ftrace.c 2015-03-10 13:24:02.000000000 -0700 @@ -203,8 +203,8 @@ out: #ifdef CONFIG_FTRACE_SYSCALLS -extern unsigned long __start_syscalls_metadata[]; -extern unsigned long __stop_syscalls_metadata[]; +extern struct syscall_metadata *__start_syscalls_metadata[]; +extern struct syscall_metadata *__stop_syscalls_metadata[]; extern unsigned int sys_call_table[]; static struct syscall_metadata **syscalls_metadata; @@ -217,7 +217,7 @@ struct syscall_metadata *syscall_nr_to_m return syscalls_metadata[nr]; } -int syscall_name_to_nr(char *name) +int syscall_name_to_nr(const char *name) { int i; @@ -242,16 +242,16 @@ void set_syscall_exit_id(int num, int id static struct syscall_metadata *find_syscall_meta(unsigned long syscall) { - struct syscall_metadata *start; - struct syscall_metadata *stop; + struct syscall_metadata **start; + struct syscall_metadata **stop; char str[KSYM_SYMBOL_LEN]; - start = (struct syscall_metadata *)__start_syscalls_metadata; - stop = (struct syscall_metadata *)__stop_syscalls_metadata; + start = __start_syscalls_metadata; + stop = __stop_syscalls_metadata; kallsyms_lookup(syscall, NULL, NULL, NULL, str); for ( ; start < stop; start++) { - if (start->name && !strcmp(start->name + 3, str + 3)) + if ((*start)->name && !strcmp((*start)->name + 3, str + 3)) return start; } return NULL; diff -uprN linux-2.6.32/arch/s390/kernel/head31.S linux-2.6.32.ovz/arch/s390/kernel/head31.S --- linux-2.6.32/arch/s390/kernel/head31.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/head31.S 2015-03-10 13:24:08.000000000 -0700 @@ -15,8 +15,6 @@ startup_continue: basr %r13,0 # get base .LPG1: - - mvi __LC_AR_MODE_ID,0 # set ESA flag (mode 0) lctl %c0,%c15,.Lctl-.LPG1(%r13) # load control registers l %r12,.Lparmaddr-.LPG1(%r13) # pointer to parameter area # move IPL device to lowcore diff -uprN linux-2.6.32/arch/s390/kernel/head64.S linux-2.6.32.ovz/arch/s390/kernel/head64.S --- linux-2.6.32/arch/s390/kernel/head64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/head64.S 2015-03-10 13:24:08.000000000 -0700 @@ -79,10 +79,10 @@ startup_continue: #else lhi %r1,1 # mode 1 = esame (normal ipl) #endif /* CONFIG_ZFCPDUMP */ - mvi __LC_AR_MODE_ID,1 # set esame flag - slr %r0,%r0 # set cpuid to zero - sigp %r1,%r0,0x12 # switch to esame mode + /* Continue with 64bit startup code in head64.S */ sam64 # switch to 64 bit mode + llgfr %r13,%r13 # clear high-order half of base reg + lmh %r0,%r15,.Lzero64-.LPG1(%r13) # clear high-order half lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers lg %r12,.Lparmaddr-.LPG1(%r13) # pointer to parameter area # move IPL device to lowcore @@ -122,11 +122,12 @@ startup_continue: .quad 0 # cr12: tracing off .quad 0 # cr13: home space segment table .quad 0xc0000000 # cr14: machine check handling off - .quad 0 # cr15: linkage stack operations + .quad .Llinkage_stack # cr15: linkage stack operations .Lpcmsk:.quad 0x0000000180000000 .L4malign:.quad 0xffffffffffc00000 .Lscan2g:.quad 0x80000000 + 0x20000 - 8 # 2GB + 128K - 8 .Lnop: .long 0x07000700 +.Lzero64:.fill 16,4,0x0 #ifdef CONFIG_ZFCPDUMP .Lcurrent_cpu: .long 0x0 @@ -136,12 +137,15 @@ startup_continue: .Lparmaddr: .quad PARMAREA .align 64 -.Lduct: .long 0,0,0,0,.Lduald,0,0,0 +.Lduct: .long 0,.Laste,.Laste,0,.Lduald,0,0,0 .long 0,0,0,0,0,0,0,0 +.Laste: .quad 0,0xffffffffffffffff,0,0,0,0,0,0 .align 128 .Lduald:.rept 8 .long 0x80000000,0,0,0 # invalid access-list entries .endr +.Llinkage_stack: + .long 0,0,0x89000000,0,0,0,0x8a000000,0 .org 0x12000 .globl _ehead diff -uprN linux-2.6.32/arch/s390/kernel/head_kdump.S linux-2.6.32.ovz/arch/s390/kernel/head_kdump.S --- linux-2.6.32/arch/s390/kernel/head_kdump.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/head_kdump.S 2015-03-10 13:23:04.000000000 -0700 @@ -0,0 +1,119 @@ +/* + * S390 kdump lowlevel functions (new kernel) + * + * Copyright IBM Corp. 2011 + * Author(s): Michael Holzheu + */ + +#define DATAMOVER_ADDR 0x4000 +#define COPY_PAGE_ADDR 0x6000 + +#ifdef CONFIG_CRASH_DUMP + +# +# kdump entry (new kernel - not yet relocated) +# +# Note: This code has to be position independent +# + +.align 2 +.Lep_startup_kdump: + basr %r13,0 +.Lbase: + lhi %r1,2 # mode 2 = esame (dump) + sigp %r1,%r0,0x12 # Switch to esame mode + sam64 # Switch to 64 bit addressing + larl %r2,.Lbase_addr # Check, if we have been + lg %r2,0(%r2) # already relocated: + clgr %r2,%r13 # + jne .Lrelocate # No : Start data mover + lghi %r2,0 # Yes: Start kdump kernel + brasl %r14,startup_kdump_relocated + +.Lrelocate: + larl %r4,startup + lg %r2,0x418(%r4) # Get kdump base + lg %r3,0x420(%r4) # Get kdump size + + larl %r10,.Lcopy_start # Source of data mover + lghi %r8,DATAMOVER_ADDR # Target of data mover + mvc 0(256,%r8),0(%r10) # Copy data mover code + + agr %r8,%r2 # Copy data mover to + mvc 0(256,%r8),0(%r10) # reserved mem + + lghi %r14,DATAMOVER_ADDR # Jump to copied data mover + basr %r14,%r14 +.Lbase_addr: + .quad .Lbase + +# +# kdump data mover code (runs at address DATAMOVER_ADDR) +# +# r2: kdump base address +# r3: kdump size +# +.Lcopy_start: + basr %r13,0 # Base +0: + lgr %r11,%r2 # Save kdump base address + lgr %r12,%r2 + agr %r12,%r3 # Compute kdump end address + + lghi %r5,0 + lghi %r10,COPY_PAGE_ADDR # Load copy page address +1: + mvc 0(256,%r10),0(%r5) # Copy old kernel to tmp + mvc 0(256,%r5),0(%r11) # Copy new kernel to old + mvc 0(256,%r11),0(%r10) # Copy tmp to new + aghi %r11,256 + aghi %r5,256 + clgr %r11,%r12 + jl 1b + + lg %r14,.Lstartup_kdump-0b(%r13) + basr %r14,%r14 # Start relocated kernel +.Lstartup_kdump: + .long 0x00000000,0x00000000 + startup_kdump_relocated +.Lcopy_end: + +# +# Startup of kdump (relocated new kernel) +# +.align 2 +startup_kdump_relocated: + basr %r13,0 +0: + mvc 0(8,%r0),.Lrestart_psw-0b(%r13) # Setup restart PSW + mvc 464(16,%r0),.Lpgm_psw-0b(%r13) # Setup pgm check PSW + lhi %r1,1 # Start new kernel + diag %r1,%r1,0x308 # with diag 308 + +.Lno_diag308: # No diag 308 + sam31 # Switch to 31 bit addr mode + sr %r1,%r1 # Erase register r1 + sr %r2,%r2 # Erase register r2 + sigp %r1,%r2,0x12 # Switch to 31 bit arch mode + lpsw 0 # Start new kernel... +.align 8 +.Lrestart_psw: + .long 0x00080000,0x80000000 + startup +.Lpgm_psw: + .quad 0x0000000180000000,0x0000000000000000 + .Lno_diag308 +#else +.align 2 +.Lep_startup_kdump: +#ifdef CONFIG_64BIT + larl %r13,startup_kdump_crash + lpswe 0(%r13) +.align 8 +startup_kdump_crash: + .quad 0x0002000080000000,0x0000000000000000 + startup_kdump_crash +#else + basr %r13,0 +0: lpsw startup_kdump_crash-0b(%r13) +.align 8 +startup_kdump_crash: + .long 0x000a0000,0x00000000 + startup_kdump_crash +#endif /* CONFIG_64BIT */ +#endif /* CONFIG_CRASH_DUMP */ diff -uprN linux-2.6.32/arch/s390/kernel/head.S linux-2.6.32.ovz/arch/s390/kernel/head.S --- linux-2.6.32/arch/s390/kernel/head.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/head.S 2015-03-10 13:24:08.000000000 -0700 @@ -172,7 +172,7 @@ __HEAD .long 0x02000370,0x60000050 # the channel program the PSW .long 0x020003c0,0x60000050 # at location 0 is loaded. .long 0x02000410,0x60000050 # Initial processing starts - .long 0x02000460,0x60000050 # at 0xf0 = iplstart. + .long 0x02000460,0x60000050 # at 0x200 = iplstart. .long 0x020004b0,0x60000050 .long 0x02000500,0x60000050 .long 0x02000550,0x60000050 @@ -182,11 +182,37 @@ __HEAD .long 0x02000690,0x60000050 .long 0x020006e0,0x20000050 - .org 0xf0 + .org 0x200 + +# +# subroutine to wait for end I/O +# +.Lirqwait: +#ifdef CONFIG_64BIT + mvc 0x1f0(16),.Lnewpsw # set up IO interrupt psw + lpsw .Lwaitpsw +.Lioint: + br %r14 + .align 8 +.Lnewpsw: + .quad 0x0000000080000000,.Lioint +#else + mvc 0x78(8),.Lnewpsw # set up IO interrupt psw + lpsw .Lwaitpsw +.Lioint: + br %r14 + .align 8 +.Lnewpsw: + .long 0x00080000,0x80000000+.Lioint +#endif +.Lwaitpsw: + .long 0x020a0000,0x80000000+.Lioint + # # subroutine for loading cards from the reader # .Lloader: + la %r4,0(%r14) la %r3,.Lorb # r2 = address of orb into r2 la %r5,.Lirb # r4 = address of irb la %r6,.Lccws @@ -203,9 +229,7 @@ __HEAD ssch 0(%r3) # load chunk of 1600 bytes bnz .Llderr .Lwait4irq: - mvc 0x78(8),.Lnewpsw # set up IO interrupt psw - lpsw .Lwaitpsw -.Lioint: + bas %r14,.Lirqwait c %r1,0xb8 # compare subchannel number bne .Lwait4irq tsch 0(%r5) @@ -224,7 +248,7 @@ __HEAD sr %r0,%r3 # #ccws*80-residual=#bytes read ar %r2,%r0 - br %r14 # r2 contains the total size + br %r4 # r2 contains the total size .Lcont: ahi %r2,0x640 # add 0x640 to total size @@ -248,10 +272,6 @@ __HEAD .Lloadp:.long 0,0 .align 8 .Lcrash:.long 0x000a0000,0x00000000 -.Lnewpsw: - .long 0x00080000,0x80000000+.Lioint -.Lwaitpsw: - .long 0x020a0000,0x80000000+.Lioint .align 8 .Lccws: .rept 19 @@ -260,7 +280,30 @@ __HEAD .long 0x02200050,0x00000000 #endif /* CONFIG_IPL_VM */ +#endif /* CONFIG_IPL */ + +# +# subroutine to set architecture mode +# +.Lsetmode: +#ifdef CONFIG_64BIT + mvi __LC_AR_MODE_ID,1 # set esame flag + slr %r0,%r0 # set cpuid to zero + lhi %r1,2 # mode 2 = esame (dump) + sigp %r1,%r0,0x12 # switch to esame mode + bras %r13,0f + .fill 16,4,0x0 +0: lmh %r0,%r15,0(%r13) # clear high-order half of gprs + sam31 # switch to 31 bit addressing mode +#else + mvi __LC_AR_MODE_ID,0 # set ESA flag (mode 0) +#endif + br %r14 + +#ifdef CONFIG_IPL + iplstart: + bas %r14,.Lsetmode # Immediately switch to 64 bit mode lh %r1,0xb8 # test if subchannel number bct %r1,.Lnoload # is valid l %r1,0xb8 # load ipl subchannel number @@ -343,8 +386,8 @@ iplstart: # # reset files in VM reader # - stidp __LC_CPUID # store cpuid - tm __LC_CPUID,0xff # running VM ? + stidp .Lcpuid # store cpuid + tm .Lcpuid,0xff # running VM ? bno .Lnoreset la %r2,.Lreset lhi %r3,26 @@ -356,22 +399,14 @@ iplstart: tm 31(%r5),0xff # bits is set in the schib bz .Lnoreset .Lwaitforirq: - mvc 0x78(8),.Lrdrnewpsw # set up IO interrupt psw -.Lwaitrdrirq: - lpsw .Lrdrwaitpsw -.Lrdrint: + bas %r14,.Lirqwait # wait for IO interrupt c %r1,0xb8 # compare subchannel number - bne .Lwaitrdrirq + bne .Lwaitforirq la %r5,.Lirb tsch 0(%r5) .Lnoreset: b .Lnoload - .align 8 -.Lrdrnewpsw: - .long 0x00080000,0x80000000+.Lrdrint -.Lrdrwaitpsw: - .long 0x020a0000,0x80000000+.Lrdrint #endif # @@ -390,6 +425,8 @@ iplstart: .byte 0xc8,0xd6,0xd3,0xc4 # "change rdr all keep nohold" .L_eof: .long 0xc5d6c600 /* C'EOF' */ .L_hdr: .long 0xc8c4d900 /* C'HDR' */ + .align 8 +.Lcpuid:.fill 8,1,0 #endif /* CONFIG_IPL */ @@ -402,6 +439,7 @@ iplstart: .globl start start: stm %r0,%r15,0x07b0 # store registers + bas %r14,.Lsetmode # Immediately switch to 64 bit mode basr %r12,%r0 .base: l %r11,.parm @@ -467,10 +505,40 @@ start: # or linload or SALIPL # .org 0x10000 -startup:basr %r13,0 # get base +startup: + j .Lep_startup_normal +# +# This is a list of s390 kernel entry points. At address 0x1000f the number of +# valid entry points is stored. +# +# IMPORTANT: Do not change this table, it is s390 kernel ABI! +# + .ascii "S390EP" + .byte 0x00,0x01 +# +# kdump startup-code at 0x10010, running in 64 bit absolute addressing mode +# + .org 0x10010 +startup_kdump: + j .Lep_startup_kdump +.Lep_startup_normal: +#ifdef CONFIG_64BIT + mvi __LC_AR_MODE_ID,1 # set esame flag + slr %r0,%r0 # set cpuid to zero + lhi %r1,2 # mode 2 = esame (dump) + sigp %r1,%r0,0x12 # switch to esame mode + bras %r13,0f + .fill 16,4,0x0 +0: lmh %r0,%r15,0(%r13) # clear high-order half of gprs + sam31 # switch to 31 bit addressing mode +#else + mvi __LC_AR_MODE_ID,0 # set ESA flag (mode 0) +#endif + basr %r13,0 # get base .LPG0: xc 0x200(256),0x200 # partially clear lowcore xc 0x300(256),0x300 + xc 0xe00(256),0xe00 l %r1,5f-.LPG0(%r13) stck 0(%r1) spt 6f-.LPG0(%r13) @@ -540,6 +608,8 @@ startup:basr %r13,0 # get base .align 8 6: .long 0x7fffffff,0xffffffff +#include "head_kdump.S" + # # params at 10400 (setup.h) # @@ -547,6 +617,8 @@ startup:basr %r13,0 # get base .long 0,0 # IPL_DEVICE .long 0,0 # INITRD_START .long 0,0 # INITRD_SIZE + .long 0,0 # OLDMEM_BASE + .long 0,0 # OLDMEM_SIZE .org COMMAND_LINE .byte "root=/dev/ram0 ro" diff -uprN linux-2.6.32/arch/s390/kernel/ipl.c linux-2.6.32.ovz/arch/s390/kernel/ipl.c --- linux-2.6.32/arch/s390/kernel/ipl.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/ipl.c 2015-03-10 13:23:55.000000000 -0700 @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include #include @@ -44,11 +46,13 @@ * - halt * - power off * - reipl + * - restart */ #define ON_PANIC_STR "on_panic" #define ON_HALT_STR "on_halt" #define ON_POFF_STR "on_poff" #define ON_REIPL_STR "on_reboot" +#define ON_RESTART_STR "on_restart" struct shutdown_action; struct shutdown_trigger { @@ -402,8 +406,9 @@ static ssize_t sys_ipl_device_show(struc static struct kobj_attribute sys_ipl_device_attr = __ATTR(device, S_IRUGO, sys_ipl_device_show, NULL); -static ssize_t ipl_parameter_read(struct kobject *kobj, struct bin_attribute *attr, - char *buf, loff_t off, size_t count) +static ssize_t ipl_parameter_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) { return memory_read_from_buffer(buf, count, &off, IPL_PARMBLOCK_START, IPL_PARMBLOCK_SIZE); @@ -418,8 +423,9 @@ static struct bin_attribute ipl_paramete .read = &ipl_parameter_read, }; -static ssize_t ipl_scp_data_read(struct kobject *kobj, struct bin_attribute *attr, - char *buf, loff_t off, size_t count) +static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, char *buf, + loff_t off, size_t count) { unsigned int size = IPL_PARMBLOCK_START->ipl_info.fcp.scp_data_len; void *scp_data = &IPL_PARMBLOCK_START->ipl_info.fcp.scp_data; @@ -688,7 +694,7 @@ static struct kobj_attribute sys_reipl_c /* FCP reipl device attributes */ -static ssize_t reipl_fcp_scpdata_read(struct kobject *kobj, +static ssize_t reipl_fcp_scpdata_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { @@ -698,7 +704,7 @@ static ssize_t reipl_fcp_scpdata_read(st return memory_read_from_buffer(buf, count, &off, scp_data, size); } -static ssize_t reipl_fcp_scpdata_write(struct kobject *kobj, +static ssize_t reipl_fcp_scpdata_write(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { @@ -1369,6 +1375,16 @@ static struct kobj_attribute dump_type_a static struct kset *dump_kset; +static void diag308_dump(void *dump_block) +{ + diag308(DIAG308_SET, dump_block); + while (1) { + if (diag308(DIAG308_DUMP, NULL) != 0x302) + break; + udelay_simple(USEC_PER_SEC); + } +} + static void dump_run(struct shutdown_trigger *trigger) { struct ccw_dev_id devid; @@ -1389,12 +1405,10 @@ static void dump_run(struct shutdown_tri __cpcmd(buf, NULL, 0, NULL); break; case DUMP_METHOD_CCW_DIAG: - diag308(DIAG308_SET, dump_block_ccw); - diag308(DIAG308_DUMP, NULL); + diag308_dump(dump_block_ccw); break; case DUMP_METHOD_FCP_DIAG: - diag308(DIAG308_SET, dump_block_fcp); - diag308(DIAG308_DUMP, NULL); + diag308_dump(dump_block_fcp); break; case DUMP_METHOD_NONE: return; @@ -1526,17 +1540,20 @@ static char vmcmd_on_reboot[128]; static char vmcmd_on_panic[128]; static char vmcmd_on_halt[128]; static char vmcmd_on_poff[128]; +static char vmcmd_on_restart[128]; DEFINE_IPL_ATTR_STR_RW(vmcmd, on_reboot, "%s\n", "%s\n", vmcmd_on_reboot); DEFINE_IPL_ATTR_STR_RW(vmcmd, on_panic, "%s\n", "%s\n", vmcmd_on_panic); DEFINE_IPL_ATTR_STR_RW(vmcmd, on_halt, "%s\n", "%s\n", vmcmd_on_halt); DEFINE_IPL_ATTR_STR_RW(vmcmd, on_poff, "%s\n", "%s\n", vmcmd_on_poff); +DEFINE_IPL_ATTR_STR_RW(vmcmd, on_restart, "%s\n", "%s\n", vmcmd_on_restart); static struct attribute *vmcmd_attrs[] = { &sys_vmcmd_on_reboot_attr.attr, &sys_vmcmd_on_panic_attr.attr, &sys_vmcmd_on_halt_attr.attr, &sys_vmcmd_on_poff_attr.attr, + &sys_vmcmd_on_restart_attr.attr, NULL, }; @@ -1558,6 +1575,8 @@ static void vmcmd_run(struct shutdown_tr cmd = vmcmd_on_halt; else if (strcmp(trigger->name, ON_POFF_STR) == 0) cmd = vmcmd_on_poff; + else if (strcmp(trigger->name, ON_RESTART_STR) == 0) + cmd = vmcmd_on_restart; else return; @@ -1593,7 +1612,8 @@ static struct shutdown_action vmcmd_acti static void stop_run(struct shutdown_trigger *trigger) { - if (strcmp(trigger->name, ON_PANIC_STR) == 0) + if (strcmp(trigger->name, ON_PANIC_STR) == 0 || + strcmp(trigger->name, ON_RESTART_STR) == 0) disabled_wait((unsigned long) __builtin_return_address(0)); while (signal_processor(smp_processor_id(), sigp_stop) == sigp_busy) cpu_relax(); @@ -1685,10 +1705,46 @@ static struct kobj_attribute on_panic_at static void do_panic(void) { + lgr_info_log(); on_panic_trigger.action->fn(&on_panic_trigger); stop_run(&on_panic_trigger); } +/* on restart */ + +static struct shutdown_trigger on_restart_trigger = {ON_RESTART_STR, + &stop_action}; + +static ssize_t on_restart_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%s\n", on_restart_trigger.action->name); +} + +static ssize_t on_restart_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t len) +{ + return set_trigger(buf, &on_restart_trigger, len); +} + +static struct kobj_attribute on_restart_attr = + __ATTR(on_restart, 0644, on_restart_show, on_restart_store); + +void do_restart(void) +{ + tracing_off(); + debug_locks_off(); + lgr_info_log(); + smp_restart_with_online_cpu(); + smp_send_stop(); +#ifdef CONFIG_CRASH_DUMP + crash_kexec(NULL); +#endif + on_restart_trigger.action->fn(&on_restart_trigger); + stop_run(&on_restart_trigger); +} + /* on halt */ static struct shutdown_trigger on_halt_trigger = {ON_HALT_STR, &stop_action}; @@ -1765,7 +1821,9 @@ static void __init shutdown_triggers_ini if (sysfs_create_file(&shutdown_actions_kset->kobj, &on_poff_attr.attr)) goto fail; - + if (sysfs_create_file(&shutdown_actions_kset->kobj, + &on_restart_attr.attr)) + goto fail; return; fail: panic("shutdown_triggers_init failed\n"); @@ -1939,17 +1997,25 @@ void unregister_reset_call(struct reset_ } EXPORT_SYMBOL_GPL(unregister_reset_call); +extern void do_reset_diag308(void); + static void do_reset_calls(void) { struct reset_call *reset; +#ifdef CONFIG_64BIT + if (diag308_set_works) { + do_reset_diag308(); + return; + } +#endif list_for_each_entry(reset, &rcall, list) reset->fn(); } u32 dump_prefix_page; -void s390_reset_system(void) +void s390_reset_system(void (*func)(void *), void *data) { struct _lowcore *lc; @@ -1977,6 +2043,10 @@ void s390_reset_system(void) S390_lowcore.program_new_psw.addr = PSW_ADDR_AMODE | (unsigned long) s390_base_pgm_handler; + /* Store status at absolute zero */ + store_status(); + do_reset_calls(); + if (func) + func(data); } - diff -uprN linux-2.6.32/arch/s390/kernel/kprobes.c linux-2.6.32.ovz/arch/s390/kernel/kprobes.c --- linux-2.6.32/arch/s390/kernel/kprobes.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/kprobes.c 2015-03-10 13:21:35.000000000 -0700 @@ -62,6 +62,8 @@ int __kprobes is_prohibited_opcode(kprob case 0x0b: /* bsm */ case 0x83: /* diag */ case 0x44: /* ex */ + case 0xac: /* stnsm */ + case 0xad: /* stosm */ return -EINVAL; } switch (*(__u16 *) instruction) { @@ -71,6 +73,7 @@ int __kprobes is_prohibited_opcode(kprob case 0xb258: /* bsg */ case 0xb218: /* pc */ case 0xb228: /* pt */ + case 0xb98d: /* epsw */ return -EINVAL; } return 0; diff -uprN linux-2.6.32/arch/s390/kernel/lgr.c linux-2.6.32.ovz/arch/s390/kernel/lgr.c --- linux-2.6.32/arch/s390/kernel/lgr.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/lgr.c 2015-03-10 13:23:23.000000000 -0700 @@ -0,0 +1,195 @@ +/* + * Linux Guest Relocation (LGR) detection + * + * Copyright IBM Corp. 2012 + * Author(s): Michael Holzheu + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define LGR_TIMER_INTERVAL_SECS (30 * 60) +#define VM_LEVEL_MAX 2 /* Maximum is 8, but we only record two levels */ + +/* + * LGR info: Contains stfle and stsi data + */ +struct lgr_info { + /* Bit field with facility information: 4 DWORDs are stored */ + u64 stfle_fac_list[4]; + /* Level of system (1 = CEC, 2 = LPAR, 3 = z/VM */ + u32 level; + /* Level 1: CEC info (stsi 1.1.1) */ + char manufacturer[16]; + char type[4]; + char sequence[16]; + char plant[4]; + char model[16]; + /* Level 2: LPAR info (stsi 2.2.2) */ + u16 lpar_number; + char name[8]; + /* Level 3: VM info (stsi 3.2.2) */ + u8 vm_count; + struct { + char name[8]; + char cpi[16]; + } vm[VM_LEVEL_MAX]; +} __packed __aligned(8); + +/* + * LGR globals + */ +static char lgr_page[PAGE_SIZE] __aligned(PAGE_SIZE); +static struct lgr_info lgr_info_last; +static struct lgr_info lgr_info_cur; +static struct debug_info *lgr_dbf; + +/* + * Return number of valid stsi levels + */ +static inline int stsi_0(void) +{ + int rc = stsi(NULL, 0, 0, 0); + + return rc == -ENOSYS ? rc : (((unsigned int) rc) >> 28); +} + +/* + * Copy buffer and then convert it to ASCII + */ +static void cpascii(char *dst, char *src, int size) +{ + memcpy(dst, src, size); + EBCASC(dst, size); +} + +/* + * Fill LGR info with 1.1.1 stsi data + */ +static void lgr_stsi_1_1_1(struct lgr_info *lgr_info) +{ + struct sysinfo_1_1_1 *si = (void *) lgr_page; + + if (stsi(si, 1, 1, 1) == -ENOSYS) + return; + cpascii(lgr_info->manufacturer, si->manufacturer, + sizeof(si->manufacturer)); + cpascii(lgr_info->type, si->type, sizeof(si->type)); + cpascii(lgr_info->model, si->model, sizeof(si->model)); + cpascii(lgr_info->sequence, si->sequence, sizeof(si->sequence)); + cpascii(lgr_info->plant, si->plant, sizeof(si->plant)); +} + +/* + * Fill LGR info with 2.2.2 stsi data + */ +static void lgr_stsi_2_2_2(struct lgr_info *lgr_info) +{ + struct sysinfo_2_2_2 *si = (void *) lgr_page; + + if (stsi(si, 2, 2, 2) == -ENOSYS) + return; + cpascii(lgr_info->name, si->name, sizeof(si->name)); + memcpy(&lgr_info->lpar_number, &si->lpar_number, + sizeof(lgr_info->lpar_number)); +} + +/* + * Fill LGR info with 3.2.2 stsi data + */ +static void lgr_stsi_3_2_2(struct lgr_info *lgr_info) +{ + struct sysinfo_3_2_2 *si = (void *) lgr_page; + int i; + + if (stsi(si, 3, 2, 2) == -ENOSYS) + return; + for (i = 0; i < min_t(u8, si->count, VM_LEVEL_MAX); i++) { + cpascii(lgr_info->vm[i].name, si->vm[i].name, + sizeof(si->vm[i].name)); + cpascii(lgr_info->vm[i].cpi, si->vm[i].cpi, + sizeof(si->vm[i].cpi)); + } + lgr_info->vm_count = si->count; +} + +/* + * Fill LGR info with current data + */ +static void lgr_info_get(struct lgr_info *lgr_info) +{ + memset(lgr_info, 0, sizeof(*lgr_info)); + stfle(lgr_info->stfle_fac_list, ARRAY_SIZE(lgr_info->stfle_fac_list)); + lgr_info->level = stsi_0(); + if (lgr_info->level == -ENOSYS) + return; + if (lgr_info->level >= 1) + lgr_stsi_1_1_1(lgr_info); + if (lgr_info->level >= 2) + lgr_stsi_2_2_2(lgr_info); + if (lgr_info->level >= 3) + lgr_stsi_3_2_2(lgr_info); +} + +/* + * Check if LGR info has changed and if yes log new LGR info to s390dbf + */ +void lgr_info_log(void) +{ + static DEFINE_SPINLOCK(lgr_info_lock); + unsigned long flags; + + if (!spin_trylock_irqsave(&lgr_info_lock, flags)) + return; + lgr_info_get(&lgr_info_cur); + if (memcmp(&lgr_info_last, &lgr_info_cur, sizeof(lgr_info_cur)) != 0) { + debug_event(lgr_dbf, 1, &lgr_info_cur, sizeof(lgr_info_cur)); + lgr_info_last = lgr_info_cur; + } + spin_unlock_irqrestore(&lgr_info_lock, flags); +} +EXPORT_SYMBOL_GPL(lgr_info_log); + +static void lgr_timer_set(void); + +/* + * LGR timer callback + */ +static void lgr_timer_fn(unsigned long ignored) +{ + lgr_info_log(); + lgr_timer_set(); +} + +static struct timer_list lgr_timer = + TIMER_INITIALIZER(lgr_timer_fn, 0, 0); + +/* + * Setup next LGR timer + */ +static void lgr_timer_set(void) +{ + mod_timer(&lgr_timer, jiffies + LGR_TIMER_INTERVAL_SECS * HZ); +} + +/* + * Initialize LGR: Add s390dbf, write initial lgr_info and setup timer + */ +static int __init lgr_init(void) +{ + lgr_dbf = debug_register("lgr", 1, 1, sizeof(struct lgr_info)); + if (!lgr_dbf) + return -ENOMEM; + debug_register_view(lgr_dbf, &debug_hex_ascii_view); + lgr_info_get(&lgr_info_last); + debug_event(lgr_dbf, 1, &lgr_info_last, sizeof(lgr_info_last)); + lgr_timer_set(); + return 0; +} +module_init(lgr_init); diff -uprN linux-2.6.32/arch/s390/kernel/machine_kexec.c linux-2.6.32.ovz/arch/s390/kernel/machine_kexec.c --- linux-2.6.32/arch/s390/kernel/machine_kexec.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/machine_kexec.c 2015-03-10 13:23:55.000000000 -0700 @@ -1,10 +1,11 @@ /* * arch/s390/kernel/machine_kexec.c * - * Copyright IBM Corp. 2005,2006 + * Copyright IBM Corp. 2005,2011 * * Author(s): Rolf Adelsberger, * Heiko Carstens + * Michael Holzheu */ #include @@ -12,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -20,12 +22,192 @@ #include #include #include +#include +#include typedef void (*relocate_kernel_t)(kimage_entry_t *, unsigned long); extern const unsigned char relocate_kernel[]; extern const unsigned long long relocate_kernel_len; +#ifdef CONFIG_CRASH_DUMP + +void *fill_cpu_elf_notes(void *ptr, struct save_area_s390x *sa); + +/* + * Create ELF notes for one CPU + */ +static void add_elf_notes(int cpu) +{ + struct save_area_s390x *sa = (void *) 4608 + store_prefix(); + void *ptr; + + memcpy((void *) (4608UL + sa->pref_reg), sa, sizeof(*sa)); + ptr = (u64 *) per_cpu_ptr(crash_notes, cpu); + ptr = fill_cpu_elf_notes(ptr, sa); + memset(ptr, 0, sizeof(struct elf_note)); +} + +/* + * Store status of next available physical CPU + */ +static int store_status_next(int start_cpu, int this_cpu) +{ + struct save_area_s390x *sa = (void *) 4608 + store_prefix(); + int cpu, rc; + + for (cpu = start_cpu; cpu < 65536; cpu++) { + if (cpu == this_cpu) + continue; + do { + rc = raw_sigp(cpu, sigp_stop_and_store_status); + } while (rc == sigp_busy); + if (rc != sigp_order_code_accepted) + continue; + if (sa->pref_reg) + return cpu; + } + return -1; +} + +/* + * Initialize CPU ELF notes + */ +void setup_regs(void) +{ + unsigned long sa = S390_lowcore.prefixreg_save_area + SAVE_AREA_BASE; + int cpu, this_cpu, phys_cpu = 0, first = 1; + + this_cpu = stap(); + + if (!S390_lowcore.prefixreg_save_area) + first = 0; + for_each_online_cpu(cpu) { + if (first) { + add_elf_notes(cpu); + first = 0; + continue; + } + phys_cpu = store_status_next(phys_cpu, this_cpu); + if (phys_cpu == -1) + break; + add_elf_notes(cpu); + phys_cpu++; + } + /* Copy dump CPU store status info to absolute zero */ + memcpy((void *) SAVE_AREA_BASE, (void *) sa, + sizeof(struct save_area_s390x)); +} + +/* + * PM notifier callback for kdump + */ +static int machine_kdump_pm_cb(struct notifier_block *nb, unsigned long action, + void *ptr) +{ + switch (action) { + case PM_SUSPEND_PREPARE: + case PM_HIBERNATION_PREPARE: + if (crashk_res.start) + crash_map_reserved_pages(); + break; + case PM_POST_SUSPEND: + case PM_POST_HIBERNATION: + if (crashk_res.start) + crash_unmap_reserved_pages(); + break; + default: + return NOTIFY_DONE; + } + return NOTIFY_OK; +} + +static int __init machine_kdump_pm_init(void) +{ + pm_notifier(machine_kdump_pm_cb, 0); + return 0; +} +arch_initcall(machine_kdump_pm_init); +#endif + +/* + * Start kdump: We expect here that a store status has been done on our CPU + */ +static void __do_machine_kdump(void *image) +{ +#ifdef CONFIG_CRASH_DUMP + int (*start_kdump)(int) = (void *)((struct kimage *) image)->start; + + setup_regs(); + __load_psw_mask(PSW_BASE_BITS | PSW_DEFAULT_KEY); + start_kdump(1); +#endif +} + +/* + * Check if kdump checksums are valid: We call purgatory with parameter "0" + */ +static int kdump_csum_valid(struct kimage *image) +{ +#ifdef CONFIG_CRASH_DUMP + int (*start_kdump)(int) = (void *)image->start; + int rc; + + __raw_local_irq_stnsm(0xfb); /* disable DAT */ + rc = start_kdump(0); + __raw_local_irq_stosm(0x04); /* enable DAT */ + return rc ? 0 : -EINVAL; +#else + return -EINVAL; +#endif +} + +/* + * Map or unmap crashkernel memory + */ +static void crash_map_pages(int enable) +{ + unsigned long size = resource_size(&crashk_res); + + BUG_ON(crashk_res.start % KEXEC_CRASH_MEM_ALIGN || + size % KEXEC_CRASH_MEM_ALIGN); + if (enable) + vmem_add_mapping(crashk_res.start, size); + else + vmem_remove_mapping(crashk_res.start, size); +} + +/* + * Map crashkernel memory + */ +void crash_map_reserved_pages(void) +{ + crash_map_pages(1); +} + +/* + * Unmap crashkernel memory + */ +void crash_unmap_reserved_pages(void) +{ + crash_map_pages(0); +} + +/* + * Give back memory to hypervisor before new kdump is loaded + */ +static int machine_kexec_prepare_kdump(void) +{ +#ifdef CONFIG_CRASH_DUMP + if (MACHINE_IS_VM) + diag10_range(PFN_DOWN(crashk_res.start), + PFN_DOWN(crashk_res.end - crashk_res.start + 1)); + return 0; +#else + return -EINVAL; +#endif +} + int machine_kexec_prepare(struct kimage *image) { void *reboot_code_buffer; @@ -34,6 +216,9 @@ int machine_kexec_prepare(struct kimage if (ipl_flags & IPL_NSS_VALID) return -ENOSYS; + if (image->type == KEXEC_TYPE_CRASH) + return machine_kexec_prepare_kdump(); + /* We don't support anything but the default image type for now. */ if (image->type != KEXEC_TYPE_DEFAULT) return -EINVAL; @@ -50,21 +235,48 @@ void machine_kexec_cleanup(struct kimage { } +void arch_crash_save_vmcoreinfo(void) +{ + VMCOREINFO_SYMBOL(lowcore_ptr); + VMCOREINFO_SYMBOL(high_memory); + VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); +} + void machine_shutdown(void) { } -void machine_kexec(struct kimage *image) +/* + * Do normal kexec + */ +static void __do_machine_kexec(void *data) { relocate_kernel_t data_mover; - - smp_send_stop(); - pfault_fini(); - s390_reset_system(); + struct kimage *image = data; data_mover = (relocate_kernel_t) page_to_phys(image->control_code_page); /* Call the moving routine */ (*data_mover)(&image->head, image->start); - for (;;); +} + +/* + * Reset system and call either kdump or normal kexec + */ +void machine_kexec(struct kimage *image) +{ + if (image->type == KEXEC_TYPE_CRASH && !kdump_csum_valid(image)) + return; + + smp_send_stop(); + pfault_fini(); + tracing_off(); + debug_locks_off(); + if (image->type == KEXEC_TYPE_CRASH) { + lgr_info_log(); + s390_reset_system(__do_machine_kdump, image); + } else { + s390_reset_system(__do_machine_kexec, image); + } + disabled_wait((unsigned long) __builtin_return_address(0)); } diff -uprN linux-2.6.32/arch/s390/kernel/Makefile linux-2.6.32.ovz/arch/s390/kernel/Makefile --- linux-2.6.32/arch/s390/kernel/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/Makefile 2015-03-10 13:23:55.000000000 -0700 @@ -23,7 +23,7 @@ CFLAGS_sysinfo.o += -Iinclude/math-emu - obj-y := bitmap.o traps.o time.o process.o base.o early.o setup.o \ processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o \ s390_ext.o debug.o irq.o ipl.o dis.o diag.o mem_detect.o \ - vdso.o vtime.o sysinfo.o nmi.o sclp.o + vdso.o vtime.o sysinfo.o nmi.o sclp.o lgr.o obj-y += $(if $(CONFIG_64BIT),entry64.o,entry.o) obj-y += $(if $(CONFIG_64BIT),reipl64.o,reipl.o) @@ -44,6 +44,10 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_FUNCTION_TRACER) += $(if $(CONFIG_64BIT),mcount64.o,mcount.o) obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o +obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +obj-$(CONFIG_64BIT) += runtime_instr.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf.o # Kexec part S390_KEXEC_OBJS := machine_kexec.o crash.o diff -uprN linux-2.6.32/arch/s390/kernel/mem_detect.c linux-2.6.32.ovz/arch/s390/kernel/mem_detect.c --- linux-2.6.32/arch/s390/kernel/mem_detect.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/mem_detect.c 2015-03-10 13:23:04.000000000 -0700 @@ -62,3 +62,84 @@ void detect_memory_layout(struct mem_chu __raw_local_irq_ssm(flags); } EXPORT_SYMBOL(detect_memory_layout); + +/* + * Move memory chunks array from index "from" to index "to" + */ +static void mem_chunk_move(struct mem_chunk chunk[], int to, int from) +{ + int cnt = MEMORY_CHUNKS - to; + + memmove(&chunk[to], &chunk[from], cnt * sizeof(struct mem_chunk)); +} + +/* + * Initialize memory chunk + */ +static void mem_chunk_init(struct mem_chunk *chunk, unsigned long addr, + unsigned long size, int type) +{ + chunk->type = type; + chunk->addr = addr; + chunk->size = size; +} + +/* + * Create memory hole with given address, size, and type + */ +void create_mem_hole(struct mem_chunk chunk[], unsigned long addr, + unsigned long size, int type) +{ + unsigned long lh_start, lh_end, lh_size, ch_start, ch_end, ch_size; + int i, ch_type; + + for (i = 0; i < MEMORY_CHUNKS; i++) { + if (chunk[i].size == 0) + continue; + + /* Define chunk properties */ + ch_start = chunk[i].addr; + ch_size = chunk[i].size; + ch_end = ch_start + ch_size - 1; + ch_type = chunk[i].type; + + /* Is memory chunk hit by memory hole? */ + if (addr + size <= ch_start) + continue; /* No: memory hole in front of chunk */ + if (addr > ch_end) + continue; /* No: memory hole after chunk */ + + /* Yes: Define local hole properties */ + lh_start = max(addr, chunk[i].addr); + lh_end = min(addr + size - 1, ch_end); + lh_size = lh_end - lh_start + 1; + + if (lh_start == ch_start && lh_end == ch_end) { + /* Hole covers complete memory chunk */ + mem_chunk_init(&chunk[i], lh_start, lh_size, type); + } else if (lh_end == ch_end) { + /* Hole starts in memory chunk and convers chunk end */ + mem_chunk_move(chunk, i + 1, i); + mem_chunk_init(&chunk[i], ch_start, ch_size - lh_size, + ch_type); + mem_chunk_init(&chunk[i + 1], lh_start, lh_size, type); + i += 1; + } else if (lh_start == ch_start) { + /* Hole ends in memory chunk */ + mem_chunk_move(chunk, i + 1, i); + mem_chunk_init(&chunk[i], lh_start, lh_size, type); + mem_chunk_init(&chunk[i + 1], lh_end + 1, + ch_size - lh_size, ch_type); + break; + } else { + /* Hole splits memory chunk */ + mem_chunk_move(chunk, i + 2, i); + mem_chunk_init(&chunk[i], ch_start, + lh_start - ch_start, ch_type); + mem_chunk_init(&chunk[i + 1], lh_start, lh_size, type); + mem_chunk_init(&chunk[i + 2], lh_end + 1, + ch_end - lh_end, ch_type); + break; + } + } +} diff -uprN linux-2.6.32/arch/s390/kernel/module.c linux-2.6.32.ovz/arch/s390/kernel/module.c --- linux-2.6.32/arch/s390/kernel/module.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/module.c 2015-03-10 13:23:55.000000000 -0700 @@ -55,8 +55,10 @@ void *module_alloc(unsigned long size) /* Free memory returned from module_alloc */ void module_free(struct module *mod, void *module_region) { - vfree(mod->arch.syminfo); - mod->arch.syminfo = NULL; + if (mod) { + vfree(mod->arch.syminfo); + mod->arch.syminfo = NULL; + } vfree(module_region); } @@ -201,6 +203,8 @@ apply_rela(Elf_Rela *rela, Elf_Addr base val = symtab[r_sym].st_value; switch (r_type) { + case R_390_NONE: /* No relocation. */ + break; case R_390_8: /* Direct 8 bit. */ case R_390_12: /* Direct 12 bit. */ case R_390_16: /* Direct 16 bit. */ diff -uprN linux-2.6.32/arch/s390/kernel/nmi.c linux-2.6.32.ovz/arch/s390/kernel/nmi.c --- linux-2.6.32/arch/s390/kernel/nmi.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/nmi.c 2015-03-10 13:22:07.000000000 -0700 @@ -95,7 +95,6 @@ EXPORT_SYMBOL_GPL(s390_handle_mcck); static int notrace s390_revalidate_registers(struct mci *mci) { int kill_task; - u64 tmpclock; u64 zero; void *fpt_save_area, *fpt_creg_save_area; @@ -214,11 +213,10 @@ static int notrace s390_revalidate_regis : "0", "cc"); #endif /* Revalidate clock comparator register */ - asm volatile( - " stck 0(%1)\n" - " sckc 0(%1)" - : "=m" (tmpclock) : "a" (&(tmpclock)) : "cc", "memory"); - + if (S390_lowcore.clock_comparator == -1) + set_clock_comparator(get_clock()); + else + set_clock_comparator(S390_lowcore.clock_comparator); /* Check if old PSW is valid */ if (!mci->wp) /* diff -uprN linux-2.6.32/arch/s390/kernel/perf_cpum_cf.c linux-2.6.32.ovz/arch/s390/kernel/perf_cpum_cf.c --- linux-2.6.32/arch/s390/kernel/perf_cpum_cf.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/perf_cpum_cf.c 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,692 @@ +/* + * Performance event support for s390x - CPU-measurement Counter Facility + * + * Copyright IBM Corp. 2012 + * Author(s): Hendrik Brueckner + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + */ +#define KMSG_COMPONENT "cpum_cf" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* CPU-measurement counter facility supports these CPU counter sets: + * For CPU counter sets: + * Basic counter set: 0-31 + * Problem-state counter set: 32-63 + * Crypto-activity counter set: 64-127 + * Extented counter set: 128-159 + */ +enum cpumf_ctr_set { + /* CPU counter sets */ + CPUMF_CTR_SET_BASIC = 0, + CPUMF_CTR_SET_USER = 1, + CPUMF_CTR_SET_CRYPTO = 2, + CPUMF_CTR_SET_EXT = 3, + + /* Maximum number of counter sets */ + CPUMF_CTR_SET_MAX, +}; + +#define CPUMF_LCCTL_ENABLE_SHIFT 16 +#define CPUMF_LCCTL_ACTCTL_SHIFT 0 +static const u64 cpumf_state_ctl[CPUMF_CTR_SET_MAX] = { + [CPUMF_CTR_SET_BASIC] = 0x02, + [CPUMF_CTR_SET_USER] = 0x04, + [CPUMF_CTR_SET_CRYPTO] = 0x08, + [CPUMF_CTR_SET_EXT] = 0x01, +}; + +static void ctr_set_enable(u64 *state, int ctr_set) +{ + *state |= cpumf_state_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT; +} +static void ctr_set_disable(u64 *state, int ctr_set) +{ + *state &= ~(cpumf_state_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT); +} +static void ctr_set_start(u64 *state, int ctr_set) +{ + *state |= cpumf_state_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT; +} +static void ctr_set_stop(u64 *state, int ctr_set) +{ + *state &= ~(cpumf_state_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT); +} + +/* Local CPUMF event structure */ +struct cpu_hw_events { + struct cpumf_ctr_info info; + atomic_t ctr_set[CPUMF_CTR_SET_MAX]; + u64 state, tx_state; + unsigned int flags; +}; +static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { + .ctr_set = { + [CPUMF_CTR_SET_BASIC] = ATOMIC_INIT(0), + [CPUMF_CTR_SET_USER] = ATOMIC_INIT(0), + [CPUMF_CTR_SET_CRYPTO] = ATOMIC_INIT(0), + [CPUMF_CTR_SET_EXT] = ATOMIC_INIT(0), + }, + .state = 0, + .flags = 0, +}; + +static int get_counter_set(u64 event) +{ + int set = -1; + + if (event < 32) + set = CPUMF_CTR_SET_BASIC; + else if (event < 64) + set = CPUMF_CTR_SET_USER; + else if (event < 128) + set = CPUMF_CTR_SET_CRYPTO; + else if (event < 256) + set = CPUMF_CTR_SET_EXT; + + return set; +} + +static int validate_event(const struct hw_perf_event *hwc) +{ + switch (hwc->config_base) { + case CPUMF_CTR_SET_BASIC: + case CPUMF_CTR_SET_USER: + case CPUMF_CTR_SET_CRYPTO: + case CPUMF_CTR_SET_EXT: + /* check for reserved counters */ + if ((hwc->config >= 6 && hwc->config <= 31) || + (hwc->config >= 38 && hwc->config <= 63) || + (hwc->config >= 80 && hwc->config <= 127)) + return -EOPNOTSUPP; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int validate_ctr_version(const struct hw_perf_event *hwc) +{ + struct cpu_hw_events *cpuhw; + int err = 0; + + cpuhw = &get_cpu_var(cpu_hw_events); + + /* check required version for counter sets */ + switch (hwc->config_base) { + case CPUMF_CTR_SET_BASIC: + case CPUMF_CTR_SET_USER: + if (cpuhw->info.cfvn < 1) + err = -EOPNOTSUPP; + break; + case CPUMF_CTR_SET_CRYPTO: + case CPUMF_CTR_SET_EXT: + if (cpuhw->info.csvn < 1) + err = -EOPNOTSUPP; + if ((cpuhw->info.csvn == 1 && hwc->config > 159) || + (cpuhw->info.csvn == 2 && hwc->config > 175) || + (cpuhw->info.csvn > 2 && hwc->config > 255)) + err = -EOPNOTSUPP; + break; + } + + put_cpu_var(cpu_hw_events); + return err; +} + +static int validate_ctr_auth(const struct hw_perf_event *hwc) +{ + struct cpu_hw_events *cpuhw; + u64 ctrs_state; + int err = 0; + + cpuhw = &get_cpu_var(cpu_hw_events); + + /* check authorization for cpu counter sets */ + ctrs_state = cpumf_state_ctl[hwc->config_base]; + if (!(ctrs_state & cpuhw->info.auth_ctl)) + err = -EPERM; + + put_cpu_var(cpu_hw_events); + return err; +} + +/* + * Change the CPUMF state to active. + * Enable and activate the CPU-counter sets according + * to the per-cpu control state. + */ +static void cpumf_pmu_enable(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + int err; + + if (cpuhw->flags & PMU_F_ENABLED) + return; + + err = lcctl(cpuhw->state); + if (err) { + pr_err("Enabling the performance measuring unit " + "failed with rc=%x\n", err); + return; + } + + cpuhw->flags |= PMU_F_ENABLED; +} + +/* + * Change the CPUMF state to inactive. + * Disable and enable (inactive) the CPU-counter sets according + * to the per-cpu control state. + */ +static void cpumf_pmu_disable(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + int err; + u64 inactive; + + if (!(cpuhw->flags & PMU_F_ENABLED)) + return; + + inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); + err = lcctl(inactive); + if (err) { + pr_err("Disabling the performance measuring unit " + "failed with rc=%x\n", err); + return; + } + + cpuhw->flags &= ~PMU_F_ENABLED; +} + + +/* Number of perf events counting hardware events */ +static atomic_t num_events = ATOMIC_INIT(0); +/* Used to avoid races in calling reserve/release_cpumf_hardware */ +static DEFINE_MUTEX(pmc_reserve_mutex); + +/* CPU-measurement alerts for the counter facility */ +static void cpumf_measurement_alert(__u16 ext_code) +{ + struct cpu_hw_events *cpuhw; + + if (!(S390_lowcore.ext_params & CPU_MF_INT_CF_MASK)) + return; + + cpuhw = &__get_cpu_var(cpu_hw_events); + + /* Measurement alerts are shared and might happen when the PMU + * is not reserved. Ignore these alerts in this case. */ + if (!(cpuhw->flags & PMU_F_RESERVED)) + return; + + /* counter authorization change alert */ + if (S390_lowcore.ext_params & CPU_MF_INT_CF_CACA) + qctri(&cpuhw->info); + + /* loss of counter data alert */ + if (S390_lowcore.ext_params & CPU_MF_INT_CF_LCDA) + pr_err("CPU[%i] Counter data was lost\n", smp_processor_id()); +} + +#define PMC_INIT 0 +#define PMC_RELEASE 1 +static void setup_pmc_cpu(void *flags) +{ + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + + switch (*((int *) flags)) { + case PMC_INIT: + memset(&cpuhw->info, 0, sizeof(cpuhw->info)); + qctri(&cpuhw->info); + cpuhw->flags |= PMU_F_RESERVED; + break; + + case PMC_RELEASE: + cpuhw->flags &= ~PMU_F_RESERVED; + break; + } + + /* Disable CPU counter sets */ + lcctl(0); +} + +/* Initialize the CPU-measurement facility */ +static int reserve_pmc_hardware(void) +{ + int flags = PMC_INIT; + + on_each_cpu(setup_pmc_cpu, &flags, 1); + measurement_alert_subclass_register(); + + return 0; +} + +/* Release the CPU-measurement facility */ +static void release_pmc_hardware(void) +{ + int flags = PMC_RELEASE; + + on_each_cpu(setup_pmc_cpu, &flags, 1); + measurement_alert_subclass_unregister(); +} + +/* Release the PMU if event is the last perf event */ +static void hw_perf_event_destroy(struct perf_event *event) +{ + if (!atomic_add_unless(&num_events, -1, 1)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_dec_return(&num_events) == 0) + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + +/* CPUMF <-> perf event mappings for kernel+userspace (basic set) */ +static const int cpumf_generic_events_basic[] = { + [PERF_COUNT_HW_CPU_CYCLES] = 0, + [PERF_COUNT_HW_INSTRUCTIONS] = 1, + [PERF_COUNT_HW_CACHE_REFERENCES] = -1, + [PERF_COUNT_HW_CACHE_MISSES] = -1, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = -1, + [PERF_COUNT_HW_BRANCH_MISSES] = -1, + [PERF_COUNT_HW_BUS_CYCLES] = -1, +}; +/* CPUMF <-> perf event mappings for userspace (problem-state set) */ +static const int cpumf_generic_events_user[] = { + [PERF_COUNT_HW_CPU_CYCLES] = 32, + [PERF_COUNT_HW_INSTRUCTIONS] = 33, + [PERF_COUNT_HW_CACHE_REFERENCES] = -1, + [PERF_COUNT_HW_CACHE_MISSES] = -1, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = -1, + [PERF_COUNT_HW_BRANCH_MISSES] = -1, + [PERF_COUNT_HW_BUS_CYCLES] = -1, +}; + +static int __hw_perf_event_init(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + int err; + u64 ev; + + switch (attr->type) { + case PERF_TYPE_RAW: + /* Raw events are used to access counters directly, + * hence do not permit excludes */ + if (attr->exclude_kernel || attr->exclude_user || + attr->exclude_hv) + return -EOPNOTSUPP; + ev = attr->config; + break; + + case PERF_TYPE_HARDWARE: + ev = attr->config; + /* Count user space (problem-state) only */ + if (!attr->exclude_user && attr->exclude_kernel) { + if (ev >= ARRAY_SIZE(cpumf_generic_events_user)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_user[ev]; + + /* No support for kernel space counters only */ + } else if (!attr->exclude_kernel && attr->exclude_user) { + return -EOPNOTSUPP; + + /* Count user and kernel space */ + } else { + if (ev >= ARRAY_SIZE(cpumf_generic_events_basic)) + return -EOPNOTSUPP; + ev = cpumf_generic_events_basic[ev]; + } + break; + + default: + return -ENOENT; + } + + if (ev == -1) + return -ENOENT; + + if (ev >= PERF_CPUM_CF_MAX_CTR) + return -EINVAL; + + /* Use the hardware perf event structure to store the counter number + * in 'config' member and the counter set to which the counter belongs + * in the 'config_base'. The counter set (config_base) is then used + * to enable/disable the counters. + */ + hwc->config = ev; + hwc->config_base = get_counter_set(ev); + + /* Validate the counter that is assigned to this event. + * Because the counter facility can use numerous counters at the + * same time without constraints, it is not necessary to explicity + * validate event groups (event->group_leader != event). + */ + err = validate_event(hwc); + if (err) + return err; + + /* Initialize for using the CPU-measurement counter facility */ + if (!atomic_inc_not_zero(&num_events)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&num_events) == 0 && reserve_pmc_hardware()) + err = -EBUSY; + else + atomic_inc(&num_events); + mutex_unlock(&pmc_reserve_mutex); + } + event->destroy = hw_perf_event_destroy; + + /* Finally, validate version and authorization of the counter set */ + err = validate_ctr_auth(hwc); + if (!err) + err = validate_ctr_version(hwc); + + return err; +} + +static int cpumf_pmu_event_init(struct perf_event *event) +{ + int err; + + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + case PERF_TYPE_RAW: + /* The CPU measurement counter facility does not have overflow + * interrupts to do sampling. Sampling must be provided by + * external means, for example, by timers. + */ + if (is_sampling_event(event)) + return -ENOENT; + err = __hw_perf_event_init(event); + break; + default: + return -ENOENT; + } + + if (unlikely(err) && event->destroy) + event->destroy(event); + + return err; +} + +static int hw_perf_event_reset(struct perf_event *event) +{ + u64 prev, new; + int err; + + do { + prev = local64_read(&event->hw.prev_count); + err = ecctr(event->hw.config, &new); + if (err) { + if (err != 3) + break; + /* The counter is not (yet) available. This + * might happen if the counter set to which + * this counter belongs is in the disabled + * state. + */ + new = 0; + } + } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); + + return err; +} + +static int hw_perf_event_update(struct perf_event *event) +{ + u64 prev, new, delta; + int err; + + do { + prev = local64_read(&event->hw.prev_count); + err = ecctr(event->hw.config, &new); + if (err) + goto out; + } while (local64_cmpxchg(&event->hw.prev_count, prev, new) != prev); + + delta = (prev <= new) ? new - prev + : (-1ULL - prev) + new + 1; /* overflow */ + local64_add(delta, &event->count); +out: + return err; +} + +static void cpumf_pmu_read(struct perf_event *event) +{ + if (event->hw.state & PERF_HES_STOPPED) + return; + + hw_perf_event_update(event); +} + +static void cpumf_pmu_start(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(hwc->config == -1)) + return; + + if (flags & PERF_EF_RELOAD) + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + + hwc->state = 0; + + /* (Re-)enable and activate the counter set */ + ctr_set_enable(&cpuhw->state, hwc->config_base); + ctr_set_start(&cpuhw->state, hwc->config_base); + + /* The counter set to which this counter belongs can be already active. + * Because all counters in a set are active, the event->hw.prev_count + * needs to be synchronized. At this point, the counter set can be in + * the inactive or disabled state. + */ + hw_perf_event_reset(event); + + /* increment refcount for this counter set */ + atomic_inc(&cpuhw->ctr_set[hwc->config_base]); +} + +static void cpumf_pmu_stop(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + if (!(hwc->state & PERF_HES_STOPPED)) { + /* Decrement reference count for this counter set and if this + * is the last used counter in the set, clear activation + * control and set the counter set state to inactive. + */ + if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base])) + ctr_set_stop(&cpuhw->state, hwc->config_base); + event->hw.state |= PERF_HES_STOPPED; + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + hw_perf_event_update(event); + event->hw.state |= PERF_HES_UPTODATE; + } +} + +static int cpumf_pmu_add(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + + /* Check authorization for the counter set to which this + * counter belongs. + * For group events transaction, the authorization check is + * done in cpumf_pmu_commit_txn(). + */ + if (!(cpuhw->flags & PERF_EVENT_TXN)) + if (validate_ctr_auth(&event->hw)) + return -EPERM; + + ctr_set_enable(&cpuhw->state, event->hw.config_base); + event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (flags & PERF_EF_START) + cpumf_pmu_start(event, PERF_EF_RELOAD); + + perf_event_update_userpage(event); + + return 0; +} + +static void cpumf_pmu_del(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + + cpumf_pmu_stop(event, PERF_EF_UPDATE); + + /* Check if any counter in the counter set is still used. If not used, + * change the counter set to the disabled state. This also clears the + * content of all counters in the set. + * + * When a new perf event has been added but not yet started, this can + * clear enable control and resets all counters in a set. Therefore, + * cpumf_pmu_start() always has to reenable a counter set. + */ + if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base])) + ctr_set_disable(&cpuhw->state, event->hw.config_base); + + perf_event_update_userpage(event); +} + +/* + * Start group events scheduling transaction. + * Set flags to perform a single test at commit time. + */ +static void cpumf_pmu_start_txn(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + + perf_pmu_disable(pmu); + cpuhw->flags |= PERF_EVENT_TXN; + cpuhw->tx_state = cpuhw->state; +} + +/* + * Stop and cancel a group events scheduling tranctions. + * Assumes cpumf_pmu_del() is called for each successful added + * cpumf_pmu_add() during the transaction. + */ +static void cpumf_pmu_cancel_txn(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + + WARN_ON(cpuhw->tx_state != cpuhw->state); + + cpuhw->flags &= ~PERF_EVENT_TXN; + perf_pmu_enable(pmu); +} + +/* + * Commit the group events scheduling transaction. On success, the + * transaction is closed. On error, the transaction is kept open + * until cpumf_pmu_cancel_txn() is called. + */ +static int cpumf_pmu_commit_txn(struct pmu *pmu) +{ + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + u64 state; + + /* check if the updated state can be scheduled */ + state = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); + state >>= CPUMF_LCCTL_ENABLE_SHIFT; + if ((state & cpuhw->info.auth_ctl) != state) + return -EPERM; + + cpuhw->flags &= ~PERF_EVENT_TXN; + perf_pmu_enable(pmu); + return 0; +} + +/* Performance monitoring unit for s390x */ +static struct pmu cpumf_pmu = { + .pmu_enable = cpumf_pmu_enable, + .pmu_disable = cpumf_pmu_disable, + .event_init = cpumf_pmu_event_init, + .add = cpumf_pmu_add, + .del = cpumf_pmu_del, + .start = cpumf_pmu_start, + .stop = cpumf_pmu_stop, + .read = cpumf_pmu_read, + .start_txn = cpumf_pmu_start_txn, + .commit_txn = cpumf_pmu_commit_txn, + .cancel_txn = cpumf_pmu_cancel_txn, +}; + +static int __cpuinit cpumf_pmu_notifier(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (long) hcpu; + int flags; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + flags = PMC_INIT; + smp_call_function_single(cpu, setup_pmc_cpu, &flags, 1); + break; + case CPU_DOWN_PREPARE: + flags = PMC_RELEASE; + smp_call_function_single(cpu, setup_pmc_cpu, &flags, 1); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static int __init cpumf_pmu_init(void) +{ + int rc; + + if (!cpum_cf_avail()) + return -ENODEV; + + /* clear bit 15 of cr0 to unauthorize problem-state to + * extract measurement counters */ + ctl_clear_bit(0, 48); + + /* register handler for measurement-alert interruptions */ + rc = register_external_interrupt(0x1407, cpumf_measurement_alert); + if (rc) { + pr_err("Registering for CPU-measurement alerts " + "failed with rc=%i\n", rc); + goto out; + } + + rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", PERF_TYPE_RAW); + if (rc) { + pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc); + unregister_external_interrupt(0x1407, cpumf_measurement_alert); + goto out; + } + perf_cpu_notifier(cpumf_pmu_notifier); +out: + return rc; +} +early_initcall(cpumf_pmu_init); diff -uprN linux-2.6.32/arch/s390/kernel/perf_event.c linux-2.6.32.ovz/arch/s390/kernel/perf_event.c --- linux-2.6.32/arch/s390/kernel/perf_event.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/perf_event.c 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,125 @@ +/* + * Performance event support for s390x + * + * Copyright IBM Corp. 2012 + * Author(s): Hendrik Brueckner + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License (version 2 only) + * as published by the Free Software Foundation. + */ +#define KMSG_COMPONENT "perf" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const char *perf_pmu_name(void) +{ + if (cpum_cf_avail() || cpum_sf_avail()) + return "CPU-measurement facilities (CPUMF)"; + return "pmu"; +} +EXPORT_SYMBOL(perf_pmu_name); + +int perf_num_counters(void) +{ + int num = 0; + + if (cpum_cf_avail()) + num += PERF_CPUM_CF_MAX_CTR; + + return num; +} +EXPORT_SYMBOL(perf_num_counters); + +void perf_event_print_debug(void) +{ + struct cpumf_ctr_info cf_info; + unsigned long flags; + int cpu; + + if (!cpum_cf_avail()) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + memset(&cf_info, 0, sizeof(cf_info)); + if (!qctri(&cf_info)) { + pr_info("CPU[%i] CPUM_CF: ver=%u.%u A=%04x E=%04x C=%04x\n", + cpu, cf_info.cfvn, cf_info.csvn, + cf_info.auth_ctl, cf_info.enable_ctl, cf_info.act_ctl); + print_hex_dump_bytes("CPUMF Query: ", DUMP_PREFIX_OFFSET, + &cf_info, sizeof(cf_info)); + } + + local_irq_restore(flags); +} + +/* See also arch/s390/kernel/traps.c */ +static unsigned long __store_trace(struct perf_callchain_entry *entry, + unsigned long sp, + unsigned long low, unsigned long high) +{ + struct stack_frame *sf; + struct pt_regs *regs; + + while (1) { + sp = sp & PSW_ADDR_INSN; + if (sp < low || sp > high - sizeof(*sf)) + return sp; + sf = (struct stack_frame *) sp; + perf_callchain_store(entry, sf->gprs[8] & PSW_ADDR_INSN); + /* Follow the backchain. */ + while (1) { + low = sp; + sp = sf->back_chain & PSW_ADDR_INSN; + if (!sp) + break; + if (sp <= low || sp > high - sizeof(*sf)) + return sp; + sf = (struct stack_frame *) sp; + perf_callchain_store(entry, + sf->gprs[8] & PSW_ADDR_INSN); + } + /* Zero backchain detected, check for interrupt frame. */ + sp = (unsigned long) (sf + 1); + if (sp <= low || sp > high - sizeof(*regs)) + return sp; + regs = (struct pt_regs *) sp; + perf_callchain_store(entry, sf->gprs[8] & PSW_ADDR_INSN); + low = sp; + sp = regs->gprs[15]; + } +} + +void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ + unsigned long head; + struct stack_frame *head_sf; + + if (user_mode(regs)) + return; + + head = regs->gprs[15]; + head_sf = (struct stack_frame *) head; + + if (!head_sf || !head_sf->back_chain) + return; + + head = head_sf->back_chain; + head = __store_trace(entry, head, S390_lowcore.async_stack - ASYNC_SIZE, + S390_lowcore.async_stack); + + __store_trace(entry, head, S390_lowcore.thread_info, + S390_lowcore.thread_info + THREAD_SIZE); +} diff -uprN linux-2.6.32/arch/s390/kernel/process.c linux-2.6.32.ovz/arch/s390/kernel/process.c --- linux-2.6.32/arch/s390/kernel/process.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/process.c 2015-03-10 13:24:04.000000000 -0700 @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include #include @@ -41,6 +43,7 @@ #include #include #include +#include #include "entry.h" asmlinkage void ret_from_fork(void) asm ("ret_from_fork"); @@ -149,6 +152,7 @@ EXPORT_SYMBOL(kernel_thread); */ void exit_thread(void) { + exit_thread_runtime_instr(); } void flush_thread(void) @@ -189,12 +193,18 @@ int copy_thread(unsigned long clone_flag /* Save access registers to new thread structure. */ save_access_regs(&p->thread.acrs[0]); + /* Don't copy runtime instrumentation info */ + p->thread.ri_cb = NULL; + p->thread.ri_signum = 0; + frame->childregs.psw.mask &= ~PSW_MASK_RI; + #ifndef CONFIG_64BIT /* * save fprs to current->thread.fp_regs to merge them with * the emulated registers and then copy the result to the child. */ - save_fp_regs(¤t->thread.fp_regs); + save_fp_ctl(¤t->thread.fp_regs.fpc); + save_fp_regs(current->thread.fp_regs.fprs); memcpy(&p->thread.fp_regs, ¤t->thread.fp_regs, sizeof(s390_fp_regs)); /* Set a new TLS ? */ @@ -202,7 +212,9 @@ int copy_thread(unsigned long clone_flag p->thread.acrs[0] = regs->gprs[6]; #else /* CONFIG_64BIT */ /* Save the fpu registers to new thread structure. */ - save_fp_regs(&p->thread.fp_regs); + save_fp_ctl(&p->thread.fp_regs.fpc); + save_fp_regs(p->thread.fp_regs.fprs); + p->thread.fp_regs.pad = 0; /* Set a new TLS ? */ if (clone_flags & CLONE_SETTLS) { if (is_compat_task()) { @@ -272,14 +284,14 @@ SYSCALL_DEFINE3(execve, char __user *, n char __user * __user *, envp) { struct pt_regs *regs = task_pt_regs(current); - char *filename; + struct filename *filename; long rc; filename = getname(name); rc = PTR_ERR(filename); if (IS_ERR(filename)) return rc; - rc = do_execve(filename, argv, envp, regs); + rc = do_execve(filename->name, argv, envp, regs); if (rc) goto out; execve_tail(); @@ -299,10 +311,12 @@ int dump_fpu (struct pt_regs * regs, s39 * save fprs to current->thread.fp_regs to merge them with * the emulated registers and then copy the result to the dump. */ - save_fp_regs(¤t->thread.fp_regs); + save_fp_ctl(¤t->thread.fp_regs.fpc); + save_fp_regs(current->thread.fp_regs.fprs); memcpy(fpregs, ¤t->thread.fp_regs, sizeof(s390_fp_regs)); #else /* CONFIG_64BIT */ - save_fp_regs(fpregs); + save_fp_ctl(&fpregs->fpc); + save_fp_regs(fpregs->fprs); #endif /* CONFIG_64BIT */ return 1; } @@ -331,3 +345,32 @@ unsigned long get_wchan(struct task_stru } return 0; } + +static inline unsigned long brk_rnd(void) +{ + /* 8MB for 32bit, 1GB for 64bit */ + if (is_32bit_task()) + return (get_random_int() & 0x7ffUL) << PAGE_SHIFT; + else + return (get_random_int() & 0x3ffffUL) << PAGE_SHIFT; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long ret = PAGE_ALIGN(mm->brk + brk_rnd()); + + if (ret < mm->brk) + return mm->brk; + return ret; +} + +unsigned long randomize_et_dyn(unsigned long base) +{ + unsigned long ret = PAGE_ALIGN(base + brk_rnd()); + + if (!(current->flags & PF_RANDOMIZE)) + return base; + if (ret < base) + return base; + return ret; +} diff -uprN linux-2.6.32/arch/s390/kernel/processor.c linux-2.6.32.ovz/arch/s390/kernel/processor.c --- linux-2.6.32/arch/s390/kernel/processor.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/processor.c 2015-03-10 13:23:48.000000000 -0700 @@ -31,9 +31,9 @@ void __cpuinit print_cpu_info(void) static int show_cpuinfo(struct seq_file *m, void *v) { - static const char *hwcap_str[10] = { + static const char *hwcap_str[11] = { "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp", - "edat", "etf3eh", "highgprs" + "edat", "etf3eh", "highgprs", "te" }; struct _lowcore *lc; unsigned long n = (unsigned long) v - 1; @@ -48,7 +48,7 @@ static int show_cpuinfo(struct seq_file num_online_cpus(), loops_per_jiffy/(500000/HZ), (loops_per_jiffy/(5000/HZ))%100); seq_puts(m, "features\t: "); - for (i = 0; i < 10; i++) + for (i = 0; i < 11; i++) if (hwcap_str[i] && (elf_hwcap & (1UL << i))) seq_printf(m, "%s ", hwcap_str[i]); seq_puts(m, "\n"); diff -uprN linux-2.6.32/arch/s390/kernel/ptrace.c linux-2.6.32.ovz/arch/s390/kernel/ptrace.c --- linux-2.6.32/arch/s390/kernel/ptrace.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/ptrace.c 2015-03-10 13:24:04.000000000 -0700 @@ -57,16 +57,48 @@ enum s390_regset { REGSET_GENERAL, REGSET_FP, + REGSET_LAST_BREAK, + REGSET_TDB, REGSET_GENERAL_EXTENDED, }; -static void -FixPerRegisters(struct task_struct *task) +void update_per_regs(struct task_struct *task) +{ + struct pt_regs *regs = task_pt_regs(task); + per_struct *per_info = (per_struct *) &task->thread.per_info; + per_cr_words cr_words; + + /* Take care of the enable/disable of transactional execution. */ + if (MACHINE_HAS_TE) { + unsigned long cr0, cr0_new; + + __ctl_store(cr0, 0, 0); + /* set or clear transaction execution bit 8. */ + if (task->thread.per_flags & PER_FLAG_NO_TE) + cr0_new = cr0 & ~(1UL << 55); + else + cr0_new = cr0 | (1UL << 55); + /* Only load control register 0 if necessary. */ + if (cr0 != cr0_new) + __ctl_load(cr0_new, 0, 0); + } + + /* Take care of the PER enablement bit in the PSW. */ + if (!(per_info->control_regs.words.cr[0] & PER_EM_MASK)) { + regs->psw.mask &= ~PSW_MASK_PER; + return; + } + regs->psw.mask |= PSW_MASK_PER; + __ctl_store(cr_words, 9, 11); + if (memcmp(&cr_words, &per_info->control_regs.words, + sizeof(cr_words)) != 0) + __ctl_load(per_info->control_regs.words, 9, 11); +} + +static void FixPerRegisters(struct task_struct *task) { - struct pt_regs *regs; per_struct *per_info; - regs = task_pt_regs(task); per_info = (per_struct *) &task->thread.per_info; per_info->control_regs.bits.em_instruction_fetch = per_info->single_step | per_info->instruction_fetch; @@ -79,25 +111,24 @@ FixPerRegisters(struct task_struct *task else #endif per_info->control_regs.bits.ending_addr = PSW_ADDR_INSN; +#ifdef CONFIG_64_BIT + per_info->control_regs.suspension_ctl = 1; + per_info->control_regs.transaction_end = 1; +#endif } else { per_info->control_regs.bits.starting_addr = per_info->starting_addr; per_info->control_regs.bits.ending_addr = per_info->ending_addr; } - /* - * if any of the control reg tracing bits are on - * we switch on per in the psw - */ - if (per_info->control_regs.words.cr[0] & PER_EM_MASK) - regs->psw.mask |= PSW_MASK_PER; - else - regs->psw.mask &= ~PSW_MASK_PER; if (per_info->control_regs.bits.em_storage_alteration) per_info->control_regs.bits.storage_alt_space_ctl = 1; else per_info->control_regs.bits.storage_alt_space_ctl = 0; + + if (task == current) + update_per_regs(task); } void user_enable_single_step(struct task_struct *task) @@ -121,6 +152,7 @@ void ptrace_disable(struct task_struct *child) { /* make sure the single step bit is not set. */ + child->thread.per_flags = 0; user_disable_single_step(child); } @@ -190,8 +222,7 @@ static unsigned long __peek_user(struct offset = addr - (addr_t) &dummy->regs.fp_regs; tmp = *(addr_t *)((addr_t) &child->thread.fp_regs + offset); if (addr == (addr_t) &dummy->regs.fp_regs.fpc) - tmp &= (unsigned long) FPC_VALID_MASK - << (BITS_PER_LONG - 32); + tmp <<= BITS_PER_LONG - 32; } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) { /* @@ -293,10 +324,10 @@ static int __poke_user(struct task_struc /* * floating point regs. are stored in the thread structure */ - if (addr == (addr_t) &dummy->regs.fp_regs.fpc && - (data & ~((unsigned long) FPC_VALID_MASK - << (BITS_PER_LONG - 32))) != 0) - return -EINVAL; + if (addr == (addr_t) &dummy->regs.fp_regs.fpc) + if ((unsigned int) data != 0 || + test_fp_ctl(data >> (BITS_PER_LONG - 32))) + return -EINVAL; offset = addr - (addr_t) &dummy->regs.fp_regs; *(addr_t *)((addr_t) &child->thread.fp_regs + offset) = data; @@ -373,6 +404,20 @@ long arch_ptrace(struct task_struct *chi copied += sizeof(unsigned long); } return 0; + case PTRACE_GET_LAST_BREAK: + put_user(task_thread_info(child)->last_break, + (unsigned long __user *) data); + return 0; + case PTRACE_ENABLE_TE: + if (!MACHINE_HAS_TE) + return -EIO; + child->thread.per_flags &= ~PER_FLAG_NO_TE; + return 0; + case PTRACE_DISABLE_TE: + if (!MACHINE_HAS_TE) + return -EIO; + child->thread.per_flags |= PER_FLAG_NO_TE; + return 0; default: /* Removing high order bit from addr (only for 31 bit). */ addr &= PSW_ADDR_INSN; @@ -539,8 +584,7 @@ static int __poke_user_compat(struct tas * floating point regs. are stored in the thread structure */ if (addr == (addr_t) &dummy32->regs.fp_regs.fpc && - (tmp & ~FPC_VALID_MASK) != 0) - /* Invalid floating point control. */ + test_fp_ctl(tmp)) return -EINVAL; offset = addr - (addr_t) &dummy32->regs.fp_regs; *(__u32 *)((addr_t) &child->thread.fp_regs + offset) = tmp; @@ -625,6 +669,10 @@ long compat_arch_ptrace(struct task_stru copied += sizeof(unsigned int); } return 0; + case PTRACE_GET_LAST_BREAK: + put_user(task_thread_info(child)->last_break, + (unsigned int __user *) data); + return 0; } return compat_ptrace_request(child, request, addr, data); } @@ -632,7 +680,7 @@ long compat_arch_ptrace(struct task_stru asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) { - long ret; + long ret = 0; /* Do the secure computing check first. */ secure_computing(regs->gprs[2]); @@ -641,7 +689,6 @@ asmlinkage long do_syscall_trace_enter(s * The sysc_tracesys code in entry.S stored the system * call number to gprs[2]. */ - ret = regs->gprs[2]; if (test_thread_flag(TIF_SYSCALL_TRACE) && (tracehook_report_syscall_entry(regs) || regs->gprs[2] >= NR_syscalls)) { @@ -663,14 +710,12 @@ asmlinkage long do_syscall_trace_enter(s regs->gprs[2], regs->orig_gpr2, regs->gprs[3], regs->gprs[4], regs->gprs[5]); - return ret; + return ret ?: regs->gprs[2]; } asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) { - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->gprs[2]), - regs->gprs[2]); + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->gprs[2]); @@ -750,8 +795,10 @@ static int s390_fpregs_get(struct task_s const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - if (target == current) - save_fp_regs(&target->thread.fp_regs); + if (target == current) { + save_fp_ctl(&target->thread.fp_regs.fpc); + save_fp_regs(target->thread.fp_regs.fprs); + } return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.fp_regs, 0, -1); @@ -764,19 +811,21 @@ static int s390_fpregs_set(struct task_s { int rc = 0; - if (target == current) - save_fp_regs(&target->thread.fp_regs); + if (target == current) { + save_fp_ctl(&target->thread.fp_regs.fpc); + save_fp_regs(target->thread.fp_regs.fprs); + } /* If setting FPC, must validate it first. */ if (count > 0 && pos < offsetof(s390_fp_regs, fprs)) { - u32 fpc[2] = { target->thread.fp_regs.fpc, 0 }; - rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &fpc, + u32 ufpc[2] = { target->thread.fp_regs.fpc, 0 }; + rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ufpc, 0, offsetof(s390_fp_regs, fprs)); if (rc) return rc; - if ((fpc[0] & ~FPC_VALID_MASK) != 0 || fpc[1] != 0) + if (ufpc[1] != 0 || test_fp_ctl(ufpc[0])) return -EINVAL; - target->thread.fp_regs.fpc = fpc[0]; + target->thread.fp_regs.fpc = ufpc[0]; } if (rc == 0 && count > 0) @@ -784,12 +833,55 @@ static int s390_fpregs_set(struct task_s target->thread.fp_regs.fprs, offsetof(s390_fp_regs, fprs), -1); - if (rc == 0 && target == current) - restore_fp_regs(&target->thread.fp_regs); + if (rc == 0 && target == current) { + restore_fp_ctl(&target->thread.fp_regs.fpc); + restore_fp_regs(target->thread.fp_regs.fprs); + } return rc; } +#ifdef CONFIG_64BIT + +static int s390_last_break_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (count > 0) { + if (kbuf) { + unsigned long *k = kbuf; + *k = task_thread_info(target)->last_break; + } else { + unsigned long __user *u = ubuf; + if (__put_user(task_thread_info(target)->last_break, u)) + return -EFAULT; + } + } + return 0; +} + +static int s390_tdb_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + unsigned char *data; + + data = target->thread.trap_tdb; + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, data, 0, 256); +} + +static int s390_tdb_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + return 0; +} + +#endif + static const struct user_regset s390_regsets[] = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, @@ -807,6 +899,23 @@ static const struct user_regset s390_reg .get = s390_fpregs_get, .set = s390_fpregs_set, }, +#ifdef CONFIG_64BIT + [REGSET_LAST_BREAK] = { + .core_note_type = NT_S390_LAST_BREAK, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .get = s390_last_break_get, + }, + [REGSET_TDB] = { + .core_note_type = NT_S390_TDB, + .n = 1, + .size = 256, + .align = 1, + .get = s390_tdb_get, + .set = s390_tdb_set, + }, +#endif }; static const struct user_regset_view user_s390_view = { @@ -941,6 +1050,27 @@ static int s390_compat_regs_high_set(str return rc; } +static int s390_compat_last_break_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + compat_ulong_t last_break; + + if (count > 0) { + last_break = task_thread_info(target)->last_break; + if (kbuf) { + unsigned long *k = kbuf; + *k = last_break; + } else { + unsigned long __user *u = ubuf; + if (__put_user(last_break, u)) + return -EFAULT; + } + } + return 0; +} + static const struct user_regset s390_compat_regsets[] = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, @@ -958,6 +1088,21 @@ static const struct user_regset s390_com .get = s390_fpregs_get, .set = s390_fpregs_set, }, + [REGSET_LAST_BREAK] = { + .core_note_type = NT_S390_LAST_BREAK, + .n = 1, + .size = sizeof(long), + .align = sizeof(long), + .get = s390_compat_last_break_get, + }, + [REGSET_TDB] = { + .core_note_type = NT_S390_TDB, + .n = 1, + .size = 256, + .align = 1, + .get = s390_tdb_get, + .set = s390_tdb_set, + }, [REGSET_GENERAL_EXTENDED] = { .core_note_type = NT_PRXSTATUS, .n = sizeof(s390_compat_regs_high) / sizeof(compat_long_t), diff -uprN linux-2.6.32/arch/s390/kernel/reipl64.S linux-2.6.32.ovz/arch/s390/kernel/reipl64.S --- linux-2.6.32/arch/s390/kernel/reipl64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/reipl64.S 2015-03-10 13:23:04.000000000 -0700 @@ -1,35 +1,85 @@ /* - * Copyright IBM Corp 2000,2009 + * Copyright IBM Corp 2000,2011 * Author(s): Holger Smolinski , * Denis Joseph Barrow, */ +#include +#include #include +#define SAVE_AREA_BASE 0x1200 + +# +# store_status +# +# Prerequisites to run this function: +# - Prefix register is set to zero +# - Original prefix register is stored in "dump_prefix_page" +# - Lowcore protection is off +# + .globl store_status +store_status: + /* Save register one and load save area base */ + stg %r1,__LC_SAVE_AREA+120(%r0) + lghi %r1,SAVE_AREA_BASE + /* General purpose registers */ + stmg %r0,%r15,__LC_GPREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + lg %r2,__LC_SAVE_AREA+120(%r0) + stg %r2,__LC_GPREGS_SAVE_AREA-SAVE_AREA_BASE+8(%r1) + /* Control registers */ + stctg %c0,%c15,__LC_CREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + /* Access registers */ + stam %a0,%a15,__LC_AREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + /* Floating point registers */ + std %f0, 0x00 + __LC_FPREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + std %f1, 0x08 + __LC_FPREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + std %f2, 0x10 + __LC_FPREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + std %f3, 0x18 + __LC_FPREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + std %f4, 0x20 + __LC_FPREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + std %f5, 0x28 + __LC_FPREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + std %f6, 0x30 + __LC_FPREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + std %f7, 0x38 + __LC_FPREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + std %f8, 0x40 + __LC_FPREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + std %f9, 0x48 + __LC_FPREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + std %f10,0x50 + __LC_FPREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + std %f11,0x58 + __LC_FPREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + std %f12,0x60 + __LC_FPREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + std %f13,0x68 + __LC_FPREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + std %f14,0x70 + __LC_FPREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + std %f15,0x78 + __LC_FPREGS_SAVE_AREA-SAVE_AREA_BASE(%r1) + /* Floating point control register */ + stfpc __LC_FP_CREG_SAVE_AREA-SAVE_AREA_BASE(%r1) + /* CPU timer */ + stpt __LC_CPU_TIMER_SAVE_AREA-SAVE_AREA_BASE(%r1) + /* Saved prefix register */ + larl %r2,dump_prefix_page + mvc __LC_PREFIX_SAVE_AREA-SAVE_AREA_BASE(4,%r1),0(%r2) + /* Clock comparator - seven bytes */ + larl %r2,.Lclkcmp + stckc 0(%r2) + mvc __LC_CLOCK_COMP_SAVE_AREA-SAVE_AREA_BASE + 1(7,%r1),1(%r2) + /* Program status word */ + epsw %r2,%r3 + st %r2,__LC_PSW_SAVE_AREA-SAVE_AREA_BASE + 0(%r1) + st %r3,__LC_PSW_SAVE_AREA-SAVE_AREA_BASE + 4(%r1) + larl %r2,store_status + stg %r2,__LC_PSW_SAVE_AREA-SAVE_AREA_BASE + 8(%r1) + br %r14 + + .section .bss + .align 8 +.Lclkcmp: .quad 0x0000000000000000 + .previous # # do_reipl_asm # Parameter: r2 = schid of reipl device # - .globl do_reipl_asm -do_reipl_asm: basr %r13,0 +ENTRY(do_reipl_asm) + basr %r13,0 .Lpg0: lpswe .Lnewpsw-.Lpg0(%r13) -.Lpg1: # do store status of all registers - - stg %r1,.Lregsave-.Lpg0(%r13) - lghi %r1,0x1000 - stmg %r0,%r15,__LC_GPREGS_SAVE_AREA-0x1000(%r1) - lg %r0,.Lregsave-.Lpg0(%r13) - stg %r0,__LC_GPREGS_SAVE_AREA-0x1000+8(%r1) - stctg %c0,%c15,__LC_CREGS_SAVE_AREA-0x1000(%r1) - stam %a0,%a15,__LC_AREGS_SAVE_AREA-0x1000(%r1) - lg %r10,.Ldump_pfx-.Lpg0(%r13) - mvc __LC_PREFIX_SAVE_AREA-0x1000(4,%r1),0(%r10) - stfpc __LC_FP_CREG_SAVE_AREA-0x1000(%r1) - stckc .Lclkcmp-.Lpg0(%r13) - mvc __LC_CLOCK_COMP_SAVE_AREA-0x1000(7,%r1),.Lclkcmp-.Lpg0(%r13) - stpt __LC_CPU_TIMER_SAVE_AREA-0x1000(%r1) - stg %r13, __LC_PSW_SAVE_AREA-0x1000+8(%r1) +.Lpg1: brasl %r14,store_status lctlg %c6,%c6,.Lall-.Lpg0(%r13) lgr %r1,%r2 @@ -66,10 +116,7 @@ do_reipl_asm: basr %r13,0 st %r14,.Ldispsw+12-.Lpg0(%r13) lpswe .Ldispsw-.Lpg0(%r13) .align 8 -.Lclkcmp: .quad 0x0000000000000000 .Lall: .quad 0x00000000ff000000 -.Ldump_pfx: .quad dump_prefix_page -.Lregsave: .quad 0x0000000000000000 .align 16 /* * These addresses have to be 31 bit otherwise diff -uprN linux-2.6.32/arch/s390/kernel/reipl.S linux-2.6.32.ovz/arch/s390/kernel/reipl.S --- linux-2.6.32/arch/s390/kernel/reipl.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/reipl.S 2015-03-10 13:23:04.000000000 -0700 @@ -9,6 +9,12 @@ #include # +# store_status: Empty implementation until kdump is supported on 31 bit +# +ENTRY(store_status) + br %r14 + +# # do_reipl_asm # Parameter: r2 = schid of reipl device # diff -uprN linux-2.6.32/arch/s390/kernel/runtime_instr.c linux-2.6.32.ovz/arch/s390/kernel/runtime_instr.c --- linux-2.6.32/arch/s390/kernel/runtime_instr.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/runtime_instr.c 2015-03-10 13:23:48.000000000 -0700 @@ -0,0 +1,156 @@ +/* + * Copyright IBM Corp. 2012 + * Author(s): Jan Glauber + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* empty control block to disable RI by loading it */ +struct runtime_instr_cb runtime_instr_empty_cb; + +static int runtime_instr_avail(void) +{ + unsigned long long facility_bits[2]; + + if (stfle(facility_bits, 2) <= 0) + return 0; + if (facility_bits[1] & (1ULL << 63)) + return 1; + else + return 0; +} + +static void disable_runtime_instr(void) +{ + struct pt_regs *regs = task_pt_regs(current); + + load_runtime_instr_cb(&runtime_instr_empty_cb); + + /* + * Make sure the RI bit is deleted from the PSW. If the user did not + * switch off RI before the system call the process will get a + * specification exception otherwise. + */ + regs->psw.mask &= ~PSW_MASK_RI; +} + +static void init_runtime_instr_cb(struct runtime_instr_cb *cb) +{ + cb->buf_limit = 0xfff; + if (user_mode == HOME_SPACE_MODE) + cb->home_space = 1; + cb->int_requested = 1; + cb->pstate = 1; + cb->pstate_set_buf = 1; + cb->pstate_sample = 1; + cb->pstate_collect = 1; + cb->key = PAGE_DEFAULT_KEY; + cb->valid = 1; +} + +void exit_thread_runtime_instr(void) +{ + struct task_struct *task = current; + + if (!task->thread.ri_cb) + return; + disable_runtime_instr(); + kfree(task->thread.ri_cb); + task->thread.ri_signum = 0; + task->thread.ri_cb = NULL; +} + +static void runtime_instr_int_handler(__u16 ext_code) +{ + struct siginfo info; + + if (!(S390_lowcore.ext_params & CPU_MF_INT_RI_MASK)) + return; + + if (!current->thread.ri_cb) + return; + if (current->thread.ri_signum < SIGRTMIN || + current->thread.ri_signum > SIGRTMAX) { + WARN_ON_ONCE(1); + return; + } + + memset(&info, 0, sizeof(info)); + info.si_signo = current->thread.ri_signum; + info.si_code = SI_QUEUE; + if (S390_lowcore.ext_params & CPU_MF_INT_RI_BUF_FULL) + info.si_int = ENOBUFS; + else if (S390_lowcore.ext_params & CPU_MF_INT_RI_HALTED) + info.si_int = ECANCELED; + else + /* unknown reason */ + return; + + send_sig_info(current->thread.ri_signum, &info, current); +} + +SYSCALL_DEFINE2(s390_runtime_instr, int, command, int, signum) +{ + struct runtime_instr_cb *cb; + + if (!runtime_instr_avail()) + return -EOPNOTSUPP; + + if (command == S390_RUNTIME_INSTR_STOP) { + preempt_disable(); + exit_thread_runtime_instr(); + preempt_enable(); + return 0; + } + + if (command != S390_RUNTIME_INSTR_START || + (signum < SIGRTMIN || signum > SIGRTMAX)) + return -EINVAL; + + if (!current->thread.ri_cb) { + cb = kzalloc(sizeof(*cb), GFP_KERNEL); + if (!cb) + return -ENOMEM; + } else { + cb = current->thread.ri_cb; + memset(cb, 0, sizeof(*cb)); + } + + init_runtime_instr_cb(cb); + current->thread.ri_signum = signum; + + /* now load the control block to make it available */ + preempt_disable(); + current->thread.ri_cb = cb; + load_runtime_instr_cb(cb); + preempt_enable(); + return 0; +} + +static int __init runtime_instr_init(void) +{ + int rc; + + if (!runtime_instr_avail()) + return 0; + + measurement_alert_subclass_register(); + rc = register_external_interrupt(0x1407, runtime_instr_int_handler); + if (rc) + measurement_alert_subclass_unregister(); + else + pr_info("Runtime instrumentation facility initialized\n"); + return rc; +} +device_initcall(runtime_instr_init); diff -uprN linux-2.6.32/arch/s390/kernel/s390_ext.c linux-2.6.32.ovz/arch/s390/kernel/s390_ext.c --- linux-2.6.32/arch/s390/kernel/s390_ext.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/s390_ext.c 2015-03-10 13:23:55.000000000 -0700 @@ -125,7 +125,9 @@ void __irq_entry do_extint(struct pt_reg if (S390_lowcore.int_clock >= S390_lowcore.clock_comparator) /* Serve timer interrupts first. */ clock_comparator_work(); - kstat_cpu(smp_processor_id()).irqs[EXTERNAL_INTERRUPT]++; + kstat_incr_irqs_this_cpu(EXTERNAL_INTERRUPT, NULL); + if (code != 0x1004) + __get_cpu_var(s390_idle).nohz_delay = 1; index = ext_hash(code); for (p = ext_int_hash[index]; p; p = p->next) { if (likely(p->code == code)) @@ -137,3 +139,26 @@ void __irq_entry do_extint(struct pt_reg EXPORT_SYMBOL(register_external_interrupt); EXPORT_SYMBOL(unregister_external_interrupt); + +static DEFINE_SPINLOCK(ma_subclass_lock); +static int ma_subclass_refcount; + +void measurement_alert_subclass_register(void) +{ + spin_lock(&ma_subclass_lock); + if (!ma_subclass_refcount) + ctl_set_bit(0, 5); + ma_subclass_refcount++; + spin_unlock(&ma_subclass_lock); +} +EXPORT_SYMBOL(measurement_alert_subclass_register); + +void measurement_alert_subclass_unregister(void) +{ + spin_lock(&ma_subclass_lock); + ma_subclass_refcount--; + if (!ma_subclass_refcount) + ctl_clear_bit(0, 5); + spin_unlock(&ma_subclass_lock); +} +EXPORT_SYMBOL(measurement_alert_subclass_unregister); diff -uprN linux-2.6.32/arch/s390/kernel/setup.c linux-2.6.32.ovz/arch/s390/kernel/setup.c --- linux-2.6.32/arch/s390/kernel/setup.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/setup.c 2015-03-10 13:23:58.000000000 -0700 @@ -2,7 +2,7 @@ * arch/s390/kernel/setup.c * * S390 version - * Copyright (C) 1999,2000 IBM Deutschland Entwicklung GmbH, IBM Corporation + * Copyright (C) IBM Corp. 1999,2010 * Author(s): Hartmut Penner (hp@de.ibm.com), * Martin Schwidefsky (schwidefsky@de.ibm.com) * @@ -43,6 +43,9 @@ #include #include #include +#include +#include +#include #include #include @@ -58,6 +61,7 @@ #include #include #include +#include long psw_kernel_bits = (PSW_BASE_BITS | PSW_MASK_DAT | PSW_ASC_PRIMARY | PSW_MASK_MCHECK | PSW_DEFAULT_KEY); @@ -87,11 +91,19 @@ unsigned long elf_hwcap = 0; char elf_platform[ELF_PLATFORM_SIZE]; struct mem_chunk __initdata memory_chunk[MEMORY_CHUNKS]; -volatile int __cpu_logical_map[NR_CPUS]; /* logical cpu to cpu address */ int __initdata memory_end_set; unsigned long __initdata memory_end; +unsigned long VMALLOC_START; +EXPORT_SYMBOL(VMALLOC_START); + +unsigned long VMALLOC_END; +EXPORT_SYMBOL(VMALLOC_END); + +struct page *vmemmap; +EXPORT_SYMBOL(vmemmap); + /* An array with a pointer to the lowcore of every CPU. */ struct _lowcore *lowcore_ptr[NR_CPUS]; EXPORT_SYMBOL(lowcore_ptr); @@ -119,6 +131,8 @@ static struct resource data_resource = { */ void __cpuinit cpu_init(void) { + struct s390_idle_data *idle = &__get_cpu_var(s390_idle); + /* * Store processor id in lowcore (used e.g. in timer_interrupt) */ @@ -134,6 +148,7 @@ void __cpuinit cpu_init(void) current->active_mm = &init_mm; BUG_ON(current->mm); enter_lazy_tlb(&init_mm, current); + memset(idle, 0, sizeof(*idle)); } /* @@ -241,6 +256,8 @@ static void __init setup_zfcpdump(unsign if (ipl_info.type != IPL_TYPE_FCP_DUMP) return; + if (OLDMEM_BASE) + return; if (console_devno != -1) sprintf(str, " cio_ignore=all,!0.0.%04x,!0.0.%04x", ipl_info.data.fcp.dev_id.devno, console_devno); @@ -305,9 +322,17 @@ static int __init early_parse_mem(char * } early_param("mem", early_parse_mem); -#ifdef CONFIG_S390_SWITCH_AMODE -unsigned int switch_amode = 0; -EXPORT_SYMBOL_GPL(switch_amode); +static int __init parse_vmalloc(char *arg) +{ + if (!arg) + return -EINVAL; + VMALLOC_END = (memparse(arg, &arg) + PAGE_SIZE - 1) & PAGE_MASK; + return 0; +} +early_param("vmalloc", parse_vmalloc); + +unsigned int user_mode = HOME_SPACE_MODE; +EXPORT_SYMBOL_GPL(user_mode); static int set_amode_and_uaccess(unsigned long user_amode, unsigned long user32_amode) @@ -340,23 +365,29 @@ static int set_amode_and_uaccess(unsigne */ static int __init early_parse_switch_amode(char *p) { - switch_amode = 1; + if (user_mode != SECONDARY_SPACE_MODE) + user_mode = PRIMARY_SPACE_MODE; return 0; } early_param("switch_amode", early_parse_switch_amode); -#else /* CONFIG_S390_SWITCH_AMODE */ -static inline int set_amode_and_uaccess(unsigned long user_amode, - unsigned long user32_amode) +static int __init early_parse_user_mode(char *p) { + if (p && strcmp(p, "primary") == 0) + user_mode = PRIMARY_SPACE_MODE; +#ifdef CONFIG_S390_EXEC_PROTECT + else if (p && strcmp(p, "secondary") == 0) + user_mode = SECONDARY_SPACE_MODE; +#endif + else if (!p || strcmp(p, "home") == 0) + user_mode = HOME_SPACE_MODE; + else + return 1; return 0; } -#endif /* CONFIG_S390_SWITCH_AMODE */ +early_param("user_mode", early_parse_user_mode); #ifdef CONFIG_S390_EXEC_PROTECT -unsigned int s390_noexec = 0; -EXPORT_SYMBOL_GPL(s390_noexec); - /* * Enable execute protection? */ @@ -364,8 +395,7 @@ static int __init early_parse_noexec(cha { if (!strncmp(p, "off", 3)) return 0; - switch_amode = 1; - s390_noexec = 1; + user_mode = SECONDARY_SPACE_MODE; return 0; } early_param("noexec", early_parse_noexec); @@ -373,7 +403,7 @@ early_param("noexec", early_parse_noexec static void setup_addressing_mode(void) { - if (s390_noexec) { + if (user_mode == SECONDARY_SPACE_MODE) { if (set_amode_and_uaccess(PSW_ASC_SECONDARY, PSW32_ASC_SECONDARY)) pr_info("Execute protection active, " @@ -381,7 +411,7 @@ static void setup_addressing_mode(void) else pr_info("Execute protection active, " "mvcos not available\n"); - } else if (switch_amode) { + } else if (user_mode == PRIMARY_SPACE_MODE) { if (set_amode_and_uaccess(PSW_ASC_PRIMARY, PSW32_ASC_PRIMARY)) pr_info("Address spaces switched, " "mvcos available\n"); @@ -389,10 +419,6 @@ static void setup_addressing_mode(void) pr_info("Address spaces switched, " "mvcos not available\n"); } -#ifdef CONFIG_TRACE_IRQFLAGS - sysc_restore_trace_psw.mask = psw_kernel_bits & ~PSW_MASK_MCHECK; - io_restore_trace_psw.mask = psw_kernel_bits & ~PSW_MASK_MCHECK; -#endif } static void __init @@ -410,8 +436,8 @@ setup_lowcore(void) memset(lc, 0, lc_pages * PAGE_SIZE); lc->restart_psw.mask = PSW_BASE_BITS | PSW_DEFAULT_KEY; lc->restart_psw.addr = - PSW_ADDR_AMODE | (unsigned long) restart_int_handler; - if (switch_amode) + PSW_ADDR_AMODE | (unsigned long) psw_restart_int_handler; + if (user_mode != HOME_SPACE_MODE) lc->restart_psw.mask |= PSW_ASC_HOME; lc->external_new_psw.mask = psw_kernel_bits; lc->external_new_psw.addr = @@ -444,6 +470,7 @@ setup_lowcore(void) __ctl_set_bit(14, 29); } #else + lc->cmf_hpp = -1ULL; lc->vdso_per_cpu_data = (unsigned long) &lc->paste[0]; #endif lc->sync_enter_timer = S390_lowcore.sync_enter_timer; @@ -473,10 +500,14 @@ setup_resources(void) for (i = 0; i < MEMORY_CHUNKS; i++) { if (!memory_chunk[i].size) continue; + if (memory_chunk[i].type == CHUNK_OLDMEM || + memory_chunk[i].type == CHUNK_CRASHK) + continue; res = alloc_bootmem_low(sizeof(struct resource)); res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; switch (memory_chunk[i].type) { case CHUNK_READ_WRITE: + case CHUNK_CRASHK: res->name = "System RAM"; break; case CHUNK_READ_ONLY: @@ -529,22 +560,19 @@ EXPORT_SYMBOL_GPL(real_memory_size); static void __init setup_memory_end(void) { - unsigned long memory_size; - unsigned long max_mem; + unsigned long vmax, vmalloc_size, tmp; int i; + #ifdef CONFIG_ZFCPDUMP - if (ipl_info.type == IPL_TYPE_FCP_DUMP) { + if (ipl_info.type == IPL_TYPE_FCP_DUMP && !OLDMEM_BASE) { memory_end = ZFCPDUMP_HSA_SIZE; memory_end_set = 1; } #endif - memory_size = 0; + real_memory_size = 0; memory_end &= PAGE_MASK; - max_mem = memory_end ? min(VMEM_MAX_PHYS, memory_end) : VMEM_MAX_PHYS; - memory_end = min(max_mem, memory_end); - /* * Make sure all chunks are MAX_ORDER aligned so we don't need the * extra checks that HOLES_IN_ZONE would require. @@ -564,23 +592,259 @@ static void __init setup_memory_end(void chunk->addr = start; chunk->size = end - start; } + real_memory_size = max(real_memory_size, + chunk->addr + chunk->size); } + /* Choose kernel address space layout: 2, 3, or 4 levels. */ +#ifdef CONFIG_64BIT + vmalloc_size = VMALLOC_END ?: 128UL << 30; + tmp = (memory_end ?: real_memory_size) / PAGE_SIZE; + tmp = tmp * (sizeof(struct page) + PAGE_SIZE) + vmalloc_size; + if (tmp <= (1UL << 42)) + vmax = 1UL << 42; /* 3-level kernel page table */ + else + vmax = 1UL << 53; /* 4-level kernel page table */ +#else + vmalloc_size = VMALLOC_END ?: 96UL << 20; + vmax = 1UL << 31; /* 2-level kernel page table */ +#endif + /* vmalloc area is at the end of the kernel address space. */ + VMALLOC_END = vmax; + VMALLOC_START = vmax - vmalloc_size; + + /* Split remaining virtual space between 1:1 mapping & vmemmap array */ + tmp = VMALLOC_START / (PAGE_SIZE + sizeof(struct page)); + tmp = VMALLOC_START - tmp * sizeof(struct page); + tmp &= ~((vmax >> 11) - 1); /* align to page table level */ + tmp = min(tmp, 1UL << MAX_PHYSMEM_BITS); + vmemmap = (struct page *) tmp; + + /* Take care that memory_end is set and <= vmemmap */ + memory_end = min(memory_end ?: real_memory_size, tmp); + + /* Fixup memory chunk array to fit into 0..memory_end */ for (i = 0; i < MEMORY_CHUNKS; i++) { struct mem_chunk *chunk = &memory_chunk[i]; - real_memory_size = max(real_memory_size, - chunk->addr + chunk->size); - if (chunk->addr >= max_mem) { + if (chunk->addr >= memory_end) { memset(chunk, 0, sizeof(*chunk)); continue; } - if (chunk->addr + chunk->size > max_mem) - chunk->size = max_mem - chunk->addr; - memory_size = max(memory_size, chunk->addr + chunk->size); + if (chunk->addr + chunk->size > memory_end) + chunk->size = memory_end - chunk->addr; + } +} + +void *restart_stack __attribute__((__section__(".data"))); + +/* + * Setup new PSW and allocate stack for PSW restart interrupt + */ +static void __init setup_restart_psw(void) +{ + psw_t psw; + + restart_stack = __alloc_bootmem(ASYNC_SIZE, ASYNC_SIZE, 0); + restart_stack += ASYNC_SIZE; + + /* + * Setup restart PSW for absolute zero lowcore. This is necesary + * if PSW restart is done on an offline CPU that has lowcore zero + */ + psw.mask = PSW_BASE_BITS | PSW_DEFAULT_KEY; + psw.addr = PSW_ADDR_AMODE | (unsigned long) psw_restart_int_handler; + copy_to_absolute_zero(&S390_lowcore.restart_psw, &psw, sizeof(psw)); +} + +static void __init setup_vmcoreinfo(void) +{ +#ifdef CONFIG_KEXEC + unsigned long ptr = paddr_vmcoreinfo_note(); + + copy_to_absolute_zero(&S390_lowcore.vmcore_info, &ptr, sizeof(ptr)); +#endif +} + +#ifdef CONFIG_CRASH_DUMP + +/* + * Find suitable location for crashkernel memory + */ +static unsigned long __init find_crash_base(unsigned long crash_size, + char **msg) +{ + unsigned long crash_base; + struct mem_chunk *chunk; + int i; + + if (memory_chunk[0].size < crash_size) { + *msg = "first memory chunk must be at least crashkernel size"; + return 0; + } + if (OLDMEM_BASE && (crash_size == OLDMEM_SIZE)) + return OLDMEM_BASE; + + for (i = MEMORY_CHUNKS - 1; i >= 0; i--) { + chunk = &memory_chunk[i]; + if (chunk->size == 0) + continue; + if (chunk->type != CHUNK_READ_WRITE) + continue; + if (chunk->size < crash_size) + continue; + crash_base = (chunk->addr + chunk->size) - crash_size; + if (crash_base < crash_size) + continue; + if (crash_base < ZFCPDUMP_HSA_SIZE_MAX) + continue; + if (crash_base < (unsigned long) INITRD_START + INITRD_SIZE) + continue; + return crash_base; } - if (!memory_end) - memory_end = memory_size; + *msg = "no suitable area found"; + return 0; +} + +/* + * Check if crash_base and crash_size is valid + */ +static int __init verify_crash_base(unsigned long crash_base, + unsigned long crash_size, + char **msg) +{ + struct mem_chunk *chunk; + int i; + + /* + * Because we do the swap to zero, we must have at least 'crash_size' + * bytes free space before crash_base + */ + if (crash_size > crash_base) { + *msg = "crashkernel offset must be greater than size"; + return -EINVAL; + } + + /* First memory chunk must be at least crash_size */ + if (memory_chunk[0].size < crash_size) { + *msg = "first memory chunk must be at least crashkernel size"; + return -EINVAL; + } + /* Check if we fit into the respective memory chunk */ + for (i = 0; i < MEMORY_CHUNKS; i++) { + chunk = &memory_chunk[i]; + if (chunk->size == 0) + continue; + if (crash_base < chunk->addr) + continue; + if (crash_base >= chunk->addr + chunk->size) + continue; + /* we have found the memory chunk */ + if (crash_base + crash_size > chunk->addr + chunk->size) { + *msg = "selected memory chunk is too small for " + "crashkernel memory"; + return -EINVAL; + } + return 0; + } + *msg = "invalid memory range specified"; + return -EINVAL; +} + +/* + * Reserve kdump memory by creating a memory hole in the mem_chunk array + */ +static void __init reserve_kdump_bootmem(unsigned long addr, unsigned long size, + int type) +{ + + create_mem_hole(memory_chunk, addr, size, type); +} + +/* + * When kdump is enabled, we have to ensure that no memory from + * the area [0 - crashkernel memory size] and + * [crashk_res.start - crashk_res.end] is set offline. + */ +static int kdump_mem_notifier(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct memory_notify *arg = data; + + if (arg->start_pfn < PFN_DOWN(resource_size(&crashk_res))) + return NOTIFY_BAD; + if (arg->start_pfn > PFN_DOWN(crashk_res.end)) + return NOTIFY_OK; + if (arg->start_pfn + arg->nr_pages - 1 < PFN_DOWN(crashk_res.start)) + return NOTIFY_OK; + return NOTIFY_BAD; +} + +static struct notifier_block kdump_mem_nb = { + .notifier_call = kdump_mem_notifier, +}; + +#endif + +/* + * Make sure that oldmem, where the dump is stored, is protected + */ +static void reserve_oldmem(void) +{ +#ifdef CONFIG_CRASH_DUMP + if (!OLDMEM_BASE) + return; + + reserve_kdump_bootmem(OLDMEM_BASE, OLDMEM_SIZE, CHUNK_OLDMEM); + reserve_kdump_bootmem(OLDMEM_SIZE, memory_end - OLDMEM_SIZE, + CHUNK_OLDMEM); + if (OLDMEM_BASE + OLDMEM_SIZE == real_memory_size) + saved_max_pfn = PFN_DOWN(OLDMEM_BASE) - 1; + else + saved_max_pfn = PFN_DOWN(real_memory_size) - 1; +#endif +} + +/* + * Reserve memory for kdump kernel to be loaded with kexec + */ +static void __init reserve_crashkernel(void) +{ +#ifdef CONFIG_CRASH_DUMP + unsigned long long crash_base, crash_size; + char *msg; + int rc; + + rc = parse_crashkernel(boot_command_line, memory_end, &crash_size, + &crash_base); + if (rc || crash_size == 0) + return; + crash_base = ALIGN(crash_base, KEXEC_CRASH_MEM_ALIGN); + crash_size = ALIGN(crash_size, KEXEC_CRASH_MEM_ALIGN); + if (register_memory_notifier(&kdump_mem_nb)) + return; + if (!crash_base) + crash_base = find_crash_base(crash_size, &msg); + if (!crash_base) { + pr_info("crashkernel reservation failed: %s\n", msg); + unregister_memory_notifier(&kdump_mem_nb); + return; + } + if (verify_crash_base(crash_base, crash_size, &msg)) { + pr_info("crashkernel reservation failed: %s\n", msg); + unregister_memory_notifier(&kdump_mem_nb); + return; + } + if (!OLDMEM_BASE && MACHINE_IS_VM) + diag10_range(PFN_DOWN(crash_base), PFN_DOWN(crash_size)); + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; + insert_resource(&iomem_resource, &crashk_res); + reserve_kdump_bootmem(crash_base, crash_size, CHUNK_CRASHK); + pr_info("Reserving %lluMB of memory at %lluMB " + "for crashkernel (System RAM: %luMB)\n", + crash_size >> 20, crash_base >> 20, memory_end >> 20); +#endif } static void __init @@ -613,6 +877,14 @@ setup_memory(void) if (PFN_PHYS(start_pfn) + bmap_size > INITRD_START) { start = PFN_PHYS(start_pfn) + bmap_size + PAGE_SIZE; +#ifdef CONFIG_CRASH_DUMP + if (OLDMEM_BASE) { + /* Move initrd behind kdump oldmem */ + if (start + INITRD_SIZE > OLDMEM_BASE && + start < OLDMEM_BASE + OLDMEM_SIZE) + start = OLDMEM_BASE + OLDMEM_SIZE; + } +#endif if (start + INITRD_SIZE > memory_end) { pr_err("initrd extends beyond end of " "memory (0x%08lx > 0x%08lx) " @@ -643,7 +915,8 @@ setup_memory(void) for (i = 0; i < MEMORY_CHUNKS && memory_chunk[i].size > 0; i++) { unsigned long start_chunk, end_chunk, pfn; - if (memory_chunk[i].type != CHUNK_READ_WRITE) + if (memory_chunk[i].type != CHUNK_READ_WRITE && + memory_chunk[i].type != CHUNK_CRASHK) continue; start_chunk = PFN_DOWN(memory_chunk[i].addr); end_chunk = start_chunk + PFN_DOWN(memory_chunk[i].size); @@ -676,6 +949,15 @@ setup_memory(void) reserve_bootmem(start_pfn << PAGE_SHIFT, bootmap_size, BOOTMEM_DEFAULT); +#ifdef CONFIG_CRASH_DUMP + if (crashk_res.start) + reserve_bootmem(crashk_res.start, + crashk_res.end - crashk_res.start + 1, + BOOTMEM_DEFAULT); + if (is_kdump_kernel()) + reserve_bootmem(elfcorehdr_addr - OLDMEM_BASE, + PAGE_ALIGN(elfcorehdr_size), BOOTMEM_DEFAULT); +#endif #ifdef CONFIG_BLK_DEV_INITRD if (INITRD_START && INITRD_SIZE) { if (INITRD_START + INITRD_SIZE <= memory_end) { @@ -763,6 +1045,14 @@ static void __init setup_hwcaps(void) */ elf_hwcap |= HWCAP_S390_HIGH_GPRS; +#if defined(CONFIG_64BIT) + /* + * Transactional execution support HWCAP_S390_TE is bit 10. + */ + if (MACHINE_HAS_TE) + elf_hwcap |= HWCAP_S390_TE; +#endif + switch (S390_lowcore.cpu_id.machine) { case 0x9672: #if !defined(CONFIG_64BIT) @@ -846,8 +1136,12 @@ setup_arch(char **cmdline_p) setup_ipl(); setup_memory_end(); setup_addressing_mode(); + reserve_oldmem(); + reserve_crashkernel(); setup_memory(); setup_resources(); + setup_vmcoreinfo(); + setup_restart_psw(); setup_lowcore(); cpu_init(); diff -uprN linux-2.6.32/arch/s390/kernel/signal.c linux-2.6.32.ovz/arch/s390/kernel/signal.c --- linux-2.6.32/arch/s390/kernel/signal.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/signal.c 2015-03-10 13:24:04.000000000 -0700 @@ -122,42 +122,46 @@ static int save_sigregs(struct pt_regs * user_sregs.regs.psw.addr = regs->psw.addr; memcpy(&user_sregs.regs.gprs, ®s->gprs, sizeof(sregs->regs.gprs)); memcpy(&user_sregs.regs.acrs, current->thread.acrs, - sizeof(sregs->regs.acrs)); + sizeof(user_sregs.regs.acrs)); /* * We have to store the fp registers to current->thread.fp_regs * to merge them with the emulated registers. */ - save_fp_regs(¤t->thread.fp_regs); + save_fp_ctl(¤t->thread.fp_regs.fpc); + save_fp_regs(current->thread.fp_regs.fprs); memcpy(&user_sregs.fpregs, ¤t->thread.fp_regs, - sizeof(s390_fp_regs)); - return __copy_to_user(sregs, &user_sregs, sizeof(_sigregs)); + sizeof(user_sregs.fpregs)); + if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs))) + return -EFAULT; + return 0; } -/* Returns positive number on error */ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) { - int err; _sigregs user_sregs; /* Alwys make any pending restarted system call return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; - err = __copy_from_user(&user_sregs, sregs, sizeof(_sigregs)); - if (err) - return err; + if (__copy_from_user(&user_sregs, sregs, sizeof(user_sregs))) + return -EFAULT; + + /* Loading the floating-point-control word can fail. Do that first. */ + if (restore_fp_ctl(&user_sregs.fpregs.fpc)) + return -EINVAL; + regs->psw.mask = PSW_MASK_MERGE(regs->psw.mask, user_sregs.regs.psw.mask); regs->psw.addr = PSW_ADDR_AMODE | user_sregs.regs.psw.addr; memcpy(®s->gprs, &user_sregs.regs.gprs, sizeof(sregs->regs.gprs)); memcpy(¤t->thread.acrs, &user_sregs.regs.acrs, - sizeof(sregs->regs.acrs)); + sizeof(current->thread.acrs)); restore_access_regs(current->thread.acrs); memcpy(¤t->thread.fp_regs, &user_sregs.fpregs, - sizeof(s390_fp_regs)); - current->thread.fp_regs.fpc &= FPC_VALID_MASK; + sizeof(current->thread.fp_regs)); - restore_fp_regs(¤t->thread.fp_regs); + restore_fp_regs(current->thread.fp_regs.fprs); regs->svcnr = 0; /* disable syscall checks */ return 0; } @@ -313,6 +317,7 @@ static int setup_frame(int sig, struct k To avoid breaking binary compatibility, they are passed as args. */ regs->gprs[4] = current->thread.trap_no; regs->gprs[5] = current->thread.prot_addr; + regs->gprs[6] = task_thread_info(current)->last_break; /* Place signal number on stack to allow backtrace from handler. */ if (__put_user(regs->gprs[2], (int __user *) &frame->signo)) @@ -376,6 +381,7 @@ static int setup_rt_frame(int sig, struc regs->gprs[2] = map_signal(sig); regs->gprs[3] = (unsigned long) &frame->info; regs->gprs[4] = (unsigned long) &frame->uc; + regs->gprs[5] = task_thread_info(current)->last_break; return 0; give_sigsegv: diff -uprN linux-2.6.32/arch/s390/kernel/smp.c linux-2.6.32.ovz/arch/s390/kernel/smp.c --- linux-2.6.32/arch/s390/kernel/smp.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/smp.c 2015-03-10 13:23:55.000000000 -0700 @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -50,8 +51,12 @@ #include #include #include +#include #include "entry.h" +/* logical cpu to cpu address */ +int __cpu_logical_map[NR_CPUS]; + static struct task_struct *current_set[NR_CPUS]; static u8 smp_cpu_type; @@ -87,22 +92,77 @@ static int cpu_stopped(int cpu) return 0; } +/* + * Ensure that PSW restart is done on an online CPU + */ +void smp_restart_with_online_cpu(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + if (stap() == __cpu_logical_map[cpu]) { + /* We are online: Enable DAT again and return */ + __load_psw_mask(psw_kernel_bits & ~PSW_MASK_MCHECK); + return; + } + } + /* We are not online: Do PSW restart on an online CPU */ + cpu = any_online_cpu(cpu_online_map); + while (signal_processor(cpu, sigp_restart) == sigp_busy) + cpu_relax(); + /* And stop ourself */ + while (raw_sigp(stap(), sigp_stop) == sigp_busy) + cpu_relax(); + for (;;); +} + +static void smp_stop_cpu(void) +{ + while (signal_processor(smp_processor_id(), sigp_stop) == sigp_busy) + cpu_relax(); +} + void smp_send_stop(void) { - int cpu, rc; + cpumask_t cpumask; + int cpu; + u64 end; /* Disable all interrupts/machine checks */ __load_psw_mask(psw_kernel_bits & ~PSW_MASK_MCHECK); trace_hardirqs_off(); - /* stop all processors */ - for_each_online_cpu(cpu) { - if (cpu == smp_processor_id()) - continue; - do { - rc = signal_processor(cpu, sigp_stop); - } while (rc == sigp_busy); + debug_set_critical(); + cpumask_copy(&cpumask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &cpumask); + + if (oops_in_progress) { + /* + * Give the other cpus the opportunity to complete + * outstanding interrupts before stopping them. + */ + end = get_clock() + (1000000UL << 12); + for_each_cpu(cpu, &cpumask) { + set_bit(ec_stop_cpu, (unsigned long *) + &lowcore_ptr[cpu]->ext_call_fast); + while (signal_processor(cpu, sigp_emergency_signal) + == sigp_busy && get_clock() < end) + cpu_relax(); + } + while (get_clock() < end) { + for_each_cpu(cpu, &cpumask) + if (cpu_stopped(cpu)) + cpumask_clear_cpu(cpu, &cpumask); + if (cpumask_empty(&cpumask)) + break; + cpu_relax(); + } + } + /* stop all processors */ + for_each_cpu(cpu, &cpumask) { + while (signal_processor(cpu, sigp_stop) == sigp_busy) + cpu_relax(); while (!cpu_stopped(cpu)) cpu_relax(); } @@ -113,25 +173,27 @@ void smp_send_stop(void) * cpus are handled. */ -static void do_ext_call_interrupt(__u16 code) +static void smp_handle_ext_call(void) { unsigned long bits; - /* - * handle bit signal external calls - * - * For the ec_schedule signal we have to do nothing. All the work - * is done automatically when we return from the interrupt. - */ + /* handle bit signal external calls */ bits = xchg(&S390_lowcore.ext_call_fast, 0); - + if (test_bit(ec_schedule, &bits)) + scheduler_ipi(); + if (test_bit(ec_stop_cpu, &bits)) + smp_stop_cpu(); if (test_bit(ec_call_function, &bits)) generic_smp_call_function_interrupt(); - if (test_bit(ec_call_function_single, &bits)) generic_smp_call_function_single_interrupt(); } +static void do_ext_call_interrupt(__u16 code) +{ + smp_handle_ext_call(); +} + /* * Send an external call sigp to another cpu and return without waiting * for its completion. @@ -243,7 +305,7 @@ EXPORT_SYMBOL(smp_ctl_clear_bit); */ #define CPU_INIT_NO 1 -#ifdef CONFIG_ZFCPDUMP +#if defined(CONFIG_ZFCPDUMP) || defined(CONFIG_CRASH_DUMP) /* * zfcpdump_prefix_array holds prefix registers for the following scenario: @@ -256,7 +318,9 @@ unsigned int zfcpdump_prefix_array[NR_CP static void __init smp_get_save_area(unsigned int cpu, unsigned int phy_cpu) { - if (ipl_info.type != IPL_TYPE_FCP_DUMP) + if (ipl_info.type != IPL_TYPE_FCP_DUMP && !OLDMEM_BASE) + return; + if (is_kdump_kernel()) return; if (cpu >= NR_CPUS) { pr_warning("CPU %i exceeds the maximum %i and is excluded from " @@ -268,10 +332,12 @@ static void __init smp_get_save_area(uns while (signal_processor(CPU_INIT_NO, sigp_stop_and_store_status) == sigp_busy) cpu_relax(); - memcpy(zfcpdump_save_areas[cpu], - (void *)(unsigned long) store_prefix() + SAVE_AREA_BASE, - SAVE_AREA_SIZE); + memcpy_real(zfcpdump_save_areas[cpu], + (void *)(unsigned long) store_prefix() + SAVE_AREA_BASE, + SAVE_AREA_SIZE); #ifdef CONFIG_64BIT + if (OLDMEM_BASE) + return; /* copy original prefix register */ zfcpdump_save_areas[cpu]->s390x.pref_reg = zfcpdump_prefix_array[cpu]; #endif @@ -380,6 +446,18 @@ static void __init smp_detect_cpus(void) info = kmalloc(sizeof(*info), GFP_KERNEL); if (!info) panic("smp_detect_cpus failed to allocate memory\n"); +#ifdef CONFIG_CRASH_DUMP + if (OLDMEM_BASE && !is_kdump_kernel()) { + union save_area *save_area; + + save_area = kmalloc(sizeof(*save_area), GFP_KERNEL); + if (!save_area) + panic("could not allocate memory for save area\n"); + copy_oldmem_page(1, (void *) save_area, sizeof(*save_area), + 0x200, 0); + zfcpdump_save_areas[0] = save_area; + } +#endif /* Use sigp detection algorithm if sclp doesn't work. */ if (sclp_get_cpu_info(info)) { smp_use_sigp_detection = 1; @@ -447,6 +525,11 @@ int __cpuinit start_secondary(void *cpuv ipi_call_lock(); cpu_set(smp_processor_id(), cpu_online_map); ipi_call_unlock(); + __ctl_clear_bit(0, 28); /* Disable lowcore protection */ + S390_lowcore.restart_psw.mask = PSW_BASE_BITS | PSW_DEFAULT_KEY; + S390_lowcore.restart_psw.addr = + PSW_ADDR_AMODE | (unsigned long) psw_restart_int_handler; + __ctl_set_bit(0, 28); /* Enable lowcore protection */ /* Switch on interrupts */ local_irq_enable(); /* Print info about this processor */ @@ -486,7 +569,11 @@ static int __cpuinit smp_alloc_lowcore(i memset((char *)lowcore + 512, 0, sizeof(*lowcore) - 512); lowcore->async_stack = async_stack + ASYNC_SIZE; lowcore->panic_stack = panic_stack + PAGE_SIZE; - + lowcore->restart_psw.mask = PSW_BASE_BITS | PSW_DEFAULT_KEY; + lowcore->restart_psw.addr = + PSW_ADDR_AMODE | (unsigned long) restart_int_handler; + if (user_mode != HOME_SPACE_MODE) + lowcore->restart_psw.mask |= PSW_ASC_HOME; #ifndef CONFIG_64BIT if (MACHINE_HAS_IEEE) { unsigned long save_area; @@ -564,6 +651,7 @@ int __cpuinit __cpu_up(unsigned int cpu) sf->gprs[9] = (unsigned long) sf; cpu_lowcore->save_area[15] = (unsigned long) sf; __ctl_store(cpu_lowcore->cregs_save_area, 0, 15); + atomic_inc(&init_mm.context.attach_count); asm volatile( " stam 0,15,0(%0)" : : "a" (&cpu_lowcore->access_regs_save_area) : "memory"); @@ -606,6 +694,8 @@ int __cpu_disable(void) struct ec_creg_mask_parms cr_parms; int cpu = smp_processor_id(); + /* Handle possible pending IPIs */ + smp_handle_ext_call(); cpu_clear(cpu, cpu_online_map); /* Disable pfault pseudo page faults on this cpu. */ @@ -640,6 +730,7 @@ void __cpu_die(unsigned int cpu) while (signal_processor_p(0, cpu, sigp_set_prefix) == sigp_busy) udelay(10); smp_free_lowcore(cpu); + atomic_dec(&init_mm.context.attach_count); pr_info("Processor %d stopped\n", cpu); } @@ -917,13 +1008,10 @@ static int __cpuinit smp_cpu_notify(stru unsigned int cpu = (unsigned int)(long)hcpu; struct cpu *c = &per_cpu(cpu_devices, cpu); struct sys_device *s = &c->sysdev; - struct s390_idle_data *idle; switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - idle = &per_cpu(s390_idle, cpu); - memset(idle, 0, sizeof(struct s390_idle_data)); if (sysfs_create_group(&s->kobj, &cpu_online_attr_group)) return NOTIFY_BAD; break; diff -uprN linux-2.6.32/arch/s390/kernel/suspend.c linux-2.6.32.ovz/arch/s390/kernel/suspend.c --- linux-2.6.32/arch/s390/kernel/suspend.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/suspend.c 2015-03-10 13:23:55.000000000 -0700 @@ -7,6 +7,7 @@ */ #include +#include #include /* @@ -14,6 +15,8 @@ */ extern const void __nosave_begin, __nosave_end; +unsigned long suspend_zero_pages; + int pfn_is_nosave(unsigned long pfn) { unsigned long nosave_begin_pfn = PFN_DOWN(__pa(&__nosave_begin)); @@ -30,6 +33,36 @@ int pfn_is_nosave(unsigned long pfn) return 0; } +/* + * PM notifier callback for suspend + */ +static int suspend_pm_cb(struct notifier_block *nb, unsigned long action, + void *ptr) +{ + switch (action) { + case PM_SUSPEND_PREPARE: + case PM_HIBERNATION_PREPARE: + suspend_zero_pages = __get_free_pages(GFP_KERNEL, LC_ORDER); + if (!suspend_zero_pages) + return NOTIFY_BAD; + break; + case PM_POST_SUSPEND: + case PM_POST_HIBERNATION: + free_pages(suspend_zero_pages, LC_ORDER); + break; + default: + return NOTIFY_DONE; + } + return NOTIFY_OK; +} + +static int __init suspend_pm_init(void) +{ + pm_notifier(suspend_pm_cb, 0); + return 0; +} +arch_initcall(suspend_pm_init); + void save_processor_state(void) { /* swsusp_arch_suspend() actually saves all cpu register contents. diff -uprN linux-2.6.32/arch/s390/kernel/swsusp_asm64.S linux-2.6.32.ovz/arch/s390/kernel/swsusp_asm64.S --- linux-2.6.32/arch/s390/kernel/swsusp_asm64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/swsusp_asm64.S 2015-03-10 13:23:55.000000000 -0700 @@ -36,8 +36,8 @@ swsusp_arch_suspend: /* Store prefix register on stack */ stpx __SF_EMPTY(%r15) - /* Save prefix register contents for lowcore */ - llgf %r4,__SF_EMPTY(%r15) + /* Save prefix register contents for lowcore copy */ + llgf %r10,__SF_EMPTY(%r15) /* Get pointer to save area */ lghi %r1,0x1000 @@ -91,7 +91,18 @@ swsusp_arch_suspend: xc __SF_EMPTY(4,%r15),__SF_EMPTY(%r15) spx __SF_EMPTY(%r15) + /* Save absolute zero pages */ + larl %r2,suspend_zero_pages + lg %r2,0(%r2) + lghi %r4,0 + lghi %r3,2*PAGE_SIZE + lghi %r5,2*PAGE_SIZE +1: mvcle %r2,%r4,0 + jo 1b + + /* Copy lowcore to absolute zero lowcore */ lghi %r2,0 + lgr %r4,%r10 lghi %r3,2*PAGE_SIZE lghi %r5,2*PAGE_SIZE 1: mvcle %r2,%r4,0 @@ -246,8 +257,20 @@ restore_registers: /* Load old stack */ lg %r15,0x2f8(%r13) + /* Save prefix register */ + mvc __SF_EMPTY(4,%r15),0x318(%r13) + + /* Restore absolute zero pages */ + lghi %r2,0 + larl %r4,suspend_zero_pages + lg %r4,0(%r4) + lghi %r3,2*PAGE_SIZE + lghi %r5,2*PAGE_SIZE +1: mvcle %r2,%r4,0 + jo 1b + /* Restore prefix register */ - spx 0x318(%r13) + spx __SF_EMPTY(%r15) /* Activate DAT */ stosm __SF_EMPTY(%r15),0x04 @@ -256,6 +279,12 @@ restore_registers: lghi %r2,0 brasl %r14,arch_set_page_states + /* Log potential guest relocation */ + brasl %r14,lgr_info_log + + /* Reinitialize the channel subsystem */ + brasl %r14,channel_subsystem_reinit + /* Return 0 */ lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) lghi %r2,0 diff -uprN linux-2.6.32/arch/s390/kernel/syscalls.S linux-2.6.32.ovz/arch/s390/kernel/syscalls.S --- linux-2.6.32/arch/s390/kernel/syscalls.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/syscalls.S 2015-03-10 13:23:54.000000000 -0700 @@ -340,3 +340,14 @@ SYSCALL(sys_preadv,sys_preadv,compat_sys SYSCALL(sys_pwritev,sys_pwritev,compat_sys_pwritev_wrapper) SYSCALL(sys_rt_tgsigqueueinfo,sys_rt_tgsigqueueinfo,compat_sys_rt_tgsigqueueinfo_wrapper) /* 330 */ SYSCALL(sys_perf_event_open,sys_perf_event_open,sys_perf_event_open_wrapper) +NI_SYSCALL /* fanotify_init */ +NI_SYSCALL /* fanotify_mark */ +NI_SYSCALL /* prlimit64 */ +NI_SYSCALL /* name_to_handle_at */ +NI_SYSCALL /* open_by_handle_at */ +NI_SYSCALL /* clock_adjtime */ +SYSCALL(sys_syncfs,sys_syncfs,sys_syncfs_wrapper) +SYSCALL(sys_setns,sys_setns,sys_setns_wrapper) +NI_SYSCALL /* process_vm_readv */ +NI_SYSCALL /* process_vm_writev */ +SYSCALL(sys_ni_syscall,sys_s390_runtime_instr,sys_s390_runtime_instr_wrapper) diff -uprN linux-2.6.32/arch/s390/kernel/sysinfo.c linux-2.6.32.ovz/arch/s390/kernel/sysinfo.c --- linux-2.6.32/arch/s390/kernel/sysinfo.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/sysinfo.c 2015-03-10 13:24:14.000000000 -0700 @@ -23,14 +23,14 @@ static inline int stsi_0(void) { int rc = stsi(NULL, 0, 0, 0); + return rc == -ENOSYS ? rc : (((unsigned int) rc) >> 28); } -static int stsi_1_1_1(struct sysinfo_1_1_1 *info, char *page, int len) +static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info) { if (stsi(info, 1, 1, 1) == -ENOSYS) - return len; - + return; EBCASC(info->manufacturer, sizeof(info->manufacturer)); EBCASC(info->type, sizeof(info->type)); EBCASC(info->model, sizeof(info->model)); @@ -39,202 +39,159 @@ static int stsi_1_1_1(struct sysinfo_1_1 EBCASC(info->model_capacity, sizeof(info->model_capacity)); EBCASC(info->model_perm_cap, sizeof(info->model_perm_cap)); EBCASC(info->model_temp_cap, sizeof(info->model_temp_cap)); - len += sprintf(page + len, "Manufacturer: %-16.16s\n", - info->manufacturer); - len += sprintf(page + len, "Type: %-4.4s\n", - info->type); + seq_printf(m, "Manufacturer: %-16.16s\n", info->manufacturer); + seq_printf(m, "Type: %-4.4s\n", info->type); + /* + * Sigh: the model field has been renamed with System z9 + * to model_capacity and a new model field has been added + * after the plant field. To avoid confusing older programs + * the "Model:" prints "model_capacity model" or just + * "model_capacity" if the model string is empty . + */ + seq_printf(m, "Model: %-16.16s", info->model_capacity); if (info->model[0] != '\0') - /* - * Sigh: the model field has been renamed with System z9 - * to model_capacity and a new model field has been added - * after the plant field. To avoid confusing older programs - * the "Model:" prints "model_capacity model" or just - * "model_capacity" if the model string is empty . - */ - len += sprintf(page + len, - "Model: %-16.16s %-16.16s\n", - info->model_capacity, info->model); - else - len += sprintf(page + len, "Model: %-16.16s\n", - info->model_capacity); - len += sprintf(page + len, "Sequence Code: %-16.16s\n", - info->sequence); - len += sprintf(page + len, "Plant: %-4.4s\n", - info->plant); - len += sprintf(page + len, "Model Capacity: %-16.16s %08u\n", - info->model_capacity, *(u32 *) info->model_cap_rating); + seq_printf(m, " %-16.16s", info->model); + seq_putc(m, '\n'); + seq_printf(m, "Sequence Code: %-16.16s\n", info->sequence); + seq_printf(m, "Plant: %-4.4s\n", info->plant); + seq_printf(m, "Model Capacity: %-16.16s %08u\n", + info->model_capacity, *(u32 *) info->model_cap_rating); if (info->model_perm_cap[0] != '\0') - len += sprintf(page + len, - "Model Perm. Capacity: %-16.16s %08u\n", - info->model_perm_cap, - *(u32 *) info->model_perm_cap_rating); + seq_printf(m, "Model Perm. Capacity: %-16.16s %08u\n", + info->model_perm_cap, + *(u32 *) info->model_perm_cap_rating); if (info->model_temp_cap[0] != '\0') - len += sprintf(page + len, - "Model Temp. Capacity: %-16.16s %08u\n", - info->model_temp_cap, - *(u32 *) info->model_temp_cap_rating); - return len; + seq_printf(m, "Model Temp. Capacity: %-16.16s %08u\n", + info->model_temp_cap, + *(u32 *) info->model_temp_cap_rating); + if (info->cai) { + seq_printf(m, "Capacity Adj. Ind.: %d\n", info->cai); + seq_printf(m, "Capacity Ch. Reason: %d\n", info->ccr); + } } -static int stsi_1_2_2(struct sysinfo_1_2_2 *info, char *page, int len) +static void stsi_1_2_2(struct seq_file *m, struct sysinfo_1_2_2 *info) { struct sysinfo_1_2_2_extension *ext; int i; if (stsi(info, 1, 2, 2) == -ENOSYS) - return len; + return; ext = (struct sysinfo_1_2_2_extension *) ((unsigned long) info + info->acc_offset); - - len += sprintf(page + len, "\n"); - len += sprintf(page + len, "CPUs Total: %d\n", - info->cpus_total); - len += sprintf(page + len, "CPUs Configured: %d\n", - info->cpus_configured); - len += sprintf(page + len, "CPUs Standby: %d\n", - info->cpus_standby); - len += sprintf(page + len, "CPUs Reserved: %d\n", - info->cpus_reserved); - - if (info->format == 1) { - /* - * Sigh 2. According to the specification the alternate - * capability field is a 32 bit floating point number - * if the higher order 8 bits are not zero. Printing - * a floating point number in the kernel is a no-no, - * always print the number as 32 bit unsigned integer. - * The user-space needs to know about the strange - * encoding of the alternate cpu capability. - */ - len += sprintf(page + len, "Capability: %u %u\n", - info->capability, ext->alt_capability); - for (i = 2; i <= info->cpus_total; i++) - len += sprintf(page + len, - "Adjustment %02d-way: %u %u\n", - i, info->adjustment[i-2], - ext->alt_adjustment[i-2]); - - } else { - len += sprintf(page + len, "Capability: %u\n", - info->capability); - for (i = 2; i <= info->cpus_total; i++) - len += sprintf(page + len, - "Adjustment %02d-way: %u\n", - i, info->adjustment[i-2]); + seq_putc(m, '\n'); + seq_printf(m, "CPUs Total: %d\n", info->cpus_total); + seq_printf(m, "CPUs Configured: %d\n", info->cpus_configured); + seq_printf(m, "CPUs Standby: %d\n", info->cpus_standby); + seq_printf(m, "CPUs Reserved: %d\n", info->cpus_reserved); + /* + * Sigh 2. According to the specification the alternate + * capability field is a 32 bit floating point number + * if the higher order 8 bits are not zero. Printing + * a floating point number in the kernel is a no-no, + * always print the number as 32 bit unsigned integer. + * The user-space needs to know about the strange + * encoding of the alternate cpu capability. + */ + seq_printf(m, "Capability: %u", info->capability); + if (info->format == 1) + seq_printf(m, " %u", ext->alt_capability); + seq_putc(m, '\n'); + for (i = 2; i <= info->cpus_total; i++) { + seq_printf(m, "Adjustment %02d-way: %u", + i, info->adjustment[i-2]); + if (info->format == 1) + seq_printf(m, " %u", ext->alt_adjustment[i-2]); + seq_putc(m, '\n'); } - - if (info->secondary_capability != 0) - len += sprintf(page + len, "Secondary Capability: %d\n", - info->secondary_capability); - return len; + if (info->secondary_capability) + seq_printf(m, "Secondary Capability: %d\n", + info->secondary_capability); } -static int stsi_2_2_2(struct sysinfo_2_2_2 *info, char *page, int len) +static void stsi_2_2_2(struct seq_file *m, struct sysinfo_2_2_2 *info) { if (stsi(info, 2, 2, 2) == -ENOSYS) - return len; - + return; EBCASC(info->name, sizeof(info->name)); - - len += sprintf(page + len, "\n"); - len += sprintf(page + len, "LPAR Number: %d\n", - info->lpar_number); - - len += sprintf(page + len, "LPAR Characteristics: "); + seq_putc(m, '\n'); + seq_printf(m, "LPAR Number: %d\n", info->lpar_number); + seq_printf(m, "LPAR Characteristics: "); if (info->characteristics & LPAR_CHAR_DEDICATED) - len += sprintf(page + len, "Dedicated "); + seq_printf(m, "Dedicated "); if (info->characteristics & LPAR_CHAR_SHARED) - len += sprintf(page + len, "Shared "); + seq_printf(m, "Shared "); if (info->characteristics & LPAR_CHAR_LIMITED) - len += sprintf(page + len, "Limited "); - len += sprintf(page + len, "\n"); - - len += sprintf(page + len, "LPAR Name: %-8.8s\n", - info->name); - - len += sprintf(page + len, "LPAR Adjustment: %d\n", - info->caf); - - len += sprintf(page + len, "LPAR CPUs Total: %d\n", - info->cpus_total); - len += sprintf(page + len, "LPAR CPUs Configured: %d\n", - info->cpus_configured); - len += sprintf(page + len, "LPAR CPUs Standby: %d\n", - info->cpus_standby); - len += sprintf(page + len, "LPAR CPUs Reserved: %d\n", - info->cpus_reserved); - len += sprintf(page + len, "LPAR CPUs Dedicated: %d\n", - info->cpus_dedicated); - len += sprintf(page + len, "LPAR CPUs Shared: %d\n", - info->cpus_shared); - return len; + seq_printf(m, "Limited "); + seq_putc(m, '\n'); + seq_printf(m, "LPAR Name: %-8.8s\n", info->name); + seq_printf(m, "LPAR Adjustment: %d\n", info->caf); + seq_printf(m, "LPAR CPUs Total: %d\n", info->cpus_total); + seq_printf(m, "LPAR CPUs Configured: %d\n", info->cpus_configured); + seq_printf(m, "LPAR CPUs Standby: %d\n", info->cpus_standby); + seq_printf(m, "LPAR CPUs Reserved: %d\n", info->cpus_reserved); + seq_printf(m, "LPAR CPUs Dedicated: %d\n", info->cpus_dedicated); + seq_printf(m, "LPAR CPUs Shared: %d\n", info->cpus_shared); } -static int stsi_3_2_2(struct sysinfo_3_2_2 *info, char *page, int len) +static void stsi_3_2_2(struct seq_file *m, struct sysinfo_3_2_2 *info) { int i; if (stsi(info, 3, 2, 2) == -ENOSYS) - return len; + return; for (i = 0; i < info->count; i++) { EBCASC(info->vm[i].name, sizeof(info->vm[i].name)); EBCASC(info->vm[i].cpi, sizeof(info->vm[i].cpi)); - len += sprintf(page + len, "\n"); - len += sprintf(page + len, "VM%02d Name: %-8.8s\n", - i, info->vm[i].name); - len += sprintf(page + len, "VM%02d Control Program: %-16.16s\n", - i, info->vm[i].cpi); - - len += sprintf(page + len, "VM%02d Adjustment: %d\n", - i, info->vm[i].caf); - - len += sprintf(page + len, "VM%02d CPUs Total: %d\n", - i, info->vm[i].cpus_total); - len += sprintf(page + len, "VM%02d CPUs Configured: %d\n", - i, info->vm[i].cpus_configured); - len += sprintf(page + len, "VM%02d CPUs Standby: %d\n", - i, info->vm[i].cpus_standby); - len += sprintf(page + len, "VM%02d CPUs Reserved: %d\n", - i, info->vm[i].cpus_reserved); + seq_putc(m, '\n'); + seq_printf(m, "VM%02d Name: %-8.8s\n", i, info->vm[i].name); + seq_printf(m, "VM%02d Control Program: %-16.16s\n", i, info->vm[i].cpi); + seq_printf(m, "VM%02d Adjustment: %d\n", i, info->vm[i].caf); + seq_printf(m, "VM%02d CPUs Total: %d\n", i, info->vm[i].cpus_total); + seq_printf(m, "VM%02d CPUs Configured: %d\n", i, info->vm[i].cpus_configured); + seq_printf(m, "VM%02d CPUs Standby: %d\n", i, info->vm[i].cpus_standby); + seq_printf(m, "VM%02d CPUs Reserved: %d\n", i, info->vm[i].cpus_reserved); } - return len; } -static int proc_read_sysinfo(char *page, char **start, - off_t off, int count, - int *eof, void *data) +static int sysinfo_show(struct seq_file *m, void *v) { - unsigned long info = get_zeroed_page(GFP_KERNEL); - int level, len; + void *info = (void *)get_zeroed_page(GFP_KERNEL); + int level; if (!info) return 0; - - len = 0; level = stsi_0(); if (level >= 1) - len = stsi_1_1_1((struct sysinfo_1_1_1 *) info, page, len); - + stsi_1_1_1(m, info); if (level >= 1) - len = stsi_1_2_2((struct sysinfo_1_2_2 *) info, page, len); - + stsi_1_2_2(m, info); if (level >= 2) - len = stsi_2_2_2((struct sysinfo_2_2_2 *) info, page, len); - + stsi_2_2_2(m, info); if (level >= 3) - len = stsi_3_2_2((struct sysinfo_3_2_2 *) info, page, len); + stsi_3_2_2(m, info); + free_page((unsigned long)info); + return 0; +} - free_page(info); - return len; +static int sysinfo_open(struct inode *inode, struct file *file) +{ + return single_open(file, sysinfo_show, NULL); } -static __init int create_proc_sysinfo(void) +static const struct file_operations sysinfo_fops = { + .open = sysinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init sysinfo_create_proc(void) { - create_proc_read_entry("sysinfo", 0444, NULL, - proc_read_sysinfo, NULL); + proc_create("sysinfo", 0444, NULL, &sysinfo_fops); return 0; } -device_initcall(create_proc_sysinfo); +device_initcall(sysinfo_create_proc); /* * Service levels interface. diff -uprN linux-2.6.32/arch/s390/kernel/sys_s390.c linux-2.6.32.ovz/arch/s390/kernel/sys_s390.c --- linux-2.6.32/arch/s390/kernel/sys_s390.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/sys_s390.c 2015-03-10 13:21:11.000000000 -0700 @@ -32,32 +32,6 @@ #include #include "entry.h" -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - long error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. Linux for S/390 isn't able to handle more than 5 @@ -81,7 +55,7 @@ SYSCALL_DEFINE1(mmap2, struct mmap_arg_s if (copy_from_user(&a, arg, sizeof(a))) goto out; - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); out: return error; } @@ -98,7 +72,7 @@ SYSCALL_DEFINE1(s390_old_mmap, struct mm if (a.offset & ~PAGE_MASK) goto out; - error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); out: return error; } diff -uprN linux-2.6.32/arch/s390/kernel/time.c linux-2.6.32.ovz/arch/s390/kernel/time.c --- linux-2.6.32/arch/s390/kernel/time.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/time.c 2015-03-10 13:23:55.000000000 -0700 @@ -69,7 +69,7 @@ static DEFINE_PER_CPU(struct clock_event */ unsigned long long notrace sched_clock(void) { - return (get_clock_monotonic() * 125) >> 9; + return tod_to_ns(get_clock_monotonic()); } /* @@ -81,15 +81,15 @@ unsigned long long monotonic_clock(void) } EXPORT_SYMBOL(monotonic_clock); -void tod_to_timeval(__u64 todval, struct timespec *xtime) +void tod_to_timeval(__u64 todval, struct timespec *xt) { unsigned long long sec; sec = todval >> 12; do_div(sec, 1000000); - xtime->tv_sec = sec; + xt->tv_sec = sec; todval -= (sec * 1000000) << 12; - xtime->tv_nsec = ((todval * 1000) >> 12); + xt->tv_nsec = ((todval * 1000) >> 12); } EXPORT_SYMBOL(tod_to_timeval); @@ -214,7 +214,8 @@ struct clocksource * __init clocksource_ return &clocksource_tod; } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) +void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult) { if (clock != &clocksource_tod) return; @@ -223,10 +224,11 @@ void update_vsyscall(struct timespec *wa ++vdso_data->tb_update_count; smp_wmb(); vdso_data->xtime_tod_stamp = clock->cycle_last; - vdso_data->xtime_clock_sec = xtime.tv_sec; - vdso_data->xtime_clock_nsec = xtime.tv_nsec; - vdso_data->wtom_clock_sec = wall_to_monotonic.tv_sec; - vdso_data->wtom_clock_nsec = wall_to_monotonic.tv_nsec; + vdso_data->xtime_clock_sec = wall_time->tv_sec; + vdso_data->xtime_clock_nsec = wall_time->tv_nsec; + vdso_data->wtom_clock_sec = wtm->tv_sec; + vdso_data->wtom_clock_nsec = wtm->tv_nsec; + vdso_data->ntp_mult = mult; smp_wmb(); ++vdso_data->tb_update_count; } @@ -396,7 +398,6 @@ static void __init time_init_wq(void) if (time_sync_wq) return; time_sync_wq = create_singlethread_workqueue("timesync"); - stop_machine_create(); } /* @@ -530,8 +531,11 @@ void etr_switch_to_local(void) if (!etr_eacr.sl) return; disable_sync_clock(NULL); - set_bit(ETR_EVENT_SWITCH_LOCAL, &etr_events); - queue_work(time_sync_wq, &etr_work); + if (!test_and_set_bit(ETR_EVENT_SWITCH_LOCAL, &etr_events)) { + etr_eacr.es = etr_eacr.sl = 0; + etr_setr(&etr_eacr); + queue_work(time_sync_wq, &etr_work); + } } /* @@ -545,8 +549,11 @@ void etr_sync_check(void) if (!etr_eacr.es) return; disable_sync_clock(NULL); - set_bit(ETR_EVENT_SYNC_CHECK, &etr_events); - queue_work(time_sync_wq, &etr_work); + if (!test_and_set_bit(ETR_EVENT_SYNC_CHECK, &etr_events)) { + etr_eacr.es = 0; + etr_setr(&etr_eacr); + queue_work(time_sync_wq, &etr_work); + } } /* @@ -908,7 +915,7 @@ static struct etr_eacr etr_handle_update * Do not try to get the alternate port aib if the clock * is not in sync yet. */ - if (!check_sync_clock()) + if (!eacr.es || !check_sync_clock()) return eacr; /* @@ -1070,7 +1077,7 @@ static void etr_work_fn(struct work_stru * If the clock is in sync just update the eacr and return. * If there is no valid sync port wait for a port update. */ - if (check_sync_clock() || sync_port < 0) { + if ((eacr.es && check_sync_clock()) || sync_port < 0) { etr_update_eacr(eacr); etr_set_tolec_timeout(now); goto out_unlock; diff -uprN linux-2.6.32/arch/s390/kernel/topology.c linux-2.6.32.ovz/arch/s390/kernel/topology.c --- linux-2.6.32/arch/s390/kernel/topology.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/topology.c 2015-03-10 13:22:07.000000000 -0700 @@ -37,7 +37,8 @@ struct tl_cpu { }; struct tl_container { - unsigned char reserved[8]; + unsigned char reserved[7]; + unsigned char id; }; union tl_entry { @@ -56,15 +57,15 @@ struct tl_info { union tl_entry tle[0]; }; -struct core_info { - struct core_info *next; +struct mask_info { + struct mask_info *next; + unsigned char id; cpumask_t mask; }; -static int topology_enabled; +static int topology_enabled = 1; static void topology_work_fn(struct work_struct *work); static struct tl_info *tl_info; -static struct core_info core_info; static int machine_has_topology; static struct timer_list topology_timer; static void set_topology_timer(void); @@ -72,37 +73,37 @@ static DECLARE_WORK(topology_work, topol /* topology_lock protects the core linked list */ static DEFINE_SPINLOCK(topology_lock); +static struct mask_info core_info; cpumask_t cpu_core_map[NR_CPUS]; +unsigned char cpu_core_id[NR_CPUS]; -static cpumask_t cpu_coregroup_map(unsigned int cpu) +#ifdef CONFIG_SCHED_BOOK +static struct mask_info book_info; +cpumask_t cpu_book_map[NR_CPUS]; +unsigned char cpu_book_id[NR_CPUS]; +#endif + +static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu) { - struct core_info *core = &core_info; - unsigned long flags; cpumask_t mask; cpus_clear(mask); if (!topology_enabled || !machine_has_topology) - return cpu_possible_map; - spin_lock_irqsave(&topology_lock, flags); - while (core) { - if (cpu_isset(cpu, core->mask)) { - mask = core->mask; + return cpumask_of_cpu(cpu); + while (info) { + if (cpu_isset(cpu, info->mask)) { + mask = info->mask; break; } - core = core->next; + info = info->next; } - spin_unlock_irqrestore(&topology_lock, flags); if (cpus_empty(mask)) mask = cpumask_of_cpu(cpu); return mask; } -const struct cpumask *cpu_coregroup_mask(unsigned int cpu) -{ - return &cpu_core_map[cpu]; -} - -static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core) +static void add_cpus_to_mask(struct tl_cpu *tl_cpu, struct mask_info *book, + struct mask_info *core) { unsigned int cpu; @@ -114,22 +115,35 @@ static void add_cpus_to_core(struct tl_c rcpu = CPU_BITS - 1 - cpu + tl_cpu->origin; for_each_present_cpu(lcpu) { - if (__cpu_logical_map[lcpu] == rcpu) { - cpu_set(lcpu, core->mask); - smp_cpu_polarization[lcpu] = tl_cpu->pp; - } + if (cpu_logical_map(lcpu) != rcpu) + continue; +#ifdef CONFIG_SCHED_BOOK + cpu_set(lcpu, book->mask); + cpu_book_id[lcpu] = book->id; +#endif + cpu_set(lcpu, core->mask); + cpu_core_id[lcpu] = core->id; + smp_cpu_polarization[lcpu] = tl_cpu->pp; } } } -static void clear_cores(void) +static void clear_masks(void) { - struct core_info *core = &core_info; + struct mask_info *info; - while (core) { - cpus_clear(core->mask); - core = core->next; + info = &core_info; + while (info) { + cpus_clear(info->mask); + info = info->next; } +#ifdef CONFIG_SCHED_BOOK + info = &book_info; + while (info) { + cpus_clear(info->mask); + info = info->next; + } +#endif } static union tl_entry *next_tle(union tl_entry *tle) @@ -142,33 +156,42 @@ static union tl_entry *next_tle(union tl static void tl_to_cores(struct tl_info *info) { +#ifdef CONFIG_SCHED_BOOK + struct mask_info *book = &book_info; +#else + struct mask_info *book = NULL; +#endif + struct mask_info *core = &core_info; union tl_entry *tle, *end; - struct core_info *core = &core_info; + spin_lock_irq(&topology_lock); - clear_cores(); + clear_masks(); tle = info->tle; end = (union tl_entry *)((unsigned long)info + info->length); while (tle < end) { switch (tle->nl) { - case 5: - case 4: - case 3: +#ifdef CONFIG_SCHED_BOOK case 2: + book = book->next; + book->id = tle->container.id; break; +#endif case 1: core = core->next; + core->id = tle->container.id; break; case 0: - add_cpus_to_core(&tle->cpu, core); + add_cpus_to_mask(&tle->cpu, book, core); break; default: - clear_cores(); + clear_masks(); machine_has_topology = 0; - return; + goto out; } tle = next_tle(tle); } +out: spin_unlock_irq(&topology_lock); } @@ -215,10 +238,29 @@ int topology_set_cpu_management(int fc) static void update_cpu_core_map(void) { + unsigned long flags; int cpu; - for_each_possible_cpu(cpu) - cpu_core_map[cpu] = cpu_coregroup_map(cpu); + spin_lock_irqsave(&topology_lock, flags); + for_each_possible_cpu(cpu) { + cpu_core_map[cpu] = cpu_group_map(&core_info, cpu); +#ifdef CONFIG_SCHED_BOOK + cpu_book_map[cpu] = cpu_group_map(&book_info, cpu); +#endif + } + spin_unlock_irqrestore(&topology_lock, flags); +} + +static void store_topology(struct tl_info *info) +{ +#ifdef CONFIG_SCHED_BOOK + int rc; + + rc = stsi(info, 15, 1, 3); + if (rc != -ENOSYS) + return; +#endif + stsi(info, 15, 1, 2); } int arch_update_cpu_topology(void) @@ -232,7 +274,7 @@ int arch_update_cpu_topology(void) topology_update_polarization_simple(); return 0; } - stsi(info, 15, 1, 2); + store_topology(info); tl_to_cores(info); update_cpu_core_map(); for_each_online_cpu(cpu) { @@ -269,9 +311,9 @@ static void set_topology_timer(void) static int __init early_parse_topology(char *p) { - if (strncmp(p, "on", 2)) + if (strncmp(p, "off", 3)) return 0; - topology_enabled = 1; + topology_enabled = 0; return 0; } early_param("topology", early_parse_topology); @@ -293,12 +335,24 @@ out: } __initcall(init_topology_update); +static void alloc_masks(struct tl_info *info, struct mask_info *mask, int offset) +{ + int i, nr_masks; + + nr_masks = info->mag[NR_MAG - offset]; + for (i = 0; i < info->mnest - offset; i++) + nr_masks *= info->mag[NR_MAG - offset - 1 - i]; + nr_masks = max(nr_masks, 1); + for (i = 0; i < nr_masks; i++) { + mask->next = alloc_bootmem(sizeof(struct mask_info)); + mask = mask->next; + } +} + void __init s390_init_cpu_topology(void) { unsigned long long facility_bits; struct tl_info *info; - struct core_info *core; - int nr_cores; int i; if (stfle(&facility_bits, 1) <= 0) @@ -309,25 +363,13 @@ void __init s390_init_cpu_topology(void) tl_info = alloc_bootmem_pages(PAGE_SIZE); info = tl_info; - stsi(info, 15, 1, 2); - - nr_cores = info->mag[NR_MAG - 2]; - for (i = 0; i < info->mnest - 2; i++) - nr_cores *= info->mag[NR_MAG - 3 - i]; - + store_topology(info); pr_info("The CPU configuration topology of the machine is:"); for (i = 0; i < NR_MAG; i++) printk(" %d", info->mag[i]); printk(" / %d\n", info->mnest); - - core = &core_info; - for (i = 0; i < nr_cores; i++) { - core->next = alloc_bootmem(sizeof(struct core_info)); - core = core->next; - if (!core) - goto error; - } - return; -error: - machine_has_topology = 0; + alloc_masks(info, &core_info, 2); +#ifdef CONFIG_SCHED_BOOK + alloc_masks(info, &book_info, 3); +#endif } diff -uprN linux-2.6.32/arch/s390/kernel/traps.c linux-2.6.32.ovz/arch/s390/kernel/traps.c --- linux-2.6.32/arch/s390/kernel/traps.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/traps.c 2015-03-10 13:24:05.000000000 -0700 @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -42,6 +42,7 @@ #include #include #include +#include #include "entry.h" pgm_check_handler_t *pgm_check_table[128]; @@ -56,7 +57,6 @@ int sysctl_userprocess_debug = 0; extern pgm_check_handler_t do_protection_exception; extern pgm_check_handler_t do_dat_exception; -extern pgm_check_handler_t do_asce_exception; #define stack_pointer ({ void **sp; asm("la %0,0(15)" : "=&d" (sp)); sp; }) @@ -70,6 +70,23 @@ static int kstack_depth_to_print = 12; static int kstack_depth_to_print = 20; #endif /* CONFIG_64BIT */ +static inline void __user *get_trap_ip(struct pt_regs *regs, long int_code) +{ +#ifdef CONFIG_64BIT + unsigned long address; + + if (int_code & 0x200) + address = *(unsigned long *)(current->thread.trap_tdb + 24); + else + address = regs->psw.addr; + return (void __user *) + ((address - (int_code >> 16)) & PSW_ADDR_INSN); +#else + return (void __user *) + ((regs->psw.addr - (int_code >> 16)) & PSW_ADDR_INSN); +#endif +} + /* * For show_trace we have tree different stack to consider: * - the panic stack which is used if the kernel stack has overflown @@ -243,43 +260,6 @@ void show_regs(struct pt_regs *regs) show_last_breaking_event(regs); } -/* This is called from fs/proc/array.c */ -void task_show_regs(struct seq_file *m, struct task_struct *task) -{ - struct pt_regs *regs; - - regs = task_pt_regs(task); - seq_printf(m, "task: %p, ksp: %p\n", - task, (void *)task->thread.ksp); - seq_printf(m, "User PSW : %p %p\n", - (void *) regs->psw.mask, (void *)regs->psw.addr); - - seq_printf(m, "User GPRS: " FOURLONG, - regs->gprs[0], regs->gprs[1], - regs->gprs[2], regs->gprs[3]); - seq_printf(m, " " FOURLONG, - regs->gprs[4], regs->gprs[5], - regs->gprs[6], regs->gprs[7]); - seq_printf(m, " " FOURLONG, - regs->gprs[8], regs->gprs[9], - regs->gprs[10], regs->gprs[11]); - seq_printf(m, " " FOURLONG, - regs->gprs[12], regs->gprs[13], - regs->gprs[14], regs->gprs[15]); - seq_printf(m, "User ACRS: %08x %08x %08x %08x\n", - task->thread.acrs[0], task->thread.acrs[1], - task->thread.acrs[2], task->thread.acrs[3]); - seq_printf(m, " %08x %08x %08x %08x\n", - task->thread.acrs[4], task->thread.acrs[5], - task->thread.acrs[6], task->thread.acrs[7]); - seq_printf(m, " %08x %08x %08x %08x\n", - task->thread.acrs[8], task->thread.acrs[9], - task->thread.acrs[10], task->thread.acrs[11]); - seq_printf(m, " %08x %08x %08x %08x\n", - task->thread.acrs[12], task->thread.acrs[13], - task->thread.acrs[14], task->thread.acrs[15]); -} - static DEFINE_SPINLOCK(die_lock); void die(const char * str, struct pt_regs * regs, long err) @@ -287,6 +267,7 @@ void die(const char * str, struct pt_reg static int die_counter; oops_enter(); + lgr_info_log(); debug_stop_all(); console_verbose(); spin_lock_irq(&die_lock); @@ -334,10 +315,11 @@ int is_valid_bugaddr(unsigned long addr) return 1; } -static void __kprobes inline do_trap(long interruption_code, int signr, - char *str, struct pt_regs *regs, - siginfo_t *info) +static void __kprobes inline do_trap(struct pt_regs *regs, long int_code, + int si_signo, int si_code, char *str) { + siginfo_t info; + /* * We got all needed information from the lowcore and can * now safely switch on interrupts. @@ -345,16 +327,20 @@ static void __kprobes inline do_trap(lon if (regs->psw.mask & PSW_MASK_PSTATE) local_irq_enable(); - if (notify_die(DIE_TRAP, str, regs, interruption_code, - interruption_code, signr) == NOTIFY_STOP) + if (notify_die(DIE_TRAP, str, regs, 0, + int_code, si_signo) == NOTIFY_STOP) return; if (regs->psw.mask & PSW_MASK_PSTATE) { struct task_struct *tsk = current; - tsk->thread.trap_no = interruption_code & 0xffff; - force_sig_info(signr, info, tsk); - report_user_fault(interruption_code, regs); + tsk->thread.trap_no = int_code & 0xffff; + info.si_signo = si_signo; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = get_trap_ip(regs, int_code); + force_sig_info(si_signo, &info, tsk); + report_user_fault(int_code, regs); } else { const struct exception_table_entry *fixup; fixup = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN); @@ -366,23 +352,18 @@ static void __kprobes inline do_trap(lon btt = report_bug(regs->psw.addr & PSW_ADDR_INSN, regs); if (btt == BUG_TRAP_TYPE_WARN) return; - die(str, regs, interruption_code); + die(str, regs, int_code); } } } -static inline void __user *get_check_address(struct pt_regs *regs) -{ - return (void __user *)((regs->psw.addr-S390_lowcore.pgm_ilc) & PSW_ADDR_INSN); -} - void __kprobes do_single_step(struct pt_regs *regs) { if (notify_die(DIE_SSTEP, "sstep", regs, 0, 0, SIGTRAP) == NOTIFY_STOP){ return; } - if ((current->ptrace & PT_PTRACED) != 0) + if (tracehook_consider_fatal_signal(current, SIGTRAP)) force_sig(SIGTRAP, current); } @@ -396,81 +377,76 @@ static void default_trap_handler(struct die("Unknown program exception", regs, interruption_code); } -#define DO_ERROR_INFO(signr, str, name, sicode, siaddr) \ +#define DO_ERROR_INFO(name, signr, sicode, str) \ static void name(struct pt_regs * regs, long interruption_code) \ { \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = siaddr; \ - do_trap(interruption_code, signr, str, regs, &info); \ -} - -DO_ERROR_INFO(SIGILL, "addressing exception", addressing_exception, - ILL_ILLADR, get_check_address(regs)) -DO_ERROR_INFO(SIGILL, "execute exception", execute_exception, - ILL_ILLOPN, get_check_address(regs)) -DO_ERROR_INFO(SIGFPE, "fixpoint divide exception", divide_exception, - FPE_INTDIV, get_check_address(regs)) -DO_ERROR_INFO(SIGFPE, "fixpoint overflow exception", overflow_exception, - FPE_INTOVF, get_check_address(regs)) -DO_ERROR_INFO(SIGFPE, "HFP overflow exception", hfp_overflow_exception, - FPE_FLTOVF, get_check_address(regs)) -DO_ERROR_INFO(SIGFPE, "HFP underflow exception", hfp_underflow_exception, - FPE_FLTUND, get_check_address(regs)) -DO_ERROR_INFO(SIGFPE, "HFP significance exception", hfp_significance_exception, - FPE_FLTRES, get_check_address(regs)) -DO_ERROR_INFO(SIGFPE, "HFP divide exception", hfp_divide_exception, - FPE_FLTDIV, get_check_address(regs)) -DO_ERROR_INFO(SIGFPE, "HFP square root exception", hfp_sqrt_exception, - FPE_FLTINV, get_check_address(regs)) -DO_ERROR_INFO(SIGILL, "operand exception", operand_exception, - ILL_ILLOPN, get_check_address(regs)) -DO_ERROR_INFO(SIGILL, "privileged operation", privileged_op, - ILL_PRVOPC, get_check_address(regs)) -DO_ERROR_INFO(SIGILL, "special operation exception", special_op_exception, - ILL_ILLOPN, get_check_address(regs)) -DO_ERROR_INFO(SIGILL, "translation exception", translation_exception, - ILL_ILLOPN, get_check_address(regs)) + do_trap(regs, interruption_code, signr, sicode, str); \ +} + +DO_ERROR_INFO(addressing_exception, SIGILL, ILL_ILLADR, + "addressing exception") +DO_ERROR_INFO(execute_exception, SIGILL, ILL_ILLOPN, + "execute exception") +DO_ERROR_INFO(divide_exception, SIGFPE, FPE_INTDIV, + "fixpoint divide exception") +DO_ERROR_INFO(overflow_exception, SIGFPE, FPE_INTOVF, + "fixpoint overflow exception") +DO_ERROR_INFO(hfp_overflow_exception, SIGFPE, FPE_FLTOVF, + "HFP overflow exception") +DO_ERROR_INFO(hfp_underflow_exception, SIGFPE, FPE_FLTUND, + "HFP underflow exception") +DO_ERROR_INFO(hfp_significance_exception, SIGFPE, FPE_FLTRES, + "HFP significance exception") +DO_ERROR_INFO(hfp_divide_exception, SIGFPE, FPE_FLTDIV, + "HFP divide exception") +DO_ERROR_INFO(hfp_sqrt_exception, SIGFPE, FPE_FLTINV, + "HFP square root exception") +DO_ERROR_INFO(operand_exception, SIGILL, ILL_ILLOPN, + "operand exception") +DO_ERROR_INFO(privileged_op, SIGILL, ILL_PRVOPC, + "privileged operation") +DO_ERROR_INFO(special_op_exception, SIGILL, ILL_ILLOPN, + "special operation exception") +DO_ERROR_INFO(translation_exception, SIGILL, ILL_ILLOPN, + "translation exception") + +#ifdef CONFIG_64BIT +DO_ERROR_INFO(transaction_exception, SIGILL, ILL_ILLOPN, + "transaction constraint exception") +#endif static inline void do_fp_trap(struct pt_regs *regs, void __user *location, int fpc, long interruption_code) { - siginfo_t si; + int si_code = 0; - si.si_signo = SIGFPE; - si.si_errno = 0; - si.si_addr = location; - si.si_code = 0; /* FPC[2] is Data Exception Code */ if ((fpc & 0x00000300) == 0) { /* bits 6 and 7 of DXC are 0 iff IEEE exception */ if (fpc & 0x8000) /* invalid fp operation */ - si.si_code = FPE_FLTINV; + si_code = FPE_FLTINV; else if (fpc & 0x4000) /* div by 0 */ - si.si_code = FPE_FLTDIV; + si_code = FPE_FLTDIV; else if (fpc & 0x2000) /* overflow */ - si.si_code = FPE_FLTOVF; + si_code = FPE_FLTOVF; else if (fpc & 0x1000) /* underflow */ - si.si_code = FPE_FLTUND; + si_code = FPE_FLTUND; else if (fpc & 0x0800) /* inexact */ - si.si_code = FPE_FLTRES; + si_code = FPE_FLTRES; } current->thread.ieee_instruction_pointer = (addr_t) location; - do_trap(interruption_code, SIGFPE, - "floating point exception", regs, &si); + do_trap(regs, interruption_code, SIGFPE, si_code, + "floating point exception"); } static void illegal_op(struct pt_regs * regs, long interruption_code) { - siginfo_t info; __u8 opcode[6]; __u16 __user *location; int signal = 0; - location = get_check_address(regs); + location = get_trap_ip(regs, interruption_code); /* * We got all needed information from the lowcore and can @@ -483,7 +459,7 @@ static void illegal_op(struct pt_regs * if (get_user(*((__u16 *) opcode), (__u16 __user *) location)) return; if (*((__u16 *) opcode) == S390_BREAKPOINT_U16) { - if (current->ptrace & PT_PTRACED) + if (tracehook_consider_fatal_signal(current, SIGTRAP)) force_sig(SIGTRAP, current); else signal = SIGILL; @@ -526,23 +502,14 @@ static void illegal_op(struct pt_regs * if (signal == SIGFPE) do_fp_trap(regs, location, current->thread.fp_regs.fpc, interruption_code); - else if (signal == SIGSEGV) { - info.si_signo = signal; - info.si_errno = 0; - info.si_code = SEGV_MAPERR; - info.si_addr = (void __user *) location; - do_trap(interruption_code, signal, - "user address fault", regs, &info); - } else + else if (signal == SIGSEGV) + do_trap(regs, interruption_code, signal, SEGV_MAPERR, + "user address fault"); + else #endif - if (signal) { - info.si_signo = signal; - info.si_errno = 0; - info.si_code = ILL_ILLOPC; - info.si_addr = (void __user *) location; - do_trap(interruption_code, signal, - "illegal operation", regs, &info); - } + if (signal) + do_trap(regs, interruption_code, signal, ILL_ILLOPC, + "illegal operation"); } @@ -554,7 +521,7 @@ specification_exception(struct pt_regs * __u16 __user *location = NULL; int signal = 0; - location = (__u16 __user *) get_check_address(regs); + location = (__u16 __user *) get_trap_ip(regs, interruption_code); /* * We got all needed information from the lowcore and can @@ -598,19 +565,13 @@ specification_exception(struct pt_regs * if (signal == SIGFPE) do_fp_trap(regs, location, current->thread.fp_regs.fpc, interruption_code); - else if (signal) { - siginfo_t info; - info.si_signo = signal; - info.si_errno = 0; - info.si_code = ILL_ILLOPN; - info.si_addr = location; - do_trap(interruption_code, signal, - "specification exception", regs, &info); - } + else if (signal) + do_trap(regs, interruption_code, signal, ILL_ILLOPN, + "specification exception"); } #else -DO_ERROR_INFO(SIGILL, "specification exception", specification_exception, - ILL_ILLOPN, get_check_address(regs)); +DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN, + "specification exception"); #endif static void data_exception(struct pt_regs * regs, long interruption_code) @@ -618,7 +579,7 @@ static void data_exception(struct pt_reg __u16 __user *location; int signal = 0; - location = get_check_address(regs); + location = get_trap_ip(regs, interruption_code); /* * We got all needed information from the lowcore and can @@ -692,30 +653,18 @@ static void data_exception(struct pt_reg if (signal == SIGFPE) do_fp_trap(regs, location, current->thread.fp_regs.fpc, interruption_code); - else if (signal) { - siginfo_t info; - info.si_signo = signal; - info.si_errno = 0; - info.si_code = ILL_ILLOPN; - info.si_addr = location; - do_trap(interruption_code, signal, - "data exception", regs, &info); - } + else if (signal) + do_trap(regs, interruption_code, signal, ILL_ILLOPN, + "data exception"); } static void space_switch_exception(struct pt_regs * regs, long int_code) { - siginfo_t info; - /* Set user psw back to home space mode. */ if (regs->psw.mask & PSW_MASK_PSTATE) regs->psw.mask |= PSW_ASC_HOME; /* Send SIGILL. */ - info.si_signo = SIGILL; - info.si_errno = 0; - info.si_code = ILL_PRVOPC; - info.si_addr = get_check_address(regs); - do_trap(int_code, SIGILL, "space switch event", regs, &info); + do_trap(regs, int_code, SIGILL, ILL_PRVOPC, "space switch event"); } asmlinkage void kernel_stack_overflow(struct pt_regs * regs) @@ -755,7 +704,8 @@ void __init trap_init(void) pgm_check_table[0x12] = &translation_exception; pgm_check_table[0x13] = &special_op_exception; #ifdef CONFIG_64BIT - pgm_check_table[0x38] = &do_asce_exception; + pgm_check_table[0x18] = &transaction_exception; + pgm_check_table[0x38] = &do_dat_exception; pgm_check_table[0x39] = &do_dat_exception; pgm_check_table[0x3A] = &do_dat_exception; pgm_check_table[0x3B] = &do_dat_exception; diff -uprN linux-2.6.32/arch/s390/kernel/vdso32/clock_gettime.S linux-2.6.32.ovz/arch/s390/kernel/vdso32/clock_gettime.S --- linux-2.6.32/arch/s390/kernel/vdso32/clock_gettime.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/vdso32/clock_gettime.S 2015-03-10 13:21:22.000000000 -0700 @@ -38,13 +38,13 @@ __kernel_clock_gettime: sl %r1,__VDSO_XTIME_STAMP+4(%r5) brc 3,2f ahi %r0,-1 -2: mhi %r0,1000 /* cyc2ns(clock,cycle_delta) */ +2: ms %r0,__VDSO_NTP_MULT(%r5) /* cyc2ns(clock,cycle_delta) */ lr %r2,%r0 - lhi %r0,1000 + l %r0,__VDSO_NTP_MULT(%r5) ltr %r1,%r1 mr %r0,%r0 jnm 3f - ahi %r0,1000 + a %r0,__VDSO_NTP_MULT(%r5) 3: alr %r0,%r2 srdl %r0,12 al %r0,__VDSO_XTIME_NSEC(%r5) /* + xtime */ @@ -86,13 +86,13 @@ __kernel_clock_gettime: sl %r1,__VDSO_XTIME_STAMP+4(%r5) brc 3,12f ahi %r0,-1 -12: mhi %r0,1000 /* cyc2ns(clock,cycle_delta) */ +12: ms %r0,__VDSO_NTP_MULT(%r5) /* cyc2ns(clock,cycle_delta) */ lr %r2,%r0 - lhi %r0,1000 + l %r0,__VDSO_NTP_MULT(%r5) ltr %r1,%r1 mr %r0,%r0 jnm 13f - ahi %r0,1000 + a %r0,__VDSO_NTP_MULT(%r5) 13: alr %r0,%r2 srdl %r0,12 al %r0,__VDSO_XTIME_NSEC(%r5) /* + xtime */ diff -uprN linux-2.6.32/arch/s390/kernel/vdso32/gettimeofday.S linux-2.6.32.ovz/arch/s390/kernel/vdso32/gettimeofday.S --- linux-2.6.32/arch/s390/kernel/vdso32/gettimeofday.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/vdso32/gettimeofday.S 2015-03-10 13:21:22.000000000 -0700 @@ -35,13 +35,13 @@ __kernel_gettimeofday: sl %r1,__VDSO_XTIME_STAMP+4(%r5) brc 3,3f ahi %r0,-1 -3: mhi %r0,1000 /* cyc2ns(clock,cycle_delta) */ +3: ms %r0,__VDSO_NTP_MULT(%r5) /* cyc2ns(clock,cycle_delta) */ st %r0,24(%r15) - lhi %r0,1000 + l %r0,__VDSO_NTP_MULT(%r5) ltr %r1,%r1 mr %r0,%r0 jnm 4f - ahi %r0,1000 + a %r0,__VDSO_NTP_MULT(%r5) 4: al %r0,24(%r15) srdl %r0,12 al %r0,__VDSO_XTIME_NSEC(%r5) /* + xtime */ diff -uprN linux-2.6.32/arch/s390/kernel/vdso64/clock_gettime.S linux-2.6.32.ovz/arch/s390/kernel/vdso64/clock_gettime.S --- linux-2.6.32/arch/s390/kernel/vdso64/clock_gettime.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/vdso64/clock_gettime.S 2015-03-10 13:21:22.000000000 -0700 @@ -36,7 +36,7 @@ __kernel_clock_gettime: stck 48(%r15) /* Store TOD clock */ lg %r1,48(%r15) sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ - mghi %r1,1000 + msgf %r1,__VDSO_NTP_MULT(%r5) /* * NTP adjustment */ srlg %r1,%r1,12 /* cyc2ns(clock,cycle_delta) */ alg %r1,__VDSO_XTIME_NSEC(%r5) /* + xtime */ lg %r0,__VDSO_XTIME_SEC(%r5) @@ -64,7 +64,7 @@ __kernel_clock_gettime: stck 48(%r15) /* Store TOD clock */ lg %r1,48(%r15) sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ - mghi %r1,1000 + msgf %r1,__VDSO_NTP_MULT(%r5) /* * NTP adjustment */ srlg %r1,%r1,12 /* cyc2ns(clock,cycle_delta) */ alg %r1,__VDSO_XTIME_NSEC(%r5) /* + xtime */ lg %r0,__VDSO_XTIME_SEC(%r5) diff -uprN linux-2.6.32/arch/s390/kernel/vdso64/gettimeofday.S linux-2.6.32.ovz/arch/s390/kernel/vdso64/gettimeofday.S --- linux-2.6.32/arch/s390/kernel/vdso64/gettimeofday.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/vdso64/gettimeofday.S 2015-03-10 13:21:22.000000000 -0700 @@ -31,7 +31,7 @@ __kernel_gettimeofday: stck 48(%r15) /* Store TOD clock */ lg %r1,48(%r15) sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ - mghi %r1,1000 + msgf %r1,__VDSO_NTP_MULT(%r5) /* * NTP adjustment */ srlg %r1,%r1,12 /* cyc2ns(clock,cycle_delta) */ alg %r1,__VDSO_XTIME_NSEC(%r5) /* + xtime.tv_nsec */ lg %r0,__VDSO_XTIME_SEC(%r5) /* xtime.tv_sec */ diff -uprN linux-2.6.32/arch/s390/kernel/vdso.c linux-2.6.32.ovz/arch/s390/kernel/vdso.c --- linux-2.6.32/arch/s390/kernel/vdso.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/vdso.c 2015-03-10 13:23:27.000000000 -0700 @@ -86,7 +86,8 @@ static void vdso_init_data(struct vdso_d unsigned int facility_list; facility_list = stfl(); - vd->ectg_available = switch_amode && (facility_list & 1); + vd->ectg_available = + user_mode != HOME_SPACE_MODE && (facility_list & 1); } #ifdef CONFIG_64BIT @@ -114,7 +115,7 @@ int vdso_alloc_per_cpu(int cpu, struct _ lowcore->vdso_per_cpu_data = __LC_PASTE; - if (!switch_amode || !vdso_enabled) + if (user_mode == HOME_SPACE_MODE || !vdso_enabled) return 0; segment_table = __get_free_pages(GFP_KERNEL, SEGMENT_ORDER); @@ -160,7 +161,7 @@ void vdso_free_per_cpu(int cpu, struct _ unsigned long segment_table, page_table, page_frame; u32 *psal, *aste; - if (!switch_amode || !vdso_enabled) + if (user_mode == HOME_SPACE_MODE || !vdso_enabled) return; psal = (u32 *)(addr_t) lowcore->paste[4]; @@ -184,7 +185,7 @@ static void __vdso_init_cr5(void *dummy) static void vdso_init_cr5(void) { - if (switch_amode && vdso_enabled) + if (user_mode != HOME_SPACE_MODE && vdso_enabled) on_each_cpu(__vdso_init_cr5, NULL, 1); } #endif /* CONFIG_64BIT */ @@ -209,7 +210,6 @@ int arch_setup_additional_pages(struct l if (!uses_interp) return 0; - vdso_base = mm->mmap_base; #ifdef CONFIG_64BIT vdso_pagelist = vdso64_pagelist; vdso_pages = vdso64_pages; @@ -239,8 +239,7 @@ int arch_setup_additional_pages(struct l * fail and end up putting it elsewhere. */ down_write(&mm->mmap_sem); - vdso_base = get_unmapped_area(NULL, vdso_base, - vdso_pages << PAGE_SHIFT, 0, 0); + vdso_base = get_unmapped_area(NULL, 0, vdso_pages << PAGE_SHIFT, 0, 0); if (IS_ERR_VALUE(vdso_base)) { rc = vdso_base; goto out_up; @@ -261,17 +260,11 @@ int arch_setup_additional_pages(struct l * on the "data" page of the vDSO or you'll stop getting kernel * updates and your nice userland gettimeofday will be totally dead. * It's fine to use that for setting breakpoints in the vDSO code - * pages though - * - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. + * pages though. */ rc = install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdso_pagelist); if (rc) current->mm->context.vdso_base = 0; @@ -345,17 +338,17 @@ static int __init vdso_init(void) } arch_initcall(vdso_init); -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return 0; } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area(struct mm_struct *mm, unsigned long addr) { return 0; } -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { return NULL; } diff -uprN linux-2.6.32/arch/s390/kernel/vtime.c linux-2.6.32.ovz/arch/s390/kernel/vtime.c --- linux-2.6.32/arch/s390/kernel/vtime.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kernel/vtime.c 2015-03-10 13:22:07.000000000 -0700 @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -167,6 +168,8 @@ void vtime_stop_cpu(void) /* Wait for external, I/O or machine check interrupt. */ psw.mask = psw_kernel_bits | PSW_MASK_WAIT | PSW_MASK_IO | PSW_MASK_EXT; + idle->nohz_delay = 0; + /* Check if the CPU timer needs to be reprogrammed. */ if (vq->do_spt) { __u64 vmax = VTIMER_MAX_SLICE; @@ -560,6 +563,23 @@ void init_cpu_vtimer(void) __ctl_set_bit(0,10); } +static int __cpuinit s390_nohz_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + struct s390_idle_data *idle; + long cpu = (long) hcpu; + + idle = &per_cpu(s390_idle, cpu); + switch (action) { + case CPU_DYING: + case CPU_DYING_FROZEN: + idle->nohz_delay = 0; + default: + break; + } + return NOTIFY_OK; +} + void __init vtime_init(void) { /* request the cpu timer external interrupt */ @@ -568,5 +588,6 @@ void __init vtime_init(void) /* Enable cpu timer interrupts on the boot cpu. */ init_cpu_vtimer(); + cpu_notifier(s390_nohz_notify, 0); } diff -uprN linux-2.6.32/arch/s390/kvm/diag.c linux-2.6.32.ovz/arch/s390/kvm/diag.c --- linux-2.6.32/arch/s390/kvm/diag.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kvm/diag.c 2015-03-10 13:24:00.000000000 -0700 @@ -25,6 +25,29 @@ static int __diag_time_slice_end(struct return 0; } +static int __diag_time_slice_end_directed(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_vcpu *tcpu; + int tid; + int i; + + tid = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; + vcpu->stat.diagnose_9c++; + VCPU_EVENT(vcpu, 5, "diag time slice end directed to %d", tid); + + if (tid == vcpu->vcpu_id) + return 0; + + kvm_for_each_vcpu(i, tcpu, kvm) + if (tcpu->vcpu_id == tid) { + kvm_vcpu_yield_to(tcpu); + break; + } + + return 0; +} + static int __diag_ipl_functions(struct kvm_vcpu *vcpu) { unsigned int reg = vcpu->arch.sie_block->ipa & 0xf; @@ -59,6 +82,8 @@ int kvm_s390_handle_diag(struct kvm_vcpu switch (code) { case 0x44: return __diag_time_slice_end(vcpu); + case 0x9c: + return __diag_time_slice_end_directed(vcpu); case 0x308: return __diag_ipl_functions(vcpu); default: diff -uprN linux-2.6.32/arch/s390/kvm/interrupt.c linux-2.6.32.ovz/arch/s390/kvm/interrupt.c --- linux-2.6.32/arch/s390/kvm/interrupt.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kvm/interrupt.c 2015-03-10 13:23:55.000000000 -0700 @@ -357,7 +357,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu return 0; } - sltime = ((vcpu->arch.sie_block->ckc - now)*125)>>9; + sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL); VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime); diff -uprN linux-2.6.32/arch/s390/kvm/Kconfig linux-2.6.32.ovz/arch/s390/kvm/Kconfig --- linux-2.6.32/arch/s390/kvm/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kvm/Kconfig 2015-03-10 13:24:00.000000000 -0700 @@ -20,7 +20,7 @@ config KVM depends on HAVE_KVM && EXPERIMENTAL select PREEMPT_NOTIFIERS select ANON_INODES - select S390_SWITCH_AMODE + select HAVE_KVM_CPU_RELAX_INTERCEPT ---help--- Support hosting paravirtualized guest machines using the SIE virtualization capability on the mainframe. This should work @@ -34,8 +34,20 @@ config KVM If unsure, say N. +config KVM_AWARE_CMF + depends on KVM + bool "KVM aware sampling" + ---help--- + This option enhances the sampling data from the CPU Measurement + Facility with additional information, that allows to distinguish + guest(s) and host when using the kernel based virtual machine + functionality. + + If unsure, say N. + # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. +source drivers/vhost/Kconfig source drivers/virtio/Kconfig endif # VIRTUALIZATION diff -uprN linux-2.6.32/arch/s390/kvm/kvm-s390.c linux-2.6.32.ovz/arch/s390/kvm/kvm-s390.c --- linux-2.6.32/arch/s390/kvm/kvm-s390.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kvm/kvm-s390.c 2015-03-10 13:24:04.000000000 -0700 @@ -68,15 +68,17 @@ struct kvm_stats_debugfs_item debugfs_en { "instruction_sigp_set_prefix", VCPU_STAT(instruction_sigp_prefix) }, { "instruction_sigp_restart", VCPU_STAT(instruction_sigp_restart) }, { "diagnose_44", VCPU_STAT(diagnose_44) }, + { "diagnose_9c", VCPU_STAT(diagnose_9c) }, { NULL } }; static unsigned long long *facilities; /* Section: not file related */ -void kvm_arch_hardware_enable(void *garbage) +int kvm_arch_hardware_enable(void *garbage) { /* every s390 is virtualization enabled ;-) */ + return 0; } void kvm_arch_hardware_disable(void *garbage) @@ -116,10 +118,16 @@ long kvm_arch_dev_ioctl(struct file *fil int kvm_dev_ioctl_check_extension(long ext) { + int r; + switch (ext) { + case KVM_CAP_S390_PSW: + r = 1; + break; default: - return 0; + r = 0; } + return r; } /* Section: vm related */ @@ -234,6 +242,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm kvm_free_physmem(kvm); free_page((unsigned long)(kvm->arch.sca)); debug_unregister(kvm->arch.dbf); + cleanup_srcu_struct(&kvm->srcu); kfree(kvm); } @@ -250,18 +259,22 @@ void kvm_arch_vcpu_uninit(struct kvm_vcp void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - save_fp_regs(&vcpu->arch.host_fpregs); + save_fp_ctl(&vcpu->arch.host_fpregs.fpc); + save_fp_regs(vcpu->arch.host_fpregs.fprs); save_access_regs(vcpu->arch.host_acrs); vcpu->arch.guest_fpregs.fpc &= FPC_VALID_MASK; - restore_fp_regs(&vcpu->arch.guest_fpregs); + restore_fp_ctl(&vcpu->arch.guest_fpregs.fpc); + restore_fp_regs(vcpu->arch.guest_fpregs.fprs); restore_access_regs(vcpu->arch.guest_acrs); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - save_fp_regs(&vcpu->arch.guest_fpregs); + save_fp_ctl(&vcpu->arch.guest_fpregs.fpc); + save_fp_regs(vcpu->arch.guest_fpregs.fprs); save_access_regs(vcpu->arch.guest_acrs); - restore_fp_regs(&vcpu->arch.host_fpregs); + restore_fp_ctl(&vcpu->arch.host_fpregs.fpc); + restore_fp_regs(vcpu->arch.host_fpregs.fprs); restore_access_regs(vcpu->arch.host_acrs); } @@ -419,8 +432,10 @@ static int kvm_arch_vcpu_ioctl_set_initi vcpu_load(vcpu); if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING) rc = -EBUSY; - else - vcpu->arch.sie_block->gpsw = psw; + else { + vcpu->run->psw_mask = psw.mask; + vcpu->run->psw_addr = psw.addr; + } vcpu_put(vcpu); return rc; } @@ -508,9 +523,6 @@ rerun_vcpu: switch (kvm_run->exit_reason) { case KVM_EXIT_S390_SIEIC: - vcpu->arch.sie_block->gpsw.mask = kvm_run->s390_sieic.mask; - vcpu->arch.sie_block->gpsw.addr = kvm_run->s390_sieic.addr; - break; case KVM_EXIT_UNKNOWN: case KVM_EXIT_INTR: case KVM_EXIT_S390_RESET: @@ -519,6 +531,9 @@ rerun_vcpu: BUG(); } + vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask; + vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr; + might_fault(); do { @@ -538,8 +553,6 @@ rerun_vcpu: /* intercept cannot be handled in-kernel, prepare kvm-run */ kvm_run->exit_reason = KVM_EXIT_S390_SIEIC; kvm_run->s390_sieic.icptcode = vcpu->arch.sie_block->icptcode; - kvm_run->s390_sieic.mask = vcpu->arch.sie_block->gpsw.mask; - kvm_run->s390_sieic.addr = vcpu->arch.sie_block->gpsw.addr; kvm_run->s390_sieic.ipa = vcpu->arch.sie_block->ipa; kvm_run->s390_sieic.ipb = vcpu->arch.sie_block->ipb; rc = 0; @@ -551,6 +564,9 @@ rerun_vcpu: rc = 0; } + kvm_run->psw_mask = vcpu->arch.sie_block->gpsw.mask; + kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr; + if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); @@ -679,14 +695,12 @@ long kvm_arch_vcpu_ioctl(struct file *fi } /* Section: memory related */ -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - int user_alloc) +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, + int user_alloc) { - int i; - struct kvm_vcpu *vcpu; - /* A few sanity checks. We can have exactly one memory slot which has to start at guest virtual zero and which has to be located at a page boundary in userland and which has to end at a page boundary. @@ -709,14 +723,23 @@ int kvm_arch_set_memory_region(struct kv if (!user_alloc) return -EINVAL; + return 0; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + int i; + struct kvm_vcpu *vcpu; + /* request update of sie control block for all available vcpus */ kvm_for_each_vcpu(i, vcpu, kvm) { if (test_and_set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) continue; kvm_s390_inject_sigp_stop(vcpu, ACTION_RELOADVCPU_ON_STOP); } - - return 0; } void kvm_arch_flush_shadow(struct kvm *kvm) @@ -731,7 +754,7 @@ gfn_t unalias_gfn(struct kvm *kvm, gfn_t static int __init kvm_s390_init(void) { int ret; - ret = kvm_init(NULL, sizeof(struct kvm_vcpu), THIS_MODULE); + ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); if (ret) return ret; diff -uprN linux-2.6.32/arch/s390/kvm/kvm-s390.h linux-2.6.32.ovz/arch/s390/kvm/kvm-s390.h --- linux-2.6.32/arch/s390/kvm/kvm-s390.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kvm/kvm-s390.h 2015-03-10 13:21:08.000000000 -0700 @@ -67,10 +67,14 @@ static inline long kvm_s390_vcpu_get_mem static inline void kvm_s390_vcpu_set_mem(struct kvm_vcpu *vcpu) { + int idx; struct kvm_memory_slot *mem; + struct kvm_memslots *memslots; - down_read(&vcpu->kvm->slots_lock); - mem = &vcpu->kvm->memslots[0]; + idx = srcu_read_lock(&vcpu->kvm->srcu); + memslots = rcu_dereference(vcpu->kvm->memslots); + + mem = &memslots->memslots[0]; vcpu->arch.sie_block->gmsor = mem->userspace_addr; vcpu->arch.sie_block->gmslm = @@ -78,7 +82,7 @@ static inline void kvm_s390_vcpu_set_mem (mem->npages << PAGE_SHIFT) + VIRTIODESCSPACE - 1ul; - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); } /* implemented in priv.c */ diff -uprN linux-2.6.32/arch/s390/kvm/sie64a.S linux-2.6.32.ovz/arch/s390/kvm/sie64a.S --- linux-2.6.32/arch/s390/kvm/sie64a.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kvm/sie64a.S 2015-03-10 13:21:51.000000000 -0700 @@ -1,20 +1,60 @@ /* * sie64a.S - low level sie call * - * Copyright IBM Corp. 2008 + * Copyright IBM Corp. 2008,2010 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License (version 2 only) * as published by the Free Software Foundation. * * Author(s): Heiko Carstens + * Christian Ehrhardt */ #include #include +#include +#include +#include +#include + +_TIF_EXIT_SIE = (_TIF_SIGPENDING | _TIF_NEED_RESCHED | _TIF_MCCK_PENDING) + +/* + * offsets into stackframe + * SP_ = offsets into stack sie64 is called with + * SPI_ = offsets into irq stack + */ +SP_GREGS = __SF_EMPTY +SP_HOOK = __SF_EMPTY+8 +SP_GPP = __SF_EMPTY+16 +SPI_PSW = STACK_FRAME_OVERHEAD + __PT_PSW + -SP_R5 = 5 * 8 # offset into stackframe -SP_R6 = 6 * 8 + .macro SPP newpp +#ifdef CONFIG_KVM_AWARE_CMF + tm __LC_MACHINE_FLAGS+6,0x20 # MACHINE_FLAG_SPP + jz 0f + .insn s,0xb2800000,\newpp + 0: +#endif + .endm + +sie_irq_handler: + SPP __LC_CMF_HPP # set host id + larl %r2,sie_inst + clg %r2,SPI_PSW+8(0,%r15) # intercepted sie + jne 1f + xc __LC_SIE_HOOK(8),__LC_SIE_HOOK + lg %r2,__LC_THREAD_INFO # pointer thread_info struct + tm __TI_flags+7(%r2),_TIF_EXIT_SIE + jz 0f + larl %r2,sie_exit # work pending, leave sie + stg %r2,__LC_RETURN_PSW+8 + br %r14 +0: larl %r2,sie_reenter # re-enter with guest id + stg %r2,__LC_RETURN_PSW+8 +1: br %r14 /* * sie64a calling convention: @@ -23,23 +63,34 @@ SP_R6 = 6 * 8 */ .globl sie64a sie64a: - lgr %r5,%r3 - stmg %r5,%r14,SP_R5(%r15) # save register on entry - lgr %r14,%r2 # pointer to sie control block - lmg %r0,%r13,0(%r3) # load guest gprs 0-13 + stg %r3,SP_GREGS(%r15) # save guest register save area + stmg %r6,%r14,__SF_GPRS(%r15) # save registers on entry + lgr %r14,%r2 # pointer to sie control block + larl %r5,sie_irq_handler + stg %r2,SP_GPP(%r15) + stg %r5,SP_HOOK(%r15) # save hook target + lmg %r0,%r13,0(%r3) # load guest gprs 0-13 +sie_reenter: + mvc __LC_SIE_HOOK(8),SP_HOOK(%r15) + SPP SP_GPP(%r15) # set guest id sie_inst: sie 0(%r14) - lg %r14,SP_R5(%r15) - stmg %r0,%r13,0(%r14) # save guest gprs 0-13 + xc __LC_SIE_HOOK(8),__LC_SIE_HOOK + SPP __LC_CMF_HPP # set host id +sie_exit: + lg %r14,SP_GREGS(%r15) + stmg %r0,%r13,0(%r14) # save guest gprs 0-13 lghi %r2,0 - lmg %r6,%r14,SP_R6(%r15) + lmg %r6,%r14,__SF_GPRS(%r15) br %r14 sie_err: - lg %r14,SP_R5(%r15) - stmg %r0,%r13,0(%r14) # save guest gprs 0-13 + xc __LC_SIE_HOOK(8),__LC_SIE_HOOK + SPP __LC_CMF_HPP # set host id + lg %r14,SP_GREGS(%r15) + stmg %r0,%r13,0(%r14) # save guest gprs 0-13 lghi %r2,-EFAULT - lmg %r6,%r14,SP_R6(%r15) + lmg %r6,%r14,__SF_GPRS(%r15) br %r14 .section __ex_table,"a" diff -uprN linux-2.6.32/arch/s390/kvm/sigp.c linux-2.6.32.ovz/arch/s390/kvm/sigp.c --- linux-2.6.32/arch/s390/kvm/sigp.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/kvm/sigp.c 2015-03-10 13:21:03.000000000 -0700 @@ -188,9 +188,9 @@ static int __sigp_set_prefix(struct kvm_ /* make sure that the new value is valid memory */ address = address & 0x7fffe000u; - if ((copy_from_guest(vcpu, &tmp, - (u64) (address + vcpu->arch.sie_block->gmsor) , 1)) || - (copy_from_guest(vcpu, &tmp, (u64) (address + + if ((copy_from_user(&tmp, (void __user *) + (address + vcpu->arch.sie_block->gmsor) , 1)) || + (copy_from_user(&tmp, (void __user *)(address + vcpu->arch.sie_block->gmsor + PAGE_SIZE), 1))) { *reg |= SIGP_STAT_INVALID_PARAMETER; return 1; /* invalid parameter */ diff -uprN linux-2.6.32/arch/s390/lib/delay.c linux-2.6.32.ovz/arch/s390/lib/delay.c --- linux-2.6.32/arch/s390/lib/delay.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/lib/delay.c 2015-03-10 13:22:07.000000000 -0700 @@ -29,17 +29,21 @@ static void __udelay_disabled(unsigned l { unsigned long mask, cr0, cr0_saved; u64 clock_saved; + u64 end; + mask = psw_kernel_bits | PSW_MASK_WAIT | PSW_MASK_EXT; + end = get_clock() + (usecs << 12); clock_saved = local_tick_disable(); - set_clock_comparator(get_clock() + (usecs << 12)); __ctl_store(cr0_saved, 0, 0); cr0 = (cr0_saved & 0xffff00e0) | 0x00000800; __ctl_load(cr0 , 0, 0); - mask = psw_kernel_bits | PSW_MASK_WAIT | PSW_MASK_EXT; lockdep_off(); - trace_hardirqs_on(); - __load_psw_mask(mask); - local_irq_disable(); + do { + set_clock_comparator(end); + trace_hardirqs_on(); + __load_psw_mask(mask); + local_irq_disable(); + } while (get_clock() < end); lockdep_on(); __ctl_load(cr0_saved, 0, 0); local_tick_enable(clock_saved); diff -uprN linux-2.6.32/arch/s390/lib/spinlock.c linux-2.6.32.ovz/arch/s390/lib/spinlock.c --- linux-2.6.32/arch/s390/lib/spinlock.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/lib/spinlock.c 2015-03-10 13:22:07.000000000 -0700 @@ -34,7 +34,7 @@ static inline void _raw_yield_cpu(int cp { if (MACHINE_HAS_DIAG9C) asm volatile("diag %0,0,0x9c" - : : "d" (__cpu_logical_map[cpu])); + : : "d" (cpu_logical_map(cpu))); else _raw_yield(); } diff -uprN linux-2.6.32/arch/s390/lib/uaccess_mvcos.c linux-2.6.32.ovz/arch/s390/lib/uaccess_mvcos.c --- linux-2.6.32/arch/s390/lib/uaccess_mvcos.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/lib/uaccess_mvcos.c 2015-03-10 13:24:09.000000000 -0700 @@ -162,7 +162,6 @@ static size_t clear_user_mvcos(size_t si return size; } -#ifdef CONFIG_S390_SWITCH_AMODE static size_t strnlen_user_mvcos(size_t count, const char __user *src) { char buf[256]; @@ -200,7 +199,77 @@ static size_t strncpy_from_user_mvcos(si } while ((len_str == len) && (done < count)); return done; } -#endif /* CONFIG_S390_SWITCH_AMODE */ + +#define __futex_atomic_op(insn, ret, oldval, newval, uaddr, oparg) \ + asm volatile( \ + " sacf 256\n" \ + "0: l %1,0(%6)\n" \ + "1:"insn \ + "2: cs %1,%2,0(%6)\n" \ + "3: jl 1b\n" \ + " lhi %0,0\n" \ + "4: sacf 768\n" \ + EX_TABLE(0b,4b) EX_TABLE(2b,4b) EX_TABLE(3b,4b) \ + : "=d" (ret), "=&d" (oldval), "=&d" (newval), \ + "=m" (*uaddr) \ + : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ + "m" (*uaddr) : "cc"); + +static int futex_atomic_op_mvcos(int op, int __user *uaddr, int oparg, int *old) +{ + int oldval = 0, newval, ret; + unsigned long asce; + + __ctl_store(asce, 1, 1); + __ctl_load(S390_lowcore.kernel_asce, 1, 1); + switch (op) { + case FUTEX_OP_SET: + __futex_atomic_op("lr %2,%5\n", + ret, oldval, newval, uaddr, oparg); + break; + case FUTEX_OP_ADD: + __futex_atomic_op("lr %2,%1\nar %2,%5\n", + ret, oldval, newval, uaddr, oparg); + break; + case FUTEX_OP_OR: + __futex_atomic_op("lr %2,%1\nor %2,%5\n", + ret, oldval, newval, uaddr, oparg); + break; + case FUTEX_OP_ANDN: + __futex_atomic_op("lr %2,%1\nnr %2,%5\n", + ret, oldval, newval, uaddr, oparg); + break; + case FUTEX_OP_XOR: + __futex_atomic_op("lr %2,%1\nxr %2,%5\n", + ret, oldval, newval, uaddr, oparg); + break; + default: + ret = -ENOSYS; + } + *old = oldval; + __ctl_load(asce, 1, 1); + return ret; +} + +static int futex_atomic_cmpxchg_mvcos(int __user *uaddr, int oldval, int newval) +{ + unsigned long asce; + int ret; + + __ctl_store(asce, 1, 1); + __ctl_load(S390_lowcore.kernel_asce, 1, 1); + asm volatile( + " sacf 256\n" + "0: cs %1,%4,0(%5)\n" + "1: lr %0,%1\n" + "2: sacf 768\n" + EX_TABLE(0b,2b) EX_TABLE(1b,2b) + : "=d" (ret), "+d" (oldval), "=m" (*uaddr) + : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr) + : "cc", "memory" ); + __ctl_load(asce, 1, 1); + return ret; +} struct uaccess_ops uaccess_mvcos = { .copy_from_user = copy_from_user_mvcos_check, @@ -215,7 +284,6 @@ struct uaccess_ops uaccess_mvcos = { .futex_atomic_cmpxchg = futex_atomic_cmpxchg_std, }; -#ifdef CONFIG_S390_SWITCH_AMODE struct uaccess_ops uaccess_mvcos_switch = { .copy_from_user = copy_from_user_mvcos, .copy_from_user_small = copy_from_user_mvcos, @@ -225,7 +293,6 @@ struct uaccess_ops uaccess_mvcos_switch .clear_user = clear_user_mvcos, .strnlen_user = strnlen_user_mvcos, .strncpy_from_user = strncpy_from_user_mvcos, - .futex_atomic_op = futex_atomic_op_pt, - .futex_atomic_cmpxchg = futex_atomic_cmpxchg_pt, + .futex_atomic_op = futex_atomic_op_mvcos, + .futex_atomic_cmpxchg = futex_atomic_cmpxchg_mvcos, }; -#endif diff -uprN linux-2.6.32/arch/s390/lib/uaccess_pt.c linux-2.6.32.ovz/arch/s390/lib/uaccess_pt.c --- linux-2.6.32/arch/s390/lib/uaccess_pt.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/lib/uaccess_pt.c 2015-03-10 13:23:43.000000000 -0700 @@ -4,129 +4,82 @@ * User access functions based on page table walks for enhanced * system layout without hardware support. * - * Copyright IBM Corp. 2006 + * Copyright IBM Corp. 2006, 2012 * Author(s): Gerald Schaefer (gerald.schaefer@de.ibm.com) */ #include #include #include +#include #include #include #include "uaccess.h" -static inline pte_t *follow_table(struct mm_struct *mm, unsigned long addr) + +/* + * Returns kernel address for user virtual address. If the returned address is + * >= -4095 (IS_ERR_VALUE(x) returns true), a fault has occured and the address + * contains the (negative) exception code. + */ +static __always_inline unsigned long follow_table(struct mm_struct *mm, + unsigned long addr, int write) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; + pte_t *ptep; pgd = pgd_offset(mm, addr); if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - return NULL; + return -0x3aUL; pud = pud_offset(pgd, addr); if (pud_none(*pud) || unlikely(pud_bad(*pud))) - return NULL; + return -0x3bUL; pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) - return NULL; + if (pmd_none(*pmd)) + return -0x10UL; + if (pmd_huge(*pmd)) { + if (write && (pmd_val(*pmd) & _SEGMENT_ENTRY_RO)) + return -0x04UL; + return (pmd_val(*pmd) & HPAGE_MASK) + (addr & ~HPAGE_MASK); + } + if (unlikely(pmd_bad(*pmd))) + return -0x10UL; + + ptep = pte_offset_map(pmd, addr); + if (!pte_present(*ptep)) + return -0x11UL; + if (write && !pte_write(*ptep)) + return -0x04UL; - return pte_offset_map(pmd, addr); + return (pte_val(*ptep) & PAGE_MASK) + (addr & ~PAGE_MASK); } -static int __handle_fault(struct mm_struct *mm, unsigned long address, - int write_access) -{ - struct vm_area_struct *vma; - int ret = -EFAULT; - int fault; - - if (in_atomic()) - return ret; - down_read(&mm->mmap_sem); - vma = find_vma(mm, address); - if (unlikely(!vma)) - goto out; - if (unlikely(vma->vm_start > address)) { - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto out; - if (expand_stack(vma, address)) - goto out; - } - - if (!write_access) { - /* page not present, check vm flags */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto out; - } else { - if (!(vma->vm_flags & VM_WRITE)) - goto out; - } - -survive: - fault = handle_mm_fault(mm, vma, address, write_access ? FAULT_FLAG_WRITE : 0); - if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) - goto out_of_memory; - else if (fault & VM_FAULT_SIGBUS) - goto out_sigbus; - BUG(); - } - if (fault & VM_FAULT_MAJOR) - current->maj_flt++; - else - current->min_flt++; - ret = 0; -out: - up_read(&mm->mmap_sem); - return ret; - -out_of_memory: - up_read(&mm->mmap_sem); - if (is_global_init(current)) { - yield(); - down_read(&mm->mmap_sem); - goto survive; - } - printk("VM: killing process %s\n", current->comm); - return ret; - -out_sigbus: - up_read(&mm->mmap_sem); - current->thread.prot_addr = address; - current->thread.trap_no = 0x11; - force_sig(SIGBUS, current); - return ret; -} - -static size_t __user_copy_pt(unsigned long uaddr, void *kptr, - size_t n, int write_user) +static __always_inline size_t __user_copy_pt(unsigned long uaddr, void *kptr, + size_t n, int write_user) { struct mm_struct *mm = current->mm; - unsigned long offset, pfn, done, size; - pte_t *pte; + unsigned long offset, done, size, kaddr; void *from, *to; done = 0; retry: spin_lock(&mm->page_table_lock); do { - pte = follow_table(mm, uaddr); - if (!pte || !pte_present(*pte) || - (write_user && !pte_write(*pte))) + kaddr = follow_table(mm, uaddr, write_user); + if (IS_ERR_VALUE(kaddr)) goto fault; - pfn = pte_pfn(*pte); - - offset = uaddr & (PAGE_SIZE - 1); + offset = uaddr & ~PAGE_MASK; size = min(n - done, PAGE_SIZE - offset); if (write_user) { - to = (void *)((pfn << PAGE_SHIFT) + offset); + to = (void *) kaddr; from = kptr + done; } else { - from = (void *)((pfn << PAGE_SHIFT) + offset); + from = (void *) kaddr; to = kptr + done; } memcpy(to, from, size); @@ -137,7 +90,7 @@ retry: return n - done; fault: spin_unlock(&mm->page_table_lock); - if (__handle_fault(mm, uaddr, write_user)) + if (__handle_fault(uaddr, -kaddr, write_user)) return n - done; goto retry; } @@ -146,30 +99,26 @@ fault: * Do DAT for user address by page table walk, return kernel address. * This function needs to be called with current->mm->page_table_lock held. */ -static unsigned long __dat_user_addr(unsigned long uaddr) +static __always_inline unsigned long __dat_user_addr(unsigned long uaddr, + int write) { struct mm_struct *mm = current->mm; - unsigned long pfn, ret; - pte_t *pte; + unsigned long kaddr; int rc; - ret = 0; retry: - pte = follow_table(mm, uaddr); - if (!pte || !pte_present(*pte)) + kaddr = follow_table(mm, uaddr, write); + if (IS_ERR_VALUE(kaddr)) goto fault; - pfn = pte_pfn(*pte); - ret = (pfn << PAGE_SHIFT) + (uaddr & (PAGE_SIZE - 1)); -out: - return ret; + return kaddr; fault: spin_unlock(&mm->page_table_lock); - rc = __handle_fault(mm, uaddr, 0); + rc = __handle_fault(uaddr, -kaddr, write); spin_lock(&mm->page_table_lock); - if (rc) - goto out; - goto retry; + if (!rc) + goto retry; + return 0; } size_t copy_from_user_pt(size_t n, const void __user *from, void *to) @@ -220,11 +169,9 @@ static size_t clear_user_pt(size_t n, vo static size_t strnlen_user_pt(size_t count, const char __user *src) { - char *addr; unsigned long uaddr = (unsigned long) src; struct mm_struct *mm = current->mm; - unsigned long offset, pfn, done, len; - pte_t *pte; + unsigned long offset, done, len, kaddr; size_t len_str; if (segment_eq(get_fs(), KERNEL_DS)) @@ -233,15 +180,13 @@ static size_t strnlen_user_pt(size_t cou retry: spin_lock(&mm->page_table_lock); do { - pte = follow_table(mm, uaddr); - if (!pte || !pte_present(*pte)) + kaddr = follow_table(mm, uaddr, 0); + if (IS_ERR_VALUE(kaddr)) goto fault; - pfn = pte_pfn(*pte); - offset = uaddr & (PAGE_SIZE-1); - addr = (char *)(pfn << PAGE_SHIFT) + offset; + offset = uaddr & ~PAGE_MASK; len = min(count - done, PAGE_SIZE - offset); - len_str = strnlen(addr, len); + len_str = strnlen((char *) kaddr, len); done += len_str; uaddr += len_str; } while ((len_str == len) && (done < count)); @@ -249,9 +194,8 @@ retry: return done + 1; fault: spin_unlock(&mm->page_table_lock); - if (__handle_fault(mm, uaddr, 0)) { + if (__handle_fault(uaddr, -kaddr, 0)) return 0; - } goto retry; } @@ -283,11 +227,10 @@ static size_t copy_in_user_pt(size_t n, const void __user *from) { struct mm_struct *mm = current->mm; - unsigned long offset_from, offset_to, offset_max, pfn_from, pfn_to, - uaddr, done, size; + unsigned long offset_max, uaddr, done, size, error_code; unsigned long uaddr_from = (unsigned long) from; unsigned long uaddr_to = (unsigned long) to; - pte_t *pte_from, *pte_to; + unsigned long kaddr_to, kaddr_from; int write_user; if (segment_eq(get_fs(), KERNEL_DS)) { @@ -298,29 +241,25 @@ static size_t copy_in_user_pt(size_t n, retry: spin_lock(&mm->page_table_lock); do { - pte_from = follow_table(mm, uaddr_from); - if (!pte_from || !pte_present(*pte_from)) { - uaddr = uaddr_from; - write_user = 0; + write_user = 0; + uaddr = uaddr_from; + kaddr_from = follow_table(mm, uaddr_from, 0); + error_code = kaddr_from; + if (IS_ERR_VALUE(error_code)) goto fault; - } - pte_to = follow_table(mm, uaddr_to); - if (!pte_to || !pte_present(*pte_to) || !pte_write(*pte_to)) { - uaddr = uaddr_to; - write_user = 1; + write_user = 1; + uaddr = uaddr_to; + kaddr_to = follow_table(mm, uaddr_to, 1); + error_code = (unsigned long) kaddr_to; + if (IS_ERR_VALUE(error_code)) goto fault; - } - pfn_from = pte_pfn(*pte_from); - pfn_to = pte_pfn(*pte_to); - offset_from = uaddr_from & (PAGE_SIZE-1); - offset_to = uaddr_from & (PAGE_SIZE-1); - offset_max = max(offset_from, offset_to); + offset_max = max(uaddr_from & ~PAGE_MASK, + uaddr_to & ~PAGE_MASK); size = min(n - done, PAGE_SIZE - offset_max); - memcpy((void *)(pfn_to << PAGE_SHIFT) + offset_to, - (void *)(pfn_from << PAGE_SHIFT) + offset_from, size); + memcpy((void *) kaddr_to, (void *) kaddr_from, size); done += size; uaddr_from += size; uaddr_to += size; @@ -329,7 +268,7 @@ retry: return n - done; fault: spin_unlock(&mm->page_table_lock); - if (__handle_fault(mm, uaddr, write_user)) + if (__handle_fault(uaddr, -error_code, write_user)) return n - done; goto retry; } @@ -387,7 +326,7 @@ int futex_atomic_op_pt(int op, int __use if (segment_eq(get_fs(), KERNEL_DS)) return __futex_atomic_op_pt(op, uaddr, oparg, old); spin_lock(¤t->mm->page_table_lock); - uaddr = (int __user *) __dat_user_addr((unsigned long) uaddr); + uaddr = (int __user *) __dat_user_addr((unsigned long) uaddr, 1); if (!uaddr) { spin_unlock(¤t->mm->page_table_lock); return -EFAULT; @@ -420,7 +359,7 @@ int futex_atomic_cmpxchg_pt(int __user * if (segment_eq(get_fs(), KERNEL_DS)) return __futex_atomic_cmpxchg_pt(uaddr, oldval, newval); spin_lock(¤t->mm->page_table_lock); - uaddr = (int __user *) __dat_user_addr((unsigned long) uaddr); + uaddr = (int __user *) __dat_user_addr((unsigned long) uaddr, 1); if (!uaddr) { spin_unlock(¤t->mm->page_table_lock); return -EFAULT; diff -uprN linux-2.6.32/arch/s390/lib/uaccess_std.c linux-2.6.32.ovz/arch/s390/lib/uaccess_std.c --- linux-2.6.32/arch/s390/lib/uaccess_std.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/lib/uaccess_std.c 2015-03-10 13:24:09.000000000 -0700 @@ -71,14 +71,6 @@ size_t copy_from_user_std(size_t size, c return size; } -static size_t copy_from_user_std_check(size_t size, const void __user *ptr, - void *x) -{ - if (size <= 1024) - return copy_from_user_std(size, ptr, x); - return copy_from_user_pt(size, ptr, x); -} - size_t copy_to_user_std(size_t size, void __user *ptr, const void *x) { unsigned long tmp1, tmp2; @@ -111,23 +103,15 @@ size_t copy_to_user_std(size_t size, voi return size; } -static size_t copy_to_user_std_check(size_t size, void __user *ptr, - const void *x) -{ - if (size <= 1024) - return copy_to_user_std(size, ptr, x); - return copy_to_user_pt(size, ptr, x); -} - static size_t copy_in_user_std(size_t size, void __user *to, const void __user *from) { unsigned long tmp1; asm volatile( + " sacf 256\n" " "AHI" %0,-1\n" " jo 5f\n" - " sacf 256\n" " bras %3,3f\n" "0:"AHI" %0,257\n" "1: mvc 0(1,%1),0(%2)\n" @@ -142,9 +126,8 @@ static size_t copy_in_user_std(size_t si "3:"AHI" %0,-256\n" " jnm 2b\n" "4: ex %0,1b-0b(%3)\n" - " sacf 0\n" "5: "SLR" %0,%0\n" - "6:\n" + "6: sacf 0\n" EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b) : "+a" (size), "+a" (to), "+a" (from), "=a" (tmp1) : : "cc", "memory"); @@ -156,9 +139,9 @@ static size_t clear_user_std(size_t size unsigned long tmp1, tmp2; asm volatile( + " sacf 256\n" " "AHI" %0,-1\n" " jo 5f\n" - " sacf 256\n" " bras %3,3f\n" " xc 0(1,%1),0(%1)\n" "0:"AHI" %0,257\n" @@ -178,9 +161,8 @@ static size_t clear_user_std(size_t size "3:"AHI" %0,-256\n" " jnm 2b\n" "4: ex %0,0(%3)\n" - " sacf 0\n" "5: "SLR" %0,%0\n" - "6:\n" + "6: sacf 0\n" EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b) : "+a" (size), "+a" (to), "=a" (tmp1), "=a" (tmp2) : : "cc", "memory"); @@ -306,9 +288,9 @@ int futex_atomic_cmpxchg_std(int __user } struct uaccess_ops uaccess_std = { - .copy_from_user = copy_from_user_std_check, + .copy_from_user = copy_from_user_std, .copy_from_user_small = copy_from_user_std, - .copy_to_user = copy_to_user_std_check, + .copy_to_user = copy_to_user_std, .copy_to_user_small = copy_to_user_std, .copy_in_user = copy_in_user_std, .clear_user = clear_user_std, diff -uprN linux-2.6.32/arch/s390/Makefile linux-2.6.32.ovz/arch/s390/Makefile --- linux-2.6.32/arch/s390/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/Makefile 2015-03-10 13:21:33.000000000 -0700 @@ -110,6 +110,12 @@ image: vmlinux zfcpdump: $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ +vdso_install: +ifeq ($(CONFIG_64BIT),y) + $(Q)$(MAKE) $(build)=arch/$(ARCH)/kernel/vdso64 $@ +endif + $(Q)$(MAKE) $(build)=arch/$(ARCH)/kernel/vdso32 $@ + archclean: $(Q)$(MAKE) $(clean)=$(boot) diff -uprN linux-2.6.32/arch/s390/math-emu/math.c linux-2.6.32.ovz/arch/s390/math-emu/math.c --- linux-2.6.32/arch/s390/math-emu/math.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/math-emu/math.c 2015-03-10 13:24:04.000000000 -0700 @@ -21,6 +21,8 @@ #include #include +#define FPC_VALID_MASK 0xF8F8FF03 + /* * I miss a macro to round a floating point number to the * nearest integer in the same floating point format. diff -uprN linux-2.6.32/arch/s390/mm/cmm.c linux-2.6.32.ovz/arch/s390/mm/cmm.c --- linux-2.6.32/arch/s390/mm/cmm.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/mm/cmm.c 2015-03-10 13:23:04.000000000 -0700 @@ -18,12 +18,16 @@ #include #include #include +#include #include #include #include -static char *sender = "VMRMSVM"; +#ifdef CONFIG_CMM_IUCV +static char *cmm_default_sender = "VMRMSVM"; +#endif +static char *sender; module_param(sender, charp, 0400); MODULE_PARM_DESC(sender, "Guest name that may send SMSG messages (default VMRMSVM)"); @@ -44,14 +48,15 @@ static volatile long cmm_pages_target; static volatile long cmm_timed_pages_target; static long cmm_timeout_pages; static long cmm_timeout_seconds; +static int cmm_suspended; static struct cmm_page_array *cmm_page_list; static struct cmm_page_array *cmm_timed_page_list; static DEFINE_SPINLOCK(cmm_lock); static struct task_struct *cmm_thread_ptr; -static wait_queue_head_t cmm_thread_wait; -static struct timer_list cmm_timer; +static DECLARE_WAIT_QUEUE_HEAD(cmm_thread_wait); +static DEFINE_TIMER(cmm_timer, NULL, 0, 0); static void cmm_timer_fn(unsigned long); static void cmm_set_timer(void); @@ -87,7 +92,7 @@ cmm_alloc_pages(long nr, long *counter, } else free_page((unsigned long) npa); } - diag10(addr); + diag10_range(addr >> PAGE_SHIFT, 1); pa->pages[pa->index++] = addr; (*counter)++; spin_unlock(&cmm_lock); @@ -147,9 +152,9 @@ cmm_thread(void *dummy) while (1) { rc = wait_event_interruptible(cmm_thread_wait, - (cmm_pages != cmm_pages_target || - cmm_timed_pages != cmm_timed_pages_target || - kthread_should_stop())); + (!cmm_suspended && (cmm_pages != cmm_pages_target || + cmm_timed_pages != cmm_timed_pages_target)) || + kthread_should_stop()); if (kthread_should_stop() || rc == -ERESTARTSYS) { cmm_pages_target = cmm_pages; cmm_timed_pages_target = cmm_timed_pages; @@ -373,7 +378,7 @@ static struct ctl_table cmm_dir_table[] #ifdef CONFIG_CMM_IUCV #define SMSG_PREFIX "CMM" static void -cmm_smsg_target(char *from, char *msg) +cmm_smsg_target(const char *from, char *msg) { long nr, seconds; @@ -411,6 +416,38 @@ cmm_smsg_target(char *from, char *msg) static struct ctl_table_header *cmm_sysctl_header; +static int cmm_suspend(void) +{ + cmm_suspended = 1; + cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); + cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); + return 0; +} + +static int cmm_resume(void) +{ + cmm_suspended = 0; + cmm_kick_thread(); + return 0; +} + +static int cmm_power_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + switch (event) { + case PM_POST_HIBERNATION: + return cmm_resume(); + case PM_HIBERNATION_PREPARE: + return cmm_suspend(); + default: + return NOTIFY_DONE; + } +} + +static struct notifier_block cmm_power_notifier = { + .notifier_call = cmm_power_event, +}; + static int cmm_init (void) { @@ -419,9 +456,18 @@ cmm_init (void) #ifdef CONFIG_CMM_PROC cmm_sysctl_header = register_sysctl_table(cmm_dir_table); if (!cmm_sysctl_header) - goto out; + goto out_sysctl; #endif #ifdef CONFIG_CMM_IUCV + /* convert sender to uppercase characters */ + if (sender) { + int len = strlen(sender); + while (len--) + sender[len] = toupper(sender[len]); + } else { + sender = cmm_default_sender; + } + rc = smsg_register_callback(SMSG_PREFIX, cmm_smsg_target); if (rc < 0) goto out_smsg; @@ -429,17 +475,19 @@ cmm_init (void) rc = register_oom_notifier(&cmm_oom_nb); if (rc < 0) goto out_oom_notify; - init_waitqueue_head(&cmm_thread_wait); - init_timer(&cmm_timer); + rc = register_pm_notifier(&cmm_power_notifier); + if (rc) + goto out_pm; cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); rc = IS_ERR(cmm_thread_ptr) ? PTR_ERR(cmm_thread_ptr) : 0; - if (!rc) - goto out; - /* - * kthread_create failed. undo all the stuff from above again. - */ - unregister_oom_notifier(&cmm_oom_nb); + if (rc) + goto out_kthread; + return 0; +out_kthread: + unregister_pm_notifier(&cmm_power_notifier); +out_pm: + unregister_oom_notifier(&cmm_oom_nb); out_oom_notify: #ifdef CONFIG_CMM_IUCV smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target); @@ -447,24 +495,27 @@ out_smsg: #endif #ifdef CONFIG_CMM_PROC unregister_sysctl_table(cmm_sysctl_header); +out_sysctl: #endif -out: + del_timer_sync(&cmm_timer); return rc; } static void cmm_exit(void) { - kthread_stop(cmm_thread_ptr); - unregister_oom_notifier(&cmm_oom_nb); - cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); - cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); #ifdef CONFIG_CMM_PROC unregister_sysctl_table(cmm_sysctl_header); #endif #ifdef CONFIG_CMM_IUCV smsg_unregister_callback(SMSG_PREFIX, cmm_smsg_target); #endif + unregister_pm_notifier(&cmm_power_notifier); + unregister_oom_notifier(&cmm_oom_nb); + kthread_stop(cmm_thread_ptr); + del_timer_sync(&cmm_timer); + cmm_free_pages(cmm_pages, &cmm_pages, &cmm_page_list); + cmm_free_pages(cmm_timed_pages, &cmm_timed_pages, &cmm_timed_page_list); } module_init(cmm_init); diff -uprN linux-2.6.32/arch/s390/mm/fault.c linux-2.6.32.ovz/arch/s390/mm/fault.c --- linux-2.6.32/arch/s390/mm/fault.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/mm/fault.c 2015-03-10 13:24:15.000000000 -0700 @@ -30,20 +30,20 @@ #include #include #include +#include #include #include #include #include +#include #include "../kernel/entry.h" #ifndef CONFIG_64BIT #define __FAIL_ADDR_MASK 0x7ffff000 -#define __FIXUP_MASK 0x7fffffff #define __SUBCODE_MASK 0x0200 #define __PF_RES_FIELD 0ULL #else /* CONFIG_64BIT */ #define __FAIL_ADDR_MASK -4096L -#define __FIXUP_MASK ~0L #define __SUBCODE_MASK 0x0600 #define __PF_RES_FIELD 0x8000000000000000ULL #endif /* CONFIG_64BIT */ @@ -52,11 +52,15 @@ extern int sysctl_userprocess_debug; #endif -#ifdef CONFIG_KPROBES -static inline int notify_page_fault(struct pt_regs *regs, long err) +#define VM_FAULT_BADCONTEXT 0x010000 +#define VM_FAULT_BADMAP 0x020000 +#define VM_FAULT_BADACCESS 0x040000 + +static inline int notify_page_fault(struct pt_regs *regs) { int ret = 0; +#ifdef CONFIG_KPROBES /* kprobe_running() needs smp_processor_id() */ if (!user_mode(regs)) { preempt_disable(); @@ -64,15 +68,9 @@ static inline int notify_page_fault(stru ret = 1; preempt_enable(); } - +#endif return ret; } -#else -static inline int notify_page_fault(struct pt_regs *regs, long err) -{ - return 0; -} -#endif /* @@ -100,57 +98,50 @@ void bust_spinlocks(int yes) /* * Returns the address space associated with the fault. - * Returns 0 for kernel space, 1 for user space and - * 2 for code execution in user space with noexec=on. + * Returns 0 for kernel space and 1 for user space. */ -static inline int check_space(struct task_struct *tsk) +static inline int user_space_fault(unsigned long trans_exc_code) { /* - * The lowest two bits of S390_lowcore.trans_exc_code - * indicate which paging table was used. + * The lowest two bits of the translation exception + * identification indicate which paging table was used. */ - int desc = S390_lowcore.trans_exc_code & 3; - - if (desc == 3) /* Home Segment Table Descriptor */ - return switch_amode == 0; - if (desc == 2) /* Secondary Segment Table Descriptor */ - return tsk->thread.mm_segment.ar4; -#ifdef CONFIG_S390_SWITCH_AMODE - if (unlikely(desc == 1)) { /* STD determined via access register */ - /* %a0 always indicates primary space. */ - if (S390_lowcore.exc_access_id != 0) { - save_access_regs(tsk->thread.acrs); - /* - * An alet of 0 indicates primary space. - * An alet of 1 indicates secondary space. - * Any other alet values generate an - * alen-translation exception. - */ - if (tsk->thread.acrs[S390_lowcore.exc_access_id]) - return tsk->thread.mm_segment.ar4; - } - } -#endif - /* Primary Segment Table Descriptor */ - return switch_amode << s390_noexec; + trans_exc_code &= 3; + if (trans_exc_code == 2) + /* Access via secondary space, set_fs setting decides */ + return current->thread.mm_segment.ar4; + if (user_mode == HOME_SPACE_MODE) + /* User space if the access has been done via home space. */ + return trans_exc_code == 3; + /* + * If the user space is not the home space the kernel runs in home + * space. Access via secondary space has already been covered, + * access via primary space or access register is from user space + * and access via home space is from the kernel. + */ + return trans_exc_code != 3; } /* * Send SIGSEGV to task. This is an external routine * to keep the stack usage of do_page_fault small. */ -static void do_sigsegv(struct pt_regs *regs, unsigned long error_code, - int si_code, unsigned long address) +static noinline void do_sigsegv(struct pt_regs *regs, long int_code, + int si_code, unsigned long trans_exc_code) { struct siginfo si; + unsigned long address; + address = trans_exc_code & __FAIL_ADDR_MASK; + current->thread.prot_addr = address; + current->thread.trap_no = int_code; #if defined(CONFIG_SYSCTL) || defined(CONFIG_PROCESS_DEBUG) #if defined(CONFIG_SYSCTL) if (sysctl_userprocess_debug) #endif { printk("User process fault: interruption code 0x%lX\n", - error_code); + int_code); printk("failing address: %lX\n", address); show_regs(regs); } @@ -161,13 +152,14 @@ static void do_sigsegv(struct pt_regs *r force_sig_info(SIGSEGV, &si, current); } -static void do_no_context(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static noinline void do_no_context(struct pt_regs *regs, long int_code, + unsigned long trans_exc_code) { const struct exception_table_entry *fixup; + unsigned long address; /* Are we prepared to handle this kernel fault? */ - fixup = search_exception_tables(regs->psw.addr & __FIXUP_MASK); + fixup = search_exception_tables(regs->psw.addr & PSW_ADDR_INSN); if (fixup) { regs->psw.addr = fixup->fixup | PSW_ADDR_AMODE; return; @@ -177,174 +169,196 @@ static void do_no_context(struct pt_regs * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ - if (check_space(current) == 0) + address = trans_exc_code & __FAIL_ADDR_MASK; + if (!user_space_fault(trans_exc_code)) printk(KERN_ALERT "Unable to handle kernel pointer dereference" " at virtual kernel address %p\n", (void *)address); else printk(KERN_ALERT "Unable to handle kernel paging request" " at virtual user address %p\n", (void *)address); - die("Oops", regs, error_code); + die("Oops", regs, int_code); do_exit(SIGKILL); } -static void do_low_address(struct pt_regs *regs, unsigned long error_code) +static noinline void do_low_address(struct pt_regs *regs, long int_code, + unsigned long trans_exc_code) { /* Low-address protection hit in kernel mode means NULL pointer write access in kernel mode. */ if (regs->psw.mask & PSW_MASK_PSTATE) { /* Low-address protection hit in user mode 'cannot happen'. */ - die ("Low-address protection", regs, error_code); + die ("Low-address protection", regs, int_code); do_exit(SIGKILL); } - do_no_context(regs, error_code, 0); + do_no_context(regs, int_code, trans_exc_code); } -static void do_sigbus(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static noinline void do_sigbus(struct pt_regs *regs, long int_code, + unsigned long trans_exc_code) { struct task_struct *tsk = current; - struct mm_struct *mm = tsk->mm; + unsigned long address; + struct siginfo si; - up_read(&mm->mmap_sem); /* * Send a sigbus, regardless of whether we were in kernel * or user mode. */ + address = trans_exc_code & __FAIL_ADDR_MASK; tsk->thread.prot_addr = address; - tsk->thread.trap_no = error_code; - force_sig(SIGBUS, tsk); - - /* Kernel mode? Handle exceptions or die */ - if (!(regs->psw.mask & PSW_MASK_PSTATE)) - do_no_context(regs, error_code, address); + tsk->thread.trap_no = int_code; + si.si_signo = SIGBUS; + si.si_errno = 0; + si.si_code = BUS_ADRERR; + si.si_addr = (void __user *) address; + force_sig_info(SIGBUS, &si, tsk); } #ifdef CONFIG_S390_EXEC_PROTECT -static int signal_return(struct mm_struct *mm, struct pt_regs *regs, - unsigned long address, unsigned long error_code) +static noinline int signal_return(struct pt_regs *regs, long int_code, + unsigned long trans_exc_code) { u16 instruction; int rc; -#ifdef CONFIG_COMPAT - int compat; -#endif - pagefault_disable(); rc = __get_user(instruction, (u16 __user *) regs->psw.addr); - pagefault_enable(); - if (rc) - return -EFAULT; - up_read(&mm->mmap_sem); - clear_tsk_thread_flag(current, TIF_SINGLE_STEP); -#ifdef CONFIG_COMPAT - compat = is_compat_task(); - if (compat && instruction == 0x0a77) - sys32_sigreturn(); - else if (compat && instruction == 0x0aad) - sys32_rt_sigreturn(); - else -#endif - if (instruction == 0x0a77) - sys_sigreturn(); - else if (instruction == 0x0aad) - sys_rt_sigreturn(); - else { - current->thread.prot_addr = address; - current->thread.trap_no = error_code; - do_sigsegv(regs, error_code, SEGV_MAPERR, address); - } + if (!rc && instruction == 0x0a77) { + clear_tsk_thread_flag(current, TIF_SINGLE_STEP); + if (is_compat_task()) + sys32_sigreturn(); + else + sys_sigreturn(); + } else if (!rc && instruction == 0x0aad) { + clear_tsk_thread_flag(current, TIF_SINGLE_STEP); + if (is_compat_task()) + sys32_rt_sigreturn(); + else + sys_rt_sigreturn(); + } else + do_sigsegv(regs, int_code, SEGV_MAPERR, trans_exc_code); return 0; } #endif /* CONFIG_S390_EXEC_PROTECT */ +static noinline void do_fault_error(struct pt_regs *regs, long int_code, + unsigned long trans_exc_code, int fault) +{ + int si_code; + + switch (fault) { + case VM_FAULT_BADACCESS: +#ifdef CONFIG_S390_EXEC_PROTECT + if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY && + (trans_exc_code & 3) == 0) { + signal_return(regs, int_code, trans_exc_code); + break; + } +#endif /* CONFIG_S390_EXEC_PROTECT */ + case VM_FAULT_BADMAP: + /* Bad memory access. Check if it is kernel or user space. */ + if (regs->psw.mask & PSW_MASK_PSTATE) { + /* User mode accesses just cause a SIGSEGV */ + si_code = (fault == VM_FAULT_BADMAP) ? + SEGV_MAPERR : SEGV_ACCERR; + do_sigsegv(regs, int_code, si_code, trans_exc_code); + return; + } + case VM_FAULT_BADCONTEXT: + do_no_context(regs, int_code, trans_exc_code); + break; + default: /* fault & VM_FAULT_ERROR */ + if (fault & VM_FAULT_OOM) { + /* Kernel mode? Handle exceptions or die */ + if (!(regs->psw.mask & PSW_MASK_PSTATE)) + do_no_context(regs, int_code, trans_exc_code); + else + pagefault_out_of_memory(); + } else if (fault & VM_FAULT_SIGBUS) { + /* Kernel mode? Handle exceptions or die */ + if (!(regs->psw.mask & PSW_MASK_PSTATE)) + do_no_context(regs, int_code, trans_exc_code); + else + do_sigbus(regs, int_code, trans_exc_code); + } else + BUG(); + break; + } +} + /* * This routine handles page faults. It determines the address, * and the problem, and then passes it off to one of the appropriate * routines. * - * error_code: + * interruption code (int_code): * 04 Protection -> Write-Protection (suprression) * 10 Segment translation -> Not present (nullification) * 11 Page translation -> Not present (nullification) * 3b Region third trans. -> Not present (nullification) */ -static inline void -do_exception(struct pt_regs *regs, unsigned long error_code, int write) +static inline int do_exception(struct pt_regs *regs, int access, + unsigned long trans_exc_code) { struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; + unsigned int flags = 0; unsigned long address; - int space; - int si_code; int fault; - if (notify_page_fault(regs, error_code)) - return; + if (notify_page_fault(regs)) + return 0; tsk = current; mm = tsk->mm; - /* get the failing address and the affected space */ - address = S390_lowcore.trans_exc_code & __FAIL_ADDR_MASK; - space = check_space(tsk); - /* * Verify that the fault happened in user space, that * we are not in an interrupt and that there is a * user context. */ - if (unlikely(space == 0 || in_atomic() || !mm)) - goto no_context; + fault = VM_FAULT_BADCONTEXT; + if (unlikely(!user_space_fault(trans_exc_code) || in_atomic() || !mm)) + goto out; + flags |= FAULT_FLAG_USER; + + if (access == VM_WRITE) + flags |= FAULT_FLAG_WRITE; + + address = trans_exc_code & __FAIL_ADDR_MASK; /* * When we get here, the fault happened in the current * task's user address space, so we can switch on the * interrupts again and then search the VMAs */ local_irq_enable(); - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); down_read(&mm->mmap_sem); - si_code = SEGV_MAPERR; + fault = VM_FAULT_BADMAP; vma = find_vma(mm, address); if (!vma) - goto bad_area; + goto out_up; -#ifdef CONFIG_S390_EXEC_PROTECT - if (unlikely((space == 2) && !(vma->vm_flags & VM_EXEC))) - if (!signal_return(mm, regs, address, error_code)) - /* - * signal_return() has done an up_read(&mm->mmap_sem) - * if it returns 0. - */ - return; -#endif - - if (vma->vm_start <= address) - goto good_area; - if (!(vma->vm_flags & VM_GROWSDOWN)) - goto bad_area; - if (expand_stack(vma, address)) - goto bad_area; -/* - * Ok, we have a good vm_area for this memory access, so - * we can handle it.. - */ -good_area: - si_code = SEGV_ACCERR; - if (!write) { - /* page not present, check vm flags */ - if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) - goto bad_area; - } else { - if (!(vma->vm_flags & VM_WRITE)) - goto bad_area; + if (unlikely(vma->vm_start > address)) { + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto out_up; + if (expand_stack(vma, address)) + goto out_up; } + /* + * Ok, we have a good vm_area for this memory access, so + * we can handle it.. + */ + fault = VM_FAULT_BADACCESS; + if (unlikely(!(vma->vm_flags & access))) + goto out_up; + if (is_vm_hugetlb_page(vma)) address &= HPAGE_MASK; /* @@ -352,114 +366,95 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault. */ - fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); - if (unlikely(fault & VM_FAULT_ERROR)) { - if (fault & VM_FAULT_OOM) { - up_read(&mm->mmap_sem); - pagefault_out_of_memory(); - return; - } else if (fault & VM_FAULT_SIGBUS) { - do_sigbus(regs, error_code, address); - return; - } - BUG(); - } + fault = handle_mm_fault(mm, vma, address, flags); + if (unlikely(fault & VM_FAULT_ERROR)) + goto out_up; + if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address); } else { tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address); } - up_read(&mm->mmap_sem); /* * The instruction that caused the program check will * be repeated. Don't signal single step via SIGTRAP. */ clear_tsk_thread_flag(tsk, TIF_SINGLE_STEP); - return; - -/* - * Something tried to access memory that isn't in our memory map.. - * Fix it, but check if it's kernel or user first.. - */ -bad_area: + fault = 0; +out_up: up_read(&mm->mmap_sem); - - /* User mode accesses just cause a SIGSEGV */ - if (regs->psw.mask & PSW_MASK_PSTATE) { - tsk->thread.prot_addr = address; - tsk->thread.trap_no = error_code; - do_sigsegv(regs, error_code, si_code, address); - return; - } - -no_context: - do_no_context(regs, error_code, address); +out: + return fault; } -void __kprobes do_protection_exception(struct pt_regs *regs, - long error_code) +void __kprobes do_protection_exception(struct pt_regs *regs, long int_code) { - /* Protection exception is supressing, decrement psw address. */ - regs->psw.addr -= (error_code >> 16); + unsigned long trans_exc_code = S390_lowcore.trans_exc_code; + int fault; + + /* + * Protection exceptions are suppressing, decrement psw address. + * The exception to this rule are aborted transactions, for these + * the PSW already points to the correct location. + */ + if (!(int_code & 0x200)) + regs->psw.addr -= (int_code >> 16); /* * Check for low-address protection. This needs to be treated * as a special case because the translation exception code * field is not guaranteed to contain valid data in this case. */ - if (unlikely(!(S390_lowcore.trans_exc_code & 4))) { - do_low_address(regs, error_code); + if (unlikely(!(trans_exc_code & 4))) { + do_low_address(regs, int_code, trans_exc_code); return; } - do_exception(regs, 4, 1); + fault = do_exception(regs, VM_WRITE, trans_exc_code); + if (unlikely(fault)) + do_fault_error(regs, 4, trans_exc_code, fault); } -void __kprobes do_dat_exception(struct pt_regs *regs, long error_code) +void __kprobes do_dat_exception(struct pt_regs *regs, long int_code) { - do_exception(regs, error_code & 0xff, 0); -} + unsigned long trans_exc_code = S390_lowcore.trans_exc_code; + int access, fault; -#ifdef CONFIG_64BIT -void __kprobes do_asce_exception(struct pt_regs *regs, unsigned long error_code) -{ - struct mm_struct *mm; - struct vm_area_struct *vma; - unsigned long address; - int space; - - mm = current->mm; - address = S390_lowcore.trans_exc_code & __FAIL_ADDR_MASK; - space = check_space(current); - - if (unlikely(space == 0 || in_atomic() || !mm)) - goto no_context; - - local_irq_enable(); - - down_read(&mm->mmap_sem); - vma = find_vma(mm, address); - up_read(&mm->mmap_sem); - - if (vma) { - update_mm(mm, current); - return; - } - - /* User mode accesses just cause a SIGSEGV */ - if (regs->psw.mask & PSW_MASK_PSTATE) { - current->thread.prot_addr = address; - current->thread.trap_no = error_code; - do_sigsegv(regs, error_code, SEGV_MAPERR, address); - return; + access = VM_READ | VM_EXEC | VM_WRITE; +#ifdef CONFIG_S390_EXEC_PROTECT + if ((regs->psw.mask & PSW_MASK_ASC) == PSW_ASC_SECONDARY && + (trans_exc_code & 3) == 0) + access = VM_EXEC; +#endif + fault = do_exception(regs, access, trans_exc_code); + if (unlikely(fault)) + do_fault_error(regs, int_code & 255, trans_exc_code, fault); +} + +int __handle_fault(unsigned long uaddr, unsigned long int_code, int write_user) +{ + struct pt_regs regs; + int access, fault; + + regs.psw.mask = psw_kernel_bits; + if (!irqs_disabled()) + regs.psw.mask |= PSW_MASK_IO | PSW_MASK_EXT; + regs.psw.addr = (unsigned long) __builtin_return_address(0); + regs.psw.addr |= PSW_ADDR_AMODE; + uaddr &= PAGE_MASK; + access = write_user ? VM_WRITE : VM_READ; + fault = do_exception(®s, access, uaddr | 2); + if (unlikely(fault)) { + if (fault & VM_FAULT_OOM) { + pagefault_out_of_memory(); + fault = 0; + } else if (fault & VM_FAULT_SIGBUS) + do_sigbus(®s, int_code, uaddr); } - -no_context: - do_no_context(regs, error_code, address); + return fault ? -EFAULT : 0; } -#endif #ifdef CONFIG_PFAULT /* @@ -490,7 +485,7 @@ typedef struct { int pfault_init(void) { pfault_refbk_t refbk = - { 0x258, 0, 5, 2, __LC_CURRENT, 1ULL << 48, 1ULL << 48, + { 0x258, 0, 5, 2, __LC_CURRENT_PID, 1ULL << 48, 1ULL << 48, __PF_RES_FIELD }; int rc; @@ -522,10 +517,15 @@ void pfault_fini(void) : : "a" (&refbk), "m" (refbk) : "cc"); } -static void pfault_interrupt(__u16 error_code) +static DEFINE_SPINLOCK(pfault_lock); +static LIST_HEAD(pfault_list); + +static void pfault_interrupt(__u16 int_code) { + struct thread_info *info; struct task_struct *tsk; __u16 subcode; + pid_t pid; /* * Get the external interruption subcode & pfault @@ -536,40 +536,91 @@ static void pfault_interrupt(__u16 error subcode = S390_lowcore.cpu_addr; if ((subcode & 0xff00) != __SUBCODE_MASK) return; - - /* - * Get the token (= address of the task structure of the affected task). - */ - tsk = *(struct task_struct **) __LC_PFAULT_INTPARM; - + if (subcode & 0x0080) { + /* Get the token (= pid of the affected task). */ + pid = (pid_t)(*(unsigned long *)__LC_PFAULT_INTPARM); + rcu_read_lock(); + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) + return; + } else { + tsk = current; + } + spin_lock(&pfault_lock); if (subcode & 0x0080) { /* signal bit is set -> a page has been swapped in by VM */ - if (xchg(&tsk->thread.pfault_wait, -1) != 0) { + if (tsk->thread.pfault_wait == 1) { /* Initial interrupt was faster than the completion * interrupt. pfault_wait is valid. Set pfault_wait * back to zero and wake up the process. This can * safely be done because the task is still sleeping * and can't produce new pfaults. */ tsk->thread.pfault_wait = 0; + info = task_thread_info(tsk); + list_del(&info->list); wake_up_process(tsk); put_task_struct(tsk); + } else { + /* Completion interrupt was faster than initial + * interrupt. Set pfault_wait to -1 so the initial + * interrupt doesn't put the task to sleep. */ + tsk->thread.pfault_wait = -1; } + put_task_struct(tsk); } else { /* signal bit not set -> a real page is missing. */ - get_task_struct(tsk); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (xchg(&tsk->thread.pfault_wait, 1) != 0) { + if (tsk->thread.pfault_wait == 1) { + /* Already on the list with a reference: put to sleep */ + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + set_tsk_need_resched(tsk); + } else if (tsk->thread.pfault_wait == -1) { /* Completion interrupt was faster than the initial - * interrupt (swapped in a -1 for pfault_wait). Set - * pfault_wait back to zero and exit. This can be - * done safely because tsk is running in kernel - * mode and can't produce new pfaults. */ + * interrupt (pfault_wait == -1). Set pfault_wait + * back to zero and exit. */ tsk->thread.pfault_wait = 0; - set_task_state(tsk, TASK_RUNNING); - put_task_struct(tsk); - } else + } else { + /* Initial interrupt arrived before completion + * interrupt. Let the task sleep. + * An extra task reference is needed since a different + * cpu may set the task state to TASK_RUNNING again + * before the scheduler is reached. */ + get_task_struct(tsk); + tsk->thread.pfault_wait = 1; + info = task_thread_info(tsk); + list_add(&info->list, &pfault_list); + set_task_state(tsk, TASK_UNINTERRUPTIBLE); set_tsk_need_resched(tsk); + } } + spin_unlock(&pfault_lock); +} + +static int __cpuinit pfault_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + struct thread_info *info, *next; + struct task_struct *tsk; + + switch (action) { + case CPU_DEAD: + case CPU_DEAD_FROZEN: + spin_lock_irq(&pfault_lock); + list_for_each_entry_safe(info, next, &pfault_list, list) { + tsk = info->task; + tsk->thread.pfault_wait = 0; + list_del(&info->list); + wake_up_process(tsk); + put_task_struct(tsk); + } + spin_unlock_irq(&pfault_lock); + break; + default: + break; + } + return NOTIFY_OK; } void __init pfault_irq_init(void) @@ -584,8 +635,10 @@ void __init pfault_irq_init(void) &ext_int_pfault) != 0) panic("Couldn't request external interrupt 0x2603"); - if (pfault_init() == 0) + if (pfault_init() == 0) { + hotcpu_notifier(pfault_cpu_notify, 0); return; + } /* Tough luck, no pfault. */ pfault_disable = 1; diff -uprN linux-2.6.32/arch/s390/mm/hugetlbpage.c linux-2.6.32.ovz/arch/s390/mm/hugetlbpage.c --- linux-2.6.32/arch/s390/mm/hugetlbpage.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/mm/hugetlbpage.c 2015-03-10 13:23:25.000000000 -0700 @@ -68,12 +68,15 @@ void arch_release_hugepage(struct page * ptep = (pte_t *) page[1].index; if (!ptep) return; + clear_table((unsigned long *) ptep, _PAGE_TYPE_EMPTY, + PTRS_PER_PTE * sizeof(pte_t)); pte_free(&init_mm, ptep); page[1].index = 0; } pte_t *huge_pte_alloc(struct mm_struct *mm, - unsigned long addr, unsigned long sz) + unsigned long addr, unsigned long sz, + bool *shared) { pgd_t *pgdp; pud_t *pudp; diff -uprN linux-2.6.32/arch/s390/mm/init.c linux-2.6.32.ovz/arch/s390/mm/init.c --- linux-2.6.32/arch/s390/mm/init.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/mm/init.c 2015-03-10 13:23:58.000000000 -0700 @@ -51,18 +51,22 @@ void __init paging_init(void) { static const int ssm_mask = 0x04000000L; unsigned long max_zone_pfns[MAX_NR_ZONES]; - unsigned long pgd_type; + unsigned long pgd_type, asce_bits; init_mm.pgd = swapper_pg_dir; - S390_lowcore.kernel_asce = __pa(init_mm.pgd) & PAGE_MASK; #ifdef CONFIG_64BIT - /* A three level page table (4TB) is enough for the kernel space. */ - S390_lowcore.kernel_asce |= _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; - pgd_type = _REGION3_ENTRY_EMPTY; + if (VMALLOC_END > (1UL << 42)) { + asce_bits = _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH; + pgd_type = _REGION2_ENTRY_EMPTY; + } else { + asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; + pgd_type = _REGION3_ENTRY_EMPTY; + } #else - S390_lowcore.kernel_asce |= _ASCE_TABLE_LENGTH; + asce_bits = _ASCE_TABLE_LENGTH; pgd_type = _SEGMENT_ENTRY_EMPTY; #endif + S390_lowcore.kernel_asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; clear_table((unsigned long *) init_mm.pgd, pgd_type, sizeof(unsigned long)*2048); vmem_map_init(); @@ -73,6 +77,8 @@ void __init paging_init(void) __ctl_load(S390_lowcore.kernel_asce, 13, 13); __raw_local_irq_ssm(ssm_mask); + atomic_set(&init_mm.context.attach_count, 1); + sparse_memory_present_with_active_regions(MAX_NUMNODES); sparse_init(); memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); @@ -176,16 +182,38 @@ void free_initrd_mem(unsigned long start #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size) { - struct pglist_data *pgdat; + unsigned long zone_start_pfn, zone_end_pfn, nr_pages; + unsigned long start_pfn = PFN_DOWN(start); + unsigned long size_pages = PFN_DOWN(size); struct zone *zone; int rc; - pgdat = NODE_DATA(nid); - zone = pgdat->node_zones + ZONE_MOVABLE; rc = vmem_add_mapping(start, size); if (rc) return rc; - rc = __add_pages(nid, zone, PFN_DOWN(start), PFN_DOWN(size)); + for_each_zone(zone) { + if (zone_idx(zone) != ZONE_MOVABLE) { + /* Add range within existing zone limits */ + zone_start_pfn = zone->zone_start_pfn; + zone_end_pfn = zone->zone_start_pfn + + zone->spanned_pages; + } else { + /* Add remaining range to ZONE_MOVABLE */ + zone_start_pfn = start_pfn; + zone_end_pfn = start_pfn + size_pages; + } + if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn) + continue; + nr_pages = (start_pfn + size_pages > zone_end_pfn) ? + zone_end_pfn - start_pfn : size_pages; + rc = __add_pages(nid, zone, start_pfn, nr_pages); + if (rc) + break; + start_pfn += nr_pages; + size_pages -= nr_pages; + if (!size_pages) + break; + } if (rc) vmem_remove_mapping(start, size); return rc; diff -uprN linux-2.6.32/arch/s390/mm/maccess.c linux-2.6.32.ovz/arch/s390/mm/maccess.c --- linux-2.6.32/arch/s390/mm/maccess.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/mm/maccess.c 2015-03-10 13:23:55.000000000 -0700 @@ -11,6 +11,9 @@ #include #include #include +#include +#include +#include #include /* @@ -59,3 +62,168 @@ long probe_kernel_write(void *dst, void } return copied < 0 ? -EFAULT : 0; } + +/* + * Copy memory in real mode (kernel to kernel) + */ +int memcpy_real(void *dest, void *src, size_t count) +{ + register unsigned long _dest asm("2") = (unsigned long) dest; + register unsigned long _len1 asm("3") = (unsigned long) count; + register unsigned long _src asm("4") = (unsigned long) src; + register unsigned long _len2 asm("5") = (unsigned long) count; + unsigned long flags; + int rc = -EFAULT; + + if (!count) + return 0; + flags = __raw_local_irq_stnsm(0xf8UL); + asm volatile ( + "0: mvcle %1,%2,0x0\n" + "1: jo 0b\n" + " lhi %0,0x0\n" + "2:\n" + EX_TABLE(1b,2b) + : "+d" (rc), "+d" (_dest), "+d" (_src), "+d" (_len1), + "+d" (_len2), "=m" (*((long *) dest)) + : "m" (*((long *) src)) + : "cc", "memory"); + __raw_local_irq_ssm(flags); + return rc; +} + +/* + * Copy memory to absolute zero + */ +void copy_to_absolute_zero(void *dest, void *src, size_t count) +{ + unsigned long cr0; + + BUG_ON((unsigned long) dest + count >= sizeof(struct _lowcore)); + preempt_disable(); + __ctl_store(cr0, 0, 0); + __ctl_clear_bit(0, 28); /* disable lowcore protection */ + memcpy_real(dest + store_prefix(), src, count); + __ctl_load(cr0, 0, 0); + preempt_enable(); +} + +/* + * Copy memory from kernel (real) to user (virtual) + */ +int copy_to_user_real(void __user *dest, void *src, size_t count) +{ + int offs = 0, size, rc; + char *buf; + + buf = (char *) __get_free_page(GFP_KERNEL); + if (!buf) + return -ENOMEM; + rc = -EFAULT; + while (offs < count) { + size = min(PAGE_SIZE, count - offs); + if (memcpy_real(buf, src + offs, size)) + goto out; + if (copy_to_user(dest + offs, buf, size)) + goto out; + offs += size; + } + rc = 0; +out: + free_page((unsigned long) buf); + return rc; +} + +/* + * Copy memory from user (virtual) to kernel (real) + */ +int copy_from_user_real(void *dest, void __user *src, size_t count) +{ + int offs = 0, size, rc; + char *buf; + + buf = (char *) __get_free_page(GFP_KERNEL); + if (!buf) + return -ENOMEM; + rc = -EFAULT; + while (offs < count) { + size = min(PAGE_SIZE, count - offs); + if (copy_from_user(buf, src + offs, size)) + goto out; + if (memcpy_real(dest + offs, buf, size)) + goto out; + offs += size; + } + rc = 0; +out: + free_page((unsigned long) buf); + return rc; +} + +/* + * Check if physical address is within prefix or zero page + */ +static int is_swapped(unsigned long addr) +{ + unsigned long lc; + int cpu; + + if (addr < sizeof(struct _lowcore)) + return 1; + for_each_online_cpu(cpu) { + lc = (unsigned long) lowcore_ptr[cpu]; + if (addr > lc + sizeof(struct _lowcore) - 1 || addr < lc) + continue; + return 1; + } + return 0; +} + +/* + * Return swapped prefix or zero page address + */ +static unsigned long get_swapped(unsigned long addr) +{ + unsigned long prefix = store_prefix(); + + if (addr < sizeof(struct _lowcore)) + return addr + prefix; + if (addr >= prefix && addr < prefix + sizeof(struct _lowcore)) + return addr - prefix; + return addr; +} + +/* + * Convert a physical pointer for /dev/mem access + * + * For swapped prefix pages a new buffer is returned that contains a copy of + * the absolute memory. The buffer size is maximum one page large. + */ +void *xlate_dev_mem_ptr(unsigned long addr) +{ + void *bounce = (void *) addr; + unsigned long size; + + get_online_cpus(); + preempt_disable(); + if (is_swapped(addr)) { + size = PAGE_SIZE - (addr & ~PAGE_MASK); + bounce = (void *) __get_free_page(GFP_ATOMIC); + if (bounce) + memcpy_real(bounce, (void *) get_swapped(addr), size); + } + preempt_enable(); + put_online_cpus(); + return bounce; +} +EXPORT_SYMBOL_GPL(xlate_dev_mem_ptr); + +/* + * Free converted buffer for /dev/mem access (if necessary) + */ +void unxlate_dev_mem_ptr(unsigned long addr, void *buf) +{ + if ((void *) addr != buf) + free_page((unsigned long) buf); +} +EXPORT_SYMBOL_GPL(unxlate_dev_mem_ptr); diff -uprN linux-2.6.32/arch/s390/mm/mmap.c linux-2.6.32.ovz/arch/s390/mm/mmap.c --- linux-2.6.32/arch/s390/mm/mmap.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/mm/mmap.c 2015-03-10 13:24:05.000000000 -0700 @@ -26,18 +26,46 @@ #include #include +#include #include +#include #include #include +static unsigned long stack_maxrandom_size(void) +{ + if (!(current->flags & PF_RANDOMIZE)) + return 0; + if (current->personality & ADDR_NO_RANDOMIZE) + return 0; + return STACK_RND_MASK << PAGE_SHIFT; +} + /* * Top of mmap area (just below the process stack). * - * Leave an at least ~128 MB hole. + * Leave at least a ~32 MB hole. */ -#define MIN_GAP (128*1024*1024) +#define MIN_GAP (32*1024*1024) #define MAX_GAP (STACK_TOP/6*5) +static inline int mmap_is_legacy(void) +{ + if (current->personality & ADDR_COMPAT_LAYOUT) + return 1; + if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) + return 1; + return sysctl_legacy_va_layout; +} + +static unsigned long mmap_rnd(void) +{ + if (!(current->flags & PF_RANDOMIZE)) + return 0; + /* 8MB randomization for mmap_base */ + return (get_random_int() & 0x7ffUL) << PAGE_SHIFT; +} + static inline unsigned long mmap_base(void) { unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; @@ -46,22 +74,8 @@ static inline unsigned long mmap_base(vo gap = MIN_GAP; else if (gap > MAX_GAP) gap = MAX_GAP; - - return STACK_TOP - (gap & PAGE_MASK); -} - -static inline int mmap_is_legacy(void) -{ -#ifdef CONFIG_64BIT - /* - * Force standard allocation for 64 bit programs. - */ - if (!is_compat_task()) - return 1; -#endif - return sysctl_legacy_va_layout || - (current->personality & ADDR_COMPAT_LAYOUT) || - current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY; + gap &= PAGE_MASK; + return STACK_TOP - stack_maxrandom_size() - mmap_rnd() - gap; } #ifndef CONFIG_64BIT @@ -90,10 +104,13 @@ EXPORT_SYMBOL_GPL(arch_pick_mmap_layout) #else -int s390_mmap_check(unsigned long addr, unsigned long len) +int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags) { - if (!is_compat_task() && - len >= TASK_SIZE && TASK_SIZE < (1UL << 53)) + if (is_compat_task() || (TASK_SIZE >= (1UL << 53))) + return 0; + if (!(flags & MAP_FIXED)) + addr = 0; + if ((addr + len) >= TASK_SIZE) return crst_table_upgrade(current->mm, 1UL << 53); return 0; } diff -uprN linux-2.6.32/arch/s390/mm/page-states.c linux-2.6.32.ovz/arch/s390/mm/page-states.c --- linux-2.6.32/arch/s390/mm/page-states.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/mm/page-states.c 2015-03-10 13:24:07.000000000 -0700 @@ -11,6 +11,8 @@ #include #include #include +#include +#include #define ESSA_SET_STABLE 1 #define ESSA_SET_UNUSED 2 @@ -40,6 +42,14 @@ void __init cmma_init(void) if (!cmma_flag) return; + /* + * Disable CMM for dump, otherwise the tprot based memory + * detection can fail because of unstable pages. + */ + if (OLDMEM_BASE || ipl_info.type == IPL_TYPE_FCP_DUMP) { + cmma_flag = 0; + return; + } asm volatile( " .insn rrf,0xb9ab0000,%1,%1,0,0\n" "0: la %0,0\n" diff -uprN linux-2.6.32/arch/s390/mm/pgtable.c linux-2.6.32.ovz/arch/s390/mm/pgtable.c --- linux-2.6.32/arch/s390/mm/pgtable.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/mm/pgtable.c 2015-03-10 13:24:05.000000000 -0700 @@ -51,17 +51,6 @@ void clear_table_pgstes(unsigned long *t #endif -unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE; -EXPORT_SYMBOL(VMALLOC_START); - -static int __init parse_vmalloc(char *arg) -{ - if (!arg) - return -EINVAL; - VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK; - return 0; -} -early_param("vmalloc", parse_vmalloc); unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec) { @@ -98,12 +87,23 @@ void crst_table_free(struct mm_struct *m } #ifdef CONFIG_64BIT +static void __crst_table_upgrade(void *arg) +{ + struct mm_struct *mm = arg; + + if (current->active_mm == mm) + update_mm(mm, current); + __tlb_flush_local(); +} + int crst_table_upgrade(struct mm_struct *mm, unsigned long limit) { unsigned long *table, *pgd; unsigned long entry; + int flush; BUG_ON(limit > (1UL << 53)); + flush = 0; repeat: table = crst_table_alloc(mm, mm->context.noexec); if (!table) @@ -129,13 +129,15 @@ repeat: mm->pgd = (pgd_t *) table; mm->task_size = mm->context.asce_limit; table = NULL; + flush = 1; } spin_unlock(&mm->page_table_lock); if (table) crst_table_free(mm, table); if (mm->context.asce_limit < limit) goto repeat; - update_mm(mm, current); + if (flush) + on_each_cpu(__crst_table_upgrade, mm, 0); return 0; } @@ -143,9 +145,8 @@ void crst_table_downgrade(struct mm_stru { pgd_t *pgd; - if (mm->context.asce_limit <= limit) - return; - __tlb_flush_mm(mm); + if (current->active_mm == mm) + __tlb_flush_mm(mm); while (mm->context.asce_limit > limit) { pgd = mm->pgd; switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) { @@ -168,7 +169,8 @@ void crst_table_downgrade(struct mm_stru mm->task_size = mm->context.asce_limit; crst_table_free(mm, (unsigned long *) pgd); } - update_mm(mm, current); + if (current->active_mm == mm) + update_mm(mm, current); } #endif @@ -269,7 +271,7 @@ int s390_enable_sie(void) struct mm_struct *mm, *old_mm; /* Do we have switched amode? If no, we cannot do sie */ - if (!switch_amode) + if (user_mode == HOME_SPACE_MODE) return -EINVAL; /* Do we have pgstes? if yes, we are done */ diff -uprN linux-2.6.32/arch/s390/mm/vmem.c linux-2.6.32.ovz/arch/s390/mm/vmem.c --- linux-2.6.32/arch/s390/mm/vmem.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/s390/mm/vmem.c 2015-03-10 13:24:03.000000000 -0700 @@ -172,9 +172,9 @@ static void vmem_remove_range(unsigned l /* * Add a backed mem_map array to the virtual mem_map array. */ -int __meminit vmemmap_populate(struct page *start, unsigned long nr, int node) +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) { - unsigned long address, start_addr, end_addr; + unsigned long address = start; pgd_t *pg_dir; pud_t *pu_dir; pmd_t *pm_dir; @@ -182,10 +182,7 @@ int __meminit vmemmap_populate(struct pa pte_t pte; int ret = -ENOMEM; - start_addr = (unsigned long) start; - end_addr = (unsigned long) (start + nr); - - for (address = start_addr; address < end_addr; address += PAGE_SIZE) { + for (address = start; address < end; address += PAGE_SIZE) { pg_dir = pgd_offset_k(address); if (pgd_none(*pg_dir)) { pu_dir = vmem_pud_alloc(); @@ -221,10 +218,10 @@ int __meminit vmemmap_populate(struct pa *pt_dir = pte; } } - memset(start, 0, nr * sizeof(struct page)); + memset((void *)start, 0, end - start); ret = 0; out: - flush_tlb_kernel_range(start_addr, end_addr); + flush_tlb_kernel_range(start, end); return ret; } @@ -338,6 +335,9 @@ void __init vmem_map_init(void) ro_start = ((unsigned long)&_stext) & PAGE_MASK; ro_end = PFN_ALIGN((unsigned long)&_eshared); for (i = 0; i < MEMORY_CHUNKS && memory_chunk[i].size > 0; i++) { + if (memory_chunk[i].type == CHUNK_CRASHK || + memory_chunk[i].type == CHUNK_OLDMEM) + continue; start = memory_chunk[i].addr; end = memory_chunk[i].addr + memory_chunk[i].size; if (start >= ro_end || end <= ro_start) @@ -371,6 +371,9 @@ static int __init vmem_convert_memory_ch for (i = 0; i < MEMORY_CHUNKS; i++) { if (!memory_chunk[i].size) continue; + if (memory_chunk[i].type == CHUNK_CRASHK || + memory_chunk[i].type == CHUNK_OLDMEM) + continue; seg = kzalloc(sizeof(*seg), GFP_KERNEL); if (!seg) panic("Out of memory...\n"); diff -uprN linux-2.6.32/arch/score/kernel/sys_score.c linux-2.6.32.ovz/arch/score/kernel/sys_score.c --- linux-2.6.32/arch/score/kernel/sys_score.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/score/kernel/sys_score.c 2015-03-10 13:23:54.000000000 -0700 @@ -36,34 +36,16 @@ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) { - int error = -EBADF; - struct file *file = NULL; - - if (pgoff & (~PAGE_MASK >> 12)) - return -EINVAL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - return error; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); - - return error; + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, unsigned long fd, off_t pgoff) + unsigned long flags, unsigned long fd, off_t offset) { - return sys_mmap2(addr, len, prot, flags, fd, pgoff >> PAGE_SHIFT); + if (unlikely(offset & ~PAGE_MASK)) + return -EINVAL; + return sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); } asmlinkage long diff -uprN linux-2.6.32/arch/score/mm/init.c linux-2.6.32.ovz/arch/score/mm/init.c --- linux-2.6.32/arch/score/mm/init.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/score/mm/init.c 2015-03-10 13:21:18.000000000 -0700 @@ -59,7 +59,7 @@ static unsigned long setup_zero_page(voi } #ifndef CONFIG_NEED_MULTIPLE_NODES -static int __init page_is_ram(unsigned long pagenr) +int page_is_ram(unsigned long pagenr) { if (pagenr >= min_low_pfn && pagenr < max_low_pfn) return 1; diff -uprN linux-2.6.32/arch/sh/include/asm/mmzone.h linux-2.6.32.ovz/arch/sh/include/asm/mmzone.h --- linux-2.6.32/arch/sh/include/asm/mmzone.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sh/include/asm/mmzone.h 2015-03-10 13:22:35.000000000 -0700 @@ -9,10 +9,6 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) -#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) -#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ - NODE_DATA(nid)->node_spanned_pages) - static inline int pfn_to_nid(unsigned long pfn) { int nid; diff -uprN linux-2.6.32/arch/sh/include/asm/unistd_32.h linux-2.6.32.ovz/arch/sh/include/asm/unistd_32.h --- linux-2.6.32/arch/sh/include/asm/unistd_32.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sh/include/asm/unistd_32.h 2015-03-10 13:21:18.000000000 -0700 @@ -350,6 +350,8 @@ #ifdef __KERNEL__ +#define __IGNORE_recvmmsg + #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_OLD_STAT diff -uprN linux-2.6.32/arch/sh/include/asm/unistd_64.h linux-2.6.32.ovz/arch/sh/include/asm/unistd_64.h --- linux-2.6.32/arch/sh/include/asm/unistd_64.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sh/include/asm/unistd_64.h 2015-03-10 13:21:18.000000000 -0700 @@ -385,10 +385,11 @@ #define __NR_pwritev 362 #define __NR_rt_tgsigqueueinfo 363 #define __NR_perf_event_open 364 +#define __NR_recvmmsg 365 #ifdef __KERNEL__ -#define NR_syscalls 365 +#define NR_syscalls 366 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff -uprN linux-2.6.32/arch/sh/kernel/cpu/sh4/sq.c linux-2.6.32.ovz/arch/sh/kernel/cpu/sh4/sq.c --- linux-2.6.32/arch/sh/kernel/cpu/sh4/sq.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sh/kernel/cpu/sh4/sq.c 2015-03-10 13:21:34.000000000 -0700 @@ -327,7 +327,7 @@ static struct attribute *sq_sysfs_attrs[ NULL, }; -static struct sysfs_ops sq_sysfs_ops = { +static const struct sysfs_ops sq_sysfs_ops = { .show = sq_sysfs_show, .store = sq_sysfs_store, }; diff -uprN linux-2.6.32/arch/sh/kernel/process_64.c linux-2.6.32.ovz/arch/sh/kernel/process_64.c --- linux-2.6.32/arch/sh/kernel/process_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sh/kernel/process_64.c 2015-03-10 13:23:54.000000000 -0700 @@ -367,7 +367,7 @@ void exit_thread(void) void flush_thread(void) { - /* Called by fs/exec.c (flush_old_exec) to remove traces of a + /* Called by fs/exec.c (setup_new_exec) to remove traces of a * previously running executable. */ #ifdef CONFIG_SH_FPU if (last_task_used_math == current) { diff -uprN linux-2.6.32/arch/sh/kernel/setup.c linux-2.6.32.ovz/arch/sh/kernel/setup.c --- linux-2.6.32/arch/sh/kernel/setup.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sh/kernel/setup.c 2015-06-23 04:16:16.000000000 -0700 @@ -155,7 +155,7 @@ static void __init reserve_crashkernel(v free_mem = ((unsigned long long)max_low_pfn - min_low_pfn) << PAGE_SHIFT; ret = parse_crashkernel(boot_command_line, free_mem, - &crash_size, &crash_base); + &crash_size, &crash_base, NULL); if (ret == 0 && crash_size) { if (crash_base <= 0) { vp = alloc_bootmem_nopanic(crash_size); diff -uprN linux-2.6.32/arch/sh/kernel/syscalls_64.S linux-2.6.32.ovz/arch/sh/kernel/syscalls_64.S --- linux-2.6.32/arch/sh/kernel/syscalls_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sh/kernel/syscalls_64.S 2015-03-10 13:21:18.000000000 -0700 @@ -391,3 +391,4 @@ sys_call_table: .long sys_pwritev .long sys_rt_tgsigqueueinfo .long sys_perf_event_open + .long sys_recvmmsg /* 365 */ diff -uprN linux-2.6.32/arch/sh/kernel/sys_sh.c linux-2.6.32.ovz/arch/sh/kernel/sys_sh.c --- linux-2.6.32/arch/sh/kernel/sys_sh.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sh/kernel/sys_sh.c 2015-03-10 13:21:11.000000000 -0700 @@ -28,37 +28,13 @@ #include #include -static inline long -do_mmap2(unsigned long addr, unsigned long len, unsigned long prot, - unsigned long flags, int fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - asmlinkage int old_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, int fd, unsigned long off) { if (off & ~PAGE_MASK) return -EINVAL; - return do_mmap2(addr, len, prot, flags, fd, off>>PAGE_SHIFT); + return sys_mmap_pgoff(addr, len, prot, flags, fd, off>>PAGE_SHIFT); } asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, @@ -74,7 +50,7 @@ asmlinkage long sys_mmap2(unsigned long pgoff >>= PAGE_SHIFT - 12; - return do_mmap2(addr, len, prot, flags, fd, pgoff); + return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } /* diff -uprN linux-2.6.32/arch/sh/kernel/vsyscall/vsyscall.c linux-2.6.32.ovz/arch/sh/kernel/vsyscall/vsyscall.c --- linux-2.6.32/arch/sh/kernel/vsyscall/vsyscall.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sh/kernel/vsyscall/vsyscall.c 2015-03-10 13:23:27.000000000 -0700 @@ -74,8 +74,7 @@ int arch_setup_additional_pages(struct l ret = install_special_mapping(mm, addr, PAGE_SIZE, VM_READ | VM_EXEC | - VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | - VM_ALWAYSDUMP, + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, syscall_pages); if (unlikely(ret)) goto up_fail; @@ -95,17 +94,17 @@ const char *arch_vma_name(struct vm_area return NULL; } -struct vm_area_struct *get_gate_vma(struct task_struct *task) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { return NULL; } -int in_gate_area(struct task_struct *task, unsigned long address) +int in_gate_area(struct mm_struct *mm, unsigned long address) { return 0; } -int in_gate_area_no_task(unsigned long address) +int in_gate_area_no_mm(unsigned long address) { return 0; } diff -uprN linux-2.6.32/arch/sh/mm/hugetlbpage.c linux-2.6.32.ovz/arch/sh/mm/hugetlbpage.c --- linux-2.6.32/arch/sh/mm/hugetlbpage.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sh/mm/hugetlbpage.c 2015-03-10 13:23:25.000000000 -0700 @@ -23,7 +23,8 @@ #include pte_t *huge_pte_alloc(struct mm_struct *mm, - unsigned long addr, unsigned long sz) + unsigned long addr, unsigned long sz, + bool *shared) { pgd_t *pgd; pud_t *pud; diff -uprN linux-2.6.32/arch/sh/mm/mmap.c linux-2.6.32.ovz/arch/sh/mm/mmap.c --- linux-2.6.32/arch/sh/mm/mmap.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sh/mm/mmap.c 2015-03-10 13:21:11.000000000 -0700 @@ -54,7 +54,8 @@ unsigned long arch_get_unmapped_area(str /* We do not accept a shared mapping if it would violate * cache aliasing constraints. */ - if ((flags & MAP_SHARED) && (addr & shm_align_mask)) + if ((flags & MAP_SHARED) && + ((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask)) return -EINVAL; return addr; } diff -uprN linux-2.6.32/arch/sh/tools/Makefile linux-2.6.32.ovz/arch/sh/tools/Makefile --- linux-2.6.32/arch/sh/tools/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sh/tools/Makefile 2015-03-10 13:21:31.000000000 -0700 @@ -13,4 +13,4 @@ include/asm-sh/machtypes.h: $(src)/gen-mach-types $(src)/mach-types @echo ' Generating $@' $(Q)if [ ! -d include/asm-sh ]; then mkdir -p include/asm-sh; fi - $(Q)$(AWK) -f $^ > $@ || { rm -f $@; /bin/false; } + $(Q)LC_ALL=C $(AWK) -f $^ > $@ || { rm -f $@; /bin/false; } diff -uprN linux-2.6.32/arch/sparc/include/asm/errno.h linux-2.6.32.ovz/arch/sparc/include/asm/errno.h --- linux-2.6.32/arch/sparc/include/asm/errno.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sparc/include/asm/errno.h 2015-03-10 13:24:05.000000000 -0700 @@ -40,7 +40,7 @@ #define EPROCLIM 67 /* SUNOS: Too many processes */ #define EUSERS 68 /* Too many users */ #define EDQUOT 69 /* Quota exceeded */ -#define ESTALE 70 /* Stale NFS file handle */ +#define ESTALE 70 /* Stale file handle */ #define EREMOTE 71 /* Object is remote */ #define ENOSTR 72 /* Device not a stream */ #define ETIME 73 /* Timer expired */ @@ -112,4 +112,6 @@ #define ERFKILL 134 /* Operation not possible due to RF-kill */ +#define EHWPOISON 135 /* Memory page has hardware error */ + #endif diff -uprN linux-2.6.32/arch/sparc/include/asm/mmzone.h linux-2.6.32.ovz/arch/sparc/include/asm/mmzone.h --- linux-2.6.32/arch/sparc/include/asm/mmzone.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sparc/include/asm/mmzone.h 2015-03-10 13:22:35.000000000 -0700 @@ -8,8 +8,6 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) -#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) -#define node_end_pfn(nid) (NODE_DATA(nid)->node_end_pfn) extern int numa_cpu_lookup_table[]; extern cpumask_t numa_cpumask_lookup_table[]; diff -uprN linux-2.6.32/arch/sparc/include/asm/socket.h linux-2.6.32.ovz/arch/sparc/include/asm/socket.h --- linux-2.6.32/arch/sparc/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sparc/include/asm/socket.h 2015-03-10 13:20:58.000000000 -0700 @@ -56,6 +56,8 @@ #define SO_TIMESTAMPING 0x0023 #define SCM_TIMESTAMPING SO_TIMESTAMPING +#define SO_RXQ_OVFL 0x0024 + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff -uprN linux-2.6.32/arch/sparc/include/asm/unistd.h linux-2.6.32.ovz/arch/sparc/include/asm/unistd.h --- linux-2.6.32/arch/sparc/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sparc/include/asm/unistd.h 2015-03-10 13:21:18.000000000 -0700 @@ -396,8 +396,9 @@ #define __NR_pwritev 325 #define __NR_rt_tgsigqueueinfo 326 #define __NR_perf_event_open 327 +#define __NR_recvmmsg 328 -#define NR_SYSCALLS 328 +#define NR_SYSCALLS 329 #ifdef __32bit_syscall_numbers__ /* Sparc 32-bit only has the "setresuid32", "getresuid32" variants, diff -uprN linux-2.6.32/arch/sparc/kernel/ldc.c linux-2.6.32.ovz/arch/sparc/kernel/ldc.c --- linux-2.6.32/arch/sparc/kernel/ldc.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sparc/kernel/ldc.c 2015-03-10 13:21:04.000000000 -0700 @@ -1242,13 +1242,13 @@ int ldc_bind(struct ldc_channel *lp, con snprintf(lp->tx_irq_name, LDC_IRQ_NAME_MAX, "%s TX", name); err = request_irq(lp->cfg.rx_irq, ldc_rx, - IRQF_SAMPLE_RANDOM | IRQF_DISABLED | IRQF_SHARED, + IRQF_SAMPLE_RANDOM | IRQF_DISABLED, lp->rx_irq_name, lp); if (err) return err; err = request_irq(lp->cfg.tx_irq, ldc_tx, - IRQF_SAMPLE_RANDOM | IRQF_DISABLED | IRQF_SHARED, + IRQF_SAMPLE_RANDOM | IRQF_DISABLED, lp->tx_irq_name, lp); if (err) { free_irq(lp->cfg.rx_irq, lp); diff -uprN linux-2.6.32/arch/sparc/kernel/of_device_64.c linux-2.6.32.ovz/arch/sparc/kernel/of_device_64.c --- linux-2.6.32/arch/sparc/kernel/of_device_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sparc/kernel/of_device_64.c 2015-03-10 13:21:04.000000000 -0700 @@ -104,9 +104,19 @@ static int of_bus_pci_map(u32 *addr, con int i; /* Check address type match */ - if ((addr[0] ^ range[0]) & 0x03000000) - return -EINVAL; + if (!((addr[0] ^ range[0]) & 0x03000000)) + goto type_match; + + /* Special exception, we can map a 64-bit address into + * a 32-bit range. + */ + if ((addr[0] & 0x03000000) == 0x03000000 && + (range[0] & 0x03000000) == 0x02000000) + goto type_match; + + return -EINVAL; +type_match: if (of_out_of_range(addr + 1, range + 1, range + na + pna, na - 1, ns)) return -EINVAL; diff -uprN linux-2.6.32/arch/sparc/kernel/sys_sparc_32.c linux-2.6.32.ovz/arch/sparc/kernel/sys_sparc_32.c --- linux-2.6.32/arch/sparc/kernel/sys_sparc_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sparc/kernel/sys_sparc_32.c 2015-03-10 13:21:11.000000000 -0700 @@ -45,7 +45,8 @@ unsigned long arch_get_unmapped_area(str /* We do not accept a shared mapping if it would violate * cache aliasing constraints. */ - if ((flags & MAP_SHARED) && (addr & (SHMLBA - 1))) + if ((flags & MAP_SHARED) && + ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))) return -EINVAL; return addr; } @@ -79,15 +80,6 @@ unsigned long arch_get_unmapped_area(str } } -asmlinkage unsigned long sparc_brk(unsigned long brk) -{ - if(ARCH_SUN4C) { - if ((brk & 0xe0000000) != (current->mm->brk & 0xe0000000)) - return current->mm->brk; - } - return sys_brk(brk); -} - /* * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way unix traditionally does this, though. @@ -234,31 +226,6 @@ int sparc_mmap_check(unsigned long addr, } /* Linux version of mmap */ -static unsigned long do_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, unsigned long fd, - unsigned long pgoff) -{ - struct file * file = NULL; - unsigned long retval = -EBADF; - - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - len = PAGE_ALIGN(len); - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - - down_write(¤t->mm->mmap_sem); - retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return retval; -} asmlinkage unsigned long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, @@ -266,14 +233,16 @@ asmlinkage unsigned long sys_mmap2(unsig { /* Make sure the shift for mmap2 is constant (12), no matter what PAGE_SIZE we have. */ - return do_mmap2(addr, len, prot, flags, fd, pgoff >> (PAGE_SHIFT - 12)); + return sys_mmap_pgoff(addr, len, prot, flags, fd, + pgoff >> (PAGE_SHIFT - 12)); } asmlinkage unsigned long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long off) { - return do_mmap2(addr, len, prot, flags, fd, off >> PAGE_SHIFT); + /* no alignment check? */ + return sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); } long sparc_remap_file_pages(unsigned long start, unsigned long size, @@ -287,27 +256,6 @@ long sparc_remap_file_pages(unsigned lon (pgoff >> (PAGE_SHIFT - 12)), flags); } -extern unsigned long do_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr); - -asmlinkage unsigned long sparc_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) -{ - unsigned long ret = -EINVAL; - - if (unlikely(sparc_mmap_check(addr, old_len))) - goto out; - if (unlikely(sparc_mmap_check(new_addr, new_len))) - goto out; - down_write(¤t->mm->mmap_sem); - ret = do_mremap(addr, old_len, new_len, flags, new_addr); - up_write(¤t->mm->mmap_sem); -out: - return ret; -} - /* we come to here via sys_nis_syscall so it can setup the regs argument */ asmlinkage unsigned long c_sys_nis_syscall (struct pt_regs *regs) diff -uprN linux-2.6.32/arch/sparc/kernel/sys_sparc32.c linux-2.6.32.ovz/arch/sparc/kernel/sys_sparc32.c --- linux-2.6.32/arch/sparc/kernel/sys_sparc32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sparc/kernel/sys_sparc32.c 2015-03-10 13:23:54.000000000 -0700 @@ -26,11 +26,6 @@ #include #include #include -#include -#include -#include -#include -#include #include #include #include diff -uprN linux-2.6.32/arch/sparc/kernel/sys_sparc_64.c linux-2.6.32.ovz/arch/sparc/kernel/sys_sparc_64.c --- linux-2.6.32/arch/sparc/kernel/sys_sparc_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sparc/kernel/sys_sparc_64.c 2015-03-10 13:21:11.000000000 -0700 @@ -317,10 +317,14 @@ bottomup: unsigned long get_fb_unmapped_area(struct file *filp, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) { unsigned long align_goal, addr = -ENOMEM; + unsigned long (*get_area)(struct file *, unsigned long, + unsigned long, unsigned long, unsigned long); + + get_area = current->mm->get_unmapped_area; if (flags & MAP_FIXED) { /* Ok, don't mess with it. */ - return get_unmapped_area(NULL, orig_addr, len, pgoff, flags); + return get_area(NULL, orig_addr, len, pgoff, flags); } flags &= ~MAP_SHARED; @@ -333,7 +337,7 @@ unsigned long get_fb_unmapped_area(struc align_goal = (64UL * 1024); do { - addr = get_unmapped_area(NULL, orig_addr, len + (align_goal - PAGE_SIZE), pgoff, flags); + addr = get_area(NULL, orig_addr, len + (align_goal - PAGE_SIZE), pgoff, flags); if (!(addr & ~PAGE_MASK)) { addr = (addr + (align_goal - 1UL)) & ~(align_goal - 1UL); break; @@ -351,7 +355,7 @@ unsigned long get_fb_unmapped_area(struc * be obtained. */ if (addr & ~PAGE_MASK) - addr = get_unmapped_area(NULL, orig_addr, len, pgoff, flags); + addr = get_area(NULL, orig_addr, len, pgoff, flags); return addr; } @@ -399,18 +403,6 @@ void arch_pick_mmap_layout(struct mm_str } } -SYSCALL_DEFINE1(sparc_brk, unsigned long, brk) -{ - /* People could try to be nasty and use ta 0x6d in 32bit programs */ - if (test_thread_flag(TIF_32BIT) && brk >= STACK_TOP32) - return current->mm->brk; - - if (unlikely(straddles_64bit_va_hole(current->mm->brk, brk))) - return current->mm->brk; - - return sys_brk(brk); -} - /* * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way unix traditionally does this, though. @@ -568,23 +560,13 @@ SYSCALL_DEFINE6(mmap, unsigned long, add unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, off) { - struct file * file = NULL; - unsigned long retval = -EBADF; - - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - len = PAGE_ALIGN(len); + unsigned long retval = -EINVAL; - down_write(¤t->mm->mmap_sem); - retval = do_mmap(file, addr, len, prot, flags, off); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); + if ((off + PAGE_ALIGN(len)) < off) + goto out; + if (off & ~PAGE_MASK) + goto out; + retval = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return retval; } @@ -614,12 +596,6 @@ SYSCALL_DEFINE5(64_mremap, unsigned long if (test_thread_flag(TIF_32BIT)) goto out; - if (unlikely(new_len >= VA_EXCLUDE_START)) - goto out; - if (unlikely(sparc_mmap_check(addr, old_len))) - goto out; - if (unlikely(sparc_mmap_check(new_addr, new_len))) - goto out; down_write(¤t->mm->mmap_sem); ret = do_mremap(addr, old_len, new_len, flags, new_addr); diff -uprN linux-2.6.32/arch/sparc/kernel/systbls_32.S linux-2.6.32.ovz/arch/sparc/kernel/systbls_32.S --- linux-2.6.32/arch/sparc/kernel/systbls_32.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sparc/kernel/systbls_32.S 2015-03-10 13:21:18.000000000 -0700 @@ -19,7 +19,7 @@ sys_call_table: /*0*/ .long sys_restart_syscall, sys_exit, sys_fork, sys_read, sys_write /*5*/ .long sys_open, sys_close, sys_wait4, sys_creat, sys_link /*10*/ .long sys_unlink, sunos_execv, sys_chdir, sys_chown16, sys_mknod -/*15*/ .long sys_chmod, sys_lchown16, sparc_brk, sys_nis_syscall, sys_lseek +/*15*/ .long sys_chmod, sys_lchown16, sys_brk, sys_nis_syscall, sys_lseek /*20*/ .long sys_getpid, sys_capget, sys_capset, sys_setuid16, sys_getuid16 /*25*/ .long sys_vmsplice, sys_ptrace, sys_alarm, sys_sigaltstack, sys_pause /*30*/ .long sys_utime, sys_lchown, sys_fchown, sys_access, sys_nice @@ -67,7 +67,7 @@ sys_call_table: /*235*/ .long sys_fstatfs64, sys_llseek, sys_mlock, sys_munlock, sys_mlockall /*240*/ .long sys_munlockall, sys_sched_setparam, sys_sched_getparam, sys_sched_setscheduler, sys_sched_getscheduler /*245*/ .long sys_sched_yield, sys_sched_get_priority_max, sys_sched_get_priority_min, sys_sched_rr_get_interval, sys_nanosleep -/*250*/ .long sparc_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_nfsservctl +/*250*/ .long sys_mremap, sys_sysctl, sys_getsid, sys_fdatasync, sys_nfsservctl /*255*/ .long sys_sync_file_range, sys_clock_settime, sys_clock_gettime, sys_clock_getres, sys_clock_nanosleep /*260*/ .long sys_sched_getaffinity, sys_sched_setaffinity, sys_timer_settime, sys_timer_gettime, sys_timer_getoverrun /*265*/ .long sys_timer_delete, sys_timer_create, sys_nis_syscall, sys_io_setup, sys_io_destroy @@ -82,5 +82,5 @@ sys_call_table: /*310*/ .long sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate /*315*/ .long sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1 /*320*/ .long sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv -/*325*/ .long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open +/*325*/ .long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open, sys_recvmmsg diff -uprN linux-2.6.32/arch/sparc/kernel/systbls_64.S linux-2.6.32.ovz/arch/sparc/kernel/systbls_64.S --- linux-2.6.32/arch/sparc/kernel/systbls_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sparc/kernel/systbls_64.S 2015-03-10 13:21:18.000000000 -0700 @@ -21,7 +21,7 @@ sys_call_table32: /*0*/ .word sys_restart_syscall, sys32_exit, sys_fork, sys_read, sys_write /*5*/ .word sys32_open, sys_close, sys32_wait4, sys32_creat, sys_link /*10*/ .word sys_unlink, sunos_execv, sys_chdir, sys_chown16, sys32_mknod -/*15*/ .word sys_chmod, sys_lchown16, sys_sparc_brk, sys32_perfctr, sys32_lseek +/*15*/ .word sys_chmod, sys_lchown16, sys_brk, sys32_perfctr, sys32_lseek /*20*/ .word sys_getpid, sys_capget, sys_capset, sys_setuid16, sys_getuid16 /*25*/ .word sys32_vmsplice, compat_sys_ptrace, sys_alarm, sys32_sigaltstack, sys_pause /*30*/ .word compat_sys_utime, sys_lchown, sys_fchown, sys32_access, sys32_nice @@ -83,7 +83,7 @@ sys_call_table32: /*310*/ .word compat_sys_utimensat, compat_sys_signalfd, sys_timerfd_create, sys_eventfd, compat_sys_fallocate .word compat_sys_timerfd_settime, compat_sys_timerfd_gettime, compat_sys_signalfd4, sys_eventfd2, sys_epoll_create1 /*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, compat_sys_preadv - .word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_event_open + .word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_event_open, compat_sys_recvmmsg #endif /* CONFIG_COMPAT */ @@ -96,7 +96,7 @@ sys_call_table: /*0*/ .word sys_restart_syscall, sparc_exit, sys_fork, sys_read, sys_write /*5*/ .word sys_open, sys_close, sys_wait4, sys_creat, sys_link /*10*/ .word sys_unlink, sys_nis_syscall, sys_chdir, sys_chown, sys_mknod -/*15*/ .word sys_chmod, sys_lchown, sys_sparc_brk, sys_perfctr, sys_lseek +/*15*/ .word sys_chmod, sys_lchown, sys_brk, sys_perfctr, sys_lseek /*20*/ .word sys_getpid, sys_capget, sys_capset, sys_setuid, sys_getuid /*25*/ .word sys_vmsplice, sys_ptrace, sys_alarm, sys_sigaltstack, sys_nis_syscall /*30*/ .word sys_utime, sys_nis_syscall, sys_nis_syscall, sys_access, sys_nice @@ -158,4 +158,4 @@ sys_call_table: /*310*/ .word sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate .word sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1 /*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv - .word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open + .word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open, sys_recvmmsg diff -uprN linux-2.6.32/arch/sparc/kernel/systbls.h linux-2.6.32.ovz/arch/sparc/kernel/systbls.h --- linux-2.6.32/arch/sparc/kernel/systbls.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sparc/kernel/systbls.h 2015-03-10 13:21:11.000000000 -0700 @@ -9,7 +9,6 @@ struct new_utsname; extern asmlinkage unsigned long sys_getpagesize(void); -extern asmlinkage unsigned long sparc_brk(unsigned long brk); extern asmlinkage long sparc_pipe(struct pt_regs *regs); extern asmlinkage long sys_ipc(unsigned int call, int first, unsigned long second, diff -uprN linux-2.6.32/arch/sparc/lib/mcount.S linux-2.6.32.ovz/arch/sparc/lib/mcount.S --- linux-2.6.32/arch/sparc/lib/mcount.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sparc/lib/mcount.S 2015-03-10 13:21:04.000000000 -0700 @@ -64,8 +64,9 @@ mcount: 2: sethi %hi(softirq_stack), %g3 or %g3, %lo(softirq_stack), %g3 ldx [%g3 + %g1], %g7 + sub %g7, STACK_BIAS, %g7 cmp %sp, %g7 - bleu,pt %xcc, 2f + bleu,pt %xcc, 3f sethi %hi(THREAD_SIZE), %g3 add %g7, %g3, %g7 cmp %sp, %g7 @@ -75,7 +76,7 @@ mcount: * again, we are already trying to output the stack overflow * message. */ - sethi %hi(ovstack), %g7 ! cant move to panic stack fast enough +3: sethi %hi(ovstack), %g7 ! cant move to panic stack fast enough or %g7, %lo(ovstack), %g7 add %g7, OVSTACKSIZE, %g3 sub %g3, STACK_BIAS + 192, %g3 diff -uprN linux-2.6.32/arch/sparc/Makefile linux-2.6.32.ovz/arch/sparc/Makefile --- linux-2.6.32/arch/sparc/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sparc/Makefile 2015-03-10 13:21:04.000000000 -0700 @@ -27,6 +27,7 @@ AS := $(AS) -32 LDFLAGS := -m elf32_sparc CHECKFLAGS += -D__sparc__ export BITS := 32 +UTS_MACHINE := sparc #KBUILD_CFLAGS += -g -pipe -fcall-used-g5 -fcall-used-g7 KBUILD_CFLAGS += -m32 -pipe -mno-fpu -fcall-used-g5 -fcall-used-g7 @@ -46,6 +47,7 @@ CHECKFLAGS += -D__sparc__ -D__sparc LDFLAGS := -m elf64_sparc export BITS := 64 +UTS_MACHINE := sparc64 KBUILD_CFLAGS += -m64 -pipe -mno-fpu -mcpu=ultrasparc -mcmodel=medlow \ -ffixed-g4 -ffixed-g5 -fcall-used-g7 -Wno-sign-compare \ diff -uprN linux-2.6.32/arch/sparc/mm/hugetlbpage.c linux-2.6.32.ovz/arch/sparc/mm/hugetlbpage.c --- linux-2.6.32/arch/sparc/mm/hugetlbpage.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sparc/mm/hugetlbpage.c 2015-03-10 13:23:25.000000000 -0700 @@ -196,7 +196,8 @@ hugetlb_get_unmapped_area(struct file *f } pte_t *huge_pte_alloc(struct mm_struct *mm, - unsigned long addr, unsigned long sz) + unsigned long addr, unsigned long sz, + bool *shared) { pgd_t *pgd; pud_t *pud; diff -uprN linux-2.6.32/arch/sparc/mm/init_32.c linux-2.6.32.ovz/arch/sparc/mm/init_32.c --- linux-2.6.32/arch/sparc/mm/init_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/sparc/mm/init_32.c 2015-03-10 13:22:29.000000000 -0700 @@ -74,7 +74,7 @@ void __init kmap_init(void) kmap_prot = __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE); } -void show_mem(void) +void show_mem(unsigned int filter) { printk("Mem-info:\n"); show_free_areas(); diff -uprN linux-2.6.32/arch/um/drivers/mconsole_kern.c linux-2.6.32.ovz/arch/um/drivers/mconsole_kern.c --- linux-2.6.32/arch/um/drivers/mconsole_kern.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/um/drivers/mconsole_kern.c 2015-03-10 13:21:24.000000000 -0700 @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -131,7 +132,7 @@ void mconsole_proc(struct mc_request *re char *ptr = req->request.data, *buf; ptr += strlen("proc"); - while (isspace(*ptr)) ptr++; + ptr = skip_spaces(ptr); proc = get_fs_type("proc"); if (proc == NULL) { @@ -212,8 +213,7 @@ void mconsole_proc(struct mc_request *re char *ptr = req->request.data; ptr += strlen("proc"); - while (isspace(*ptr)) - ptr++; + ptr = skip_spaces(ptr); snprintf(path, sizeof(path), "/proc/%s", ptr); fd = sys_open(path, 0, 0); @@ -560,8 +560,7 @@ void mconsole_config(struct mc_request * int err; ptr += strlen("config"); - while (isspace(*ptr)) - ptr++; + ptr = skip_spaces(ptr); dev = mconsole_find_dev(ptr); if (dev == NULL) { mconsole_reply(req, "Bad configuration option", 1, 0); @@ -588,7 +587,7 @@ void mconsole_remove(struct mc_request * int err, start, end, n; ptr += strlen("remove"); - while (isspace(*ptr)) ptr++; + ptr = skip_spaces(ptr); dev = mconsole_find_dev(ptr); if (dev == NULL) { mconsole_reply(req, "Bad remove option", 1, 0); @@ -712,7 +711,7 @@ void mconsole_sysrq(struct mc_request *r char *ptr = req->request.data; ptr += strlen("sysrq"); - while (isspace(*ptr)) ptr++; + ptr = skip_spaces(ptr); /* * With 'b', the system will shut down without a chance to reply, @@ -757,8 +756,7 @@ void mconsole_stack(struct mc_request *r */ ptr += strlen("stack"); - while (isspace(*ptr)) - ptr++; + ptr = skip_spaces(ptr); /* * Should really check for multiple pids or reject bad args here diff -uprN linux-2.6.32/arch/um/drivers/ubd_kern.c linux-2.6.32.ovz/arch/um/drivers/ubd_kern.c --- linux-2.6.32/arch/um/drivers/ubd_kern.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/um/drivers/ubd_kern.c 2015-03-10 13:21:17.000000000 -0700 @@ -747,7 +747,7 @@ static int ubd_open_dev(struct ubd *ubd_ ubd_dev->fd = fd; if(ubd_dev->cow.file != NULL){ - blk_queue_max_sectors(ubd_dev->queue, 8 * sizeof(long)); + blk_queue_max_hw_sectors(ubd_dev->queue, 8 * sizeof(long)); err = -ENOMEM; ubd_dev->cow.bitmap = vmalloc(ubd_dev->cow.bitmap_len); @@ -849,7 +849,7 @@ static int ubd_add(int n, char **error_o } ubd_dev->queue->queuedata = ubd_dev; - blk_queue_max_hw_segments(ubd_dev->queue, MAX_SG); + blk_queue_max_segments(ubd_dev->queue, MAX_SG); err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, &ubd_gendisk[n]); if(err){ *error_out = "Failed to register device"; diff -uprN linux-2.6.32/arch/um/kernel/syscall.c linux-2.6.32.ovz/arch/um/kernel/syscall.c --- linux-2.6.32/arch/um/kernel/syscall.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/um/kernel/syscall.c 2015-03-10 13:21:11.000000000 -0700 @@ -8,6 +8,7 @@ #include "linux/mm.h" #include "linux/sched.h" #include "linux/utsname.h" +#include "linux/syscalls.h" #include "asm/current.h" #include "asm/mman.h" #include "asm/uaccess.h" @@ -37,31 +38,6 @@ long sys_vfork(void) return ret; } -/* common code for old and new mmaps */ -long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - long error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); - out: - return error; -} - long old_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset) @@ -70,7 +46,7 @@ long old_mmap(unsigned long addr, unsign if (offset & ~PAGE_MASK) goto out; - err = sys_mmap2(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); + err = sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); out: return err; } diff -uprN linux-2.6.32/arch/um/sys-i386/asm/elf.h linux-2.6.32.ovz/arch/um/sys-i386/asm/elf.h --- linux-2.6.32/arch/um/sys-i386/asm/elf.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/um/sys-i386/asm/elf.h 2015-03-10 13:21:24.000000000 -0700 @@ -117,47 +117,4 @@ do { \ } \ } while (0) -/* - * These macros parameterize elf_core_dump in fs/binfmt_elf.c to write out - * extra segments containing the vsyscall DSO contents. Dumping its - * contents makes post-mortem fully interpretable later without matching up - * the same kernel and hardware config to see what PC values meant. - * Dumping its extra ELF program headers includes all the other information - * a debugger needs to easily find how the vsyscall DSO was being used. - */ -#define ELF_CORE_EXTRA_PHDRS \ - (vsyscall_ehdr ? (((struct elfhdr *)vsyscall_ehdr)->e_phnum) : 0 ) - -#define ELF_CORE_WRITE_EXTRA_PHDRS \ -if ( vsyscall_ehdr ) { \ - const struct elfhdr *const ehdrp = (struct elfhdr *)vsyscall_ehdr; \ - const struct elf_phdr *const phdrp = \ - (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); \ - int i; \ - Elf32_Off ofs = 0; \ - for (i = 0; i < ehdrp->e_phnum; ++i) { \ - struct elf_phdr phdr = phdrp[i]; \ - if (phdr.p_type == PT_LOAD) { \ - ofs = phdr.p_offset = offset; \ - offset += phdr.p_filesz; \ - } \ - else \ - phdr.p_offset += ofs; \ - phdr.p_paddr = 0; /* match other core phdrs */ \ - DUMP_WRITE(&phdr, sizeof(phdr)); \ - } \ -} -#define ELF_CORE_WRITE_EXTRA_DATA \ -if ( vsyscall_ehdr ) { \ - const struct elfhdr *const ehdrp = (struct elfhdr *)vsyscall_ehdr; \ - const struct elf_phdr *const phdrp = \ - (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); \ - int i; \ - for (i = 0; i < ehdrp->e_phnum; ++i) { \ - if (phdrp[i].p_type == PT_LOAD) \ - DUMP_WRITE((void *) phdrp[i].p_vaddr, \ - phdrp[i].p_filesz); \ - } \ -} - #endif diff -uprN linux-2.6.32/arch/um/sys-i386/elfcore.c linux-2.6.32.ovz/arch/um/sys-i386/elfcore.c --- linux-2.6.32/arch/um/sys-i386/elfcore.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/um/sys-i386/elfcore.c 2015-03-10 13:21:24.000000000 -0700 @@ -0,0 +1,83 @@ +#include +#include +#include +#include + +#include + + +Elf32_Half elf_core_extra_phdrs(void) +{ + return vsyscall_ehdr ? (((struct elfhdr *)vsyscall_ehdr)->e_phnum) : 0; +} + +int elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, + unsigned long limit) +{ + if ( vsyscall_ehdr ) { + const struct elfhdr *const ehdrp = + (struct elfhdr *) vsyscall_ehdr; + const struct elf_phdr *const phdrp = + (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); + int i; + Elf32_Off ofs = 0; + + for (i = 0; i < ehdrp->e_phnum; ++i) { + struct elf_phdr phdr = phdrp[i]; + + if (phdr.p_type == PT_LOAD) { + ofs = phdr.p_offset = offset; + offset += phdr.p_filesz; + } else { + phdr.p_offset += ofs; + } + phdr.p_paddr = 0; /* match other core phdrs */ + *size += sizeof(phdr); + if (*size > limit + || !dump_write(file, &phdr, sizeof(phdr))) + return 0; + } + } + return 1; +} + +int elf_core_write_extra_data(struct file *file, size_t *size, + unsigned long limit) +{ + if ( vsyscall_ehdr ) { + const struct elfhdr *const ehdrp = + (struct elfhdr *) vsyscall_ehdr; + const struct elf_phdr *const phdrp = + (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); + int i; + + for (i = 0; i < ehdrp->e_phnum; ++i) { + if (phdrp[i].p_type == PT_LOAD) { + void *addr = (void *) phdrp[i].p_vaddr; + size_t filesz = phdrp[i].p_filesz; + + *size += filesz; + if (*size > limit + || !dump_write(file, addr, filesz)) + return 0; + } + } + } + return 1; +} + +size_t elf_core_extra_data_size(void) +{ + if ( vsyscall_ehdr ) { + const struct elfhdr *const ehdrp = + (struct elfhdr *)vsyscall_ehdr; + const struct elf_phdr *const phdrp = + (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); + int i; + + for (i = 0; i < ehdrp->e_phnum; ++i) + if (phdrp[i].p_type == PT_LOAD) + return (size_t) phdrp[i].p_filesz; + } + return 0; +} diff -uprN linux-2.6.32/arch/um/sys-i386/Makefile linux-2.6.32.ovz/arch/um/sys-i386/Makefile --- linux-2.6.32/arch/um/sys-i386/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/um/sys-i386/Makefile 2015-03-10 13:21:24.000000000 -0700 @@ -6,6 +6,8 @@ obj-y = bug.o bugs.o checksum.o delay.o ptrace_user.o setjmp.o signal.o stub.o stub_segv.o syscalls.o sysrq.o \ sys_call_table.o tls.o +obj-$(CONFIG_BINFMT_ELF) += elfcore.o + subarch-obj-y = lib/semaphore_32.o lib/string_32.o subarch-obj-$(CONFIG_HIGHMEM) += mm/highmem_32.o subarch-obj-$(CONFIG_MODULES) += kernel/module.o diff -uprN linux-2.6.32/arch/um/sys-i386/shared/sysdep/syscalls.h linux-2.6.32.ovz/arch/um/sys-i386/shared/sysdep/syscalls.h --- linux-2.6.32/arch/um/sys-i386/shared/sysdep/syscalls.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/um/sys-i386/shared/sysdep/syscalls.h 2015-03-10 13:21:11.000000000 -0700 @@ -20,7 +20,3 @@ extern syscall_handler_t *sys_call_table #define EXECUTE_SYSCALL(syscall, regs) \ ((long (*)(struct syscall_args)) \ (*sys_call_table[syscall]))(SYSCALL_ARGS(®s->regs)) - -extern long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff); diff -uprN linux-2.6.32/arch/um/sys-x86_64/Makefile linux-2.6.32.ovz/arch/um/sys-x86_64/Makefile --- linux-2.6.32/arch/um/sys-x86_64/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/um/sys-x86_64/Makefile 2015-03-10 13:21:14.000000000 -0700 @@ -8,7 +8,8 @@ obj-y = bug.o bugs.o delay.o fault.o ldt setjmp.o signal.o stub.o stub_segv.o syscalls.o syscall_table.o \ sysrq.o ksyms.o tls.o -subarch-obj-y = lib/csum-partial_64.o lib/memcpy_64.o lib/thunk_64.o +subarch-obj-y = lib/csum-partial_64.o lib/memcpy_64.o lib/thunk_64.o \ + lib/rwsem_64.o subarch-obj-$(CONFIG_MODULES) += kernel/module.o ldt-y = ../sys-i386/ldt.o diff -uprN linux-2.6.32/arch/x86/boot/boot.h linux-2.6.32.ovz/arch/x86/boot/boot.h --- linux-2.6.32/arch/x86/boot/boot.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/boot/boot.h 2015-03-10 13:22:35.000000000 -0700 @@ -294,7 +294,7 @@ int cmdline_find_option_bool(const char struct cpu_features { int level; /* Family, or 64 for x86-64 */ int model; - u32 flags[NCAPINTS]; + u32 flags[RHNCAPINTS]; }; extern struct cpu_features cpu; int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); diff -uprN linux-2.6.32/arch/x86/boot/cpu.c linux-2.6.32.ovz/arch/x86/boot/cpu.c --- linux-2.6.32/arch/x86/boot/cpu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/boot/cpu.c 2015-03-10 13:22:35.000000000 -0700 @@ -55,7 +55,7 @@ int validate_cpu(void) msg_strs = (const unsigned char *)x86_cap_strs; - for (i = 0; i < NCAPINTS; i++) { + for (i = 0; i < RHNCAPINTS; i++) { u32 e = err_flags[i]; for (j = 0; j < 32; j++) { diff -uprN linux-2.6.32/arch/x86/boot/cpucheck.c linux-2.6.32.ovz/arch/x86/boot/cpucheck.c --- linux-2.6.32/arch/x86/boot/cpucheck.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/boot/cpucheck.c 2015-03-10 13:22:35.000000000 -0700 @@ -30,11 +30,11 @@ struct cpu_features cpu; static u32 cpu_vendor[3]; -static u32 err_flags[NCAPINTS]; +static u32 err_flags[RHNCAPINTS]; static const int req_level = CONFIG_X86_MINIMUM_CPU_FAMILY; -static const u32 req_flags[NCAPINTS] = +static const u32 req_flags[RHNCAPINTS] = { REQUIRED_MASK0, REQUIRED_MASK1, @@ -160,7 +160,7 @@ static int check_flags(void) int i; err = 0; - for (i = 0; i < NCAPINTS; i++) { + for (i = 0; i < RHNCAPINTS; i++) { err_flags[i] = req_flags[i] & ~cpu.flags[i]; if (err_flags[i]) err |= 1 << i; diff -uprN linux-2.6.32/arch/x86/boot/mkcpustr.c linux-2.6.32.ovz/arch/x86/boot/mkcpustr.c --- linux-2.6.32/arch/x86/boot/mkcpustr.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/boot/mkcpustr.c 2015-03-10 13:22:35.000000000 -0700 @@ -22,13 +22,13 @@ int main(void) int i, j; const char *str; - printf("static const char x86_cap_strs[] = \n"); + printf("static const char x86_cap_strs[] =\n"); - for (i = 0; i < NCAPINTS; i++) { + for (i = 0; i < RHNCAPINTS; i++) { for (j = 0; j < 32; j++) { str = x86_cap_flags[i*32+j]; - if (i == NCAPINTS-1 && j == 31) { + if (i == RHNCAPINTS-1 && j == 31) { /* The last entry must be unconditional; this also consumes the compiler-added null character */ diff -uprN linux-2.6.32/arch/x86/boot/video-vga.c linux-2.6.32.ovz/arch/x86/boot/video-vga.c --- linux-2.6.32/arch/x86/boot/video-vga.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/boot/video-vga.c 2015-03-10 13:22:51.000000000 -0700 @@ -41,13 +41,12 @@ static __videocard video_vga; static u8 vga_set_basic_mode(void) { struct biosregs ireg, oreg; - u16 ax; u8 rows; u8 mode; initregs(&ireg); - ax = 0x0f00; + ireg.ax = 0x0f00; intcall(0x10, &ireg, &oreg); mode = oreg.al; diff -uprN linux-2.6.32/arch/x86/configs/i386_defconfig linux-2.6.32.ovz/arch/x86/configs/i386_defconfig --- linux-2.6.32/arch/x86/configs/i386_defconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/configs/i386_defconfig 2015-03-10 13:22:22.000000000 -0700 @@ -463,7 +463,7 @@ CONFIG_ISA_DMA_API=y # CONFIG_MCA is not set # CONFIG_SCx200 is not set # CONFIG_OLPC is not set -CONFIG_K8_NB=y +CONFIG_AMD_NB=y CONFIG_PCCARD=y # CONFIG_PCMCIA_DEBUG is not set CONFIG_PCMCIA=y @@ -1471,6 +1471,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_CORETEMP is not set +# CONFIG_SENSORS_PKGTEMP is not set # CONFIG_SENSORS_IT87 is not set # CONFIG_SENSORS_LM63 is not set # CONFIG_SENSORS_LM75 is not set diff -uprN linux-2.6.32/arch/x86/configs/x86_64_defconfig linux-2.6.32.ovz/arch/x86/configs/x86_64_defconfig --- linux-2.6.32/arch/x86/configs/x86_64_defconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/configs/x86_64_defconfig 2015-03-10 13:22:22.000000000 -0700 @@ -293,7 +293,7 @@ CONFIG_X86_CPUID=y CONFIG_ARCH_PHYS_ADDR_T_64BIT=y CONFIG_DIRECT_GBPAGES=y CONFIG_NUMA=y -CONFIG_K8_NUMA=y +CONFIG_AMD_NUMA=y CONFIG_X86_64_ACPI_NUMA=y CONFIG_NODES_SPAN_OTHER_NODES=y # CONFIG_NUMA_EMU is not set @@ -455,7 +455,7 @@ CONFIG_PCI_MSI=y CONFIG_HT_IRQ=y # CONFIG_PCI_IOV is not set CONFIG_ISA_DMA_API=y -CONFIG_K8_NB=y +CONFIG_AMD_NB=y CONFIG_PCCARD=y # CONFIG_PCMCIA_DEBUG is not set CONFIG_PCMCIA=y @@ -1456,6 +1456,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_CORETEMP is not set +# CONFIG_SENSORS_PKGTEMP is not set # CONFIG_SENSORS_IT87 is not set # CONFIG_SENSORS_LM63 is not set # CONFIG_SENSORS_LM75 is not set diff -uprN linux-2.6.32/arch/x86/crypto/ablk_helper.c linux-2.6.32.ovz/arch/x86/crypto/ablk_helper.c --- linux-2.6.32/arch/x86/crypto/ablk_helper.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/ablk_helper.c 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,149 @@ +/* + * Shared async block cipher helpers + * + * Copyright (c) 2012 Jussi Kivilinna + * + * Based on aesni-intel_glue.c by: + * Copyright (C) 2008, Intel Corp. + * Author: Huang Ying + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, + unsigned int key_len) +{ + struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; + int err; + + crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) + & CRYPTO_TFM_REQ_MASK); + err = crypto_ablkcipher_setkey(child, key, key_len); + crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) + & CRYPTO_TFM_RES_MASK); + return err; +} +EXPORT_SYMBOL_GPL(ablk_set_key); + +int __ablk_encrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct blkcipher_desc desc; + + desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); + desc.info = req->info; + desc.flags = 0; + + return crypto_blkcipher_crt(desc.tfm)->encrypt( + &desc, req->dst, req->src, req->nbytes); +} +EXPORT_SYMBOL_GPL(__ablk_encrypt); + +int ablk_encrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); + + if (!irq_fpu_usable()) { + struct ablkcipher_request *cryptd_req = + ablkcipher_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + + return crypto_ablkcipher_encrypt(cryptd_req); + } else { + return __ablk_encrypt(req); + } +} +EXPORT_SYMBOL_GPL(ablk_encrypt); + +int ablk_decrypt(struct ablkcipher_request *req) +{ + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct async_helper_ctx *ctx = crypto_ablkcipher_ctx(tfm); + + if (!irq_fpu_usable()) { + struct ablkcipher_request *cryptd_req = + ablkcipher_request_ctx(req); + + memcpy(cryptd_req, req, sizeof(*req)); + ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + + return crypto_ablkcipher_decrypt(cryptd_req); + } else { + struct blkcipher_desc desc; + + desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); + desc.info = req->info; + desc.flags = 0; + + return crypto_blkcipher_crt(desc.tfm)->decrypt( + &desc, req->dst, req->src, req->nbytes); + } +} +EXPORT_SYMBOL_GPL(ablk_decrypt); + +void ablk_exit(struct crypto_tfm *tfm) +{ + struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm); + + cryptd_free_ablkcipher(ctx->cryptd_tfm); +} +EXPORT_SYMBOL_GPL(ablk_exit); + +int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name) +{ + struct async_helper_ctx *ctx = crypto_tfm_ctx(tfm); + struct cryptd_ablkcipher *cryptd_tfm; + + cryptd_tfm = cryptd_alloc_ablkcipher(drv_name, 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + + ctx->cryptd_tfm = cryptd_tfm; + tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + + crypto_ablkcipher_reqsize(&cryptd_tfm->base); + + return 0; +} +EXPORT_SYMBOL_GPL(ablk_init_common); + +int ablk_init(struct crypto_tfm *tfm) +{ + char drv_name[CRYPTO_MAX_ALG_NAME]; + + snprintf(drv_name, sizeof(drv_name), "__driver-%s", + crypto_tfm_alg_driver_name(tfm)); + + return ablk_init_common(tfm, drv_name); +} +EXPORT_SYMBOL_GPL(ablk_init); + +MODULE_LICENSE("GPL"); diff -uprN linux-2.6.32/arch/x86/crypto/aes_glue.c linux-2.6.32.ovz/arch/x86/crypto/aes_glue.c --- linux-2.6.32/arch/x86/crypto/aes_glue.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/aes_glue.c 2015-03-10 13:23:55.000000000 -0700 @@ -4,6 +4,7 @@ */ #include +#include asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); diff -uprN linux-2.6.32/arch/x86/crypto/aes-i586-asm_32.S linux-2.6.32.ovz/arch/x86/crypto/aes-i586-asm_32.S --- linux-2.6.32/arch/x86/crypto/aes-i586-asm_32.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/aes-i586-asm_32.S 2015-03-10 13:23:55.000000000 -0700 @@ -36,6 +36,7 @@ .file "aes-i586-asm.S" .text +#include #include #define tlen 1024 // length of each of 4 'xor' arrays (256 32-bit words) @@ -219,14 +220,10 @@ // AES (Rijndael) Encryption Subroutine /* void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */ -.global aes_enc_blk - .extern crypto_ft_tab .extern crypto_fl_tab -.align 4 - -aes_enc_blk: +ENTRY(aes_enc_blk) push %ebp mov ctx(%esp),%ebp @@ -290,18 +287,15 @@ aes_enc_blk: mov %r0,(%ebp) pop %ebp ret +ENDPROC(aes_enc_blk) // AES (Rijndael) Decryption Subroutine /* void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out_blk, const u8 *in_blk) */ -.global aes_dec_blk - .extern crypto_it_tab .extern crypto_il_tab -.align 4 - -aes_dec_blk: +ENTRY(aes_dec_blk) push %ebp mov ctx(%esp),%ebp @@ -365,3 +359,4 @@ aes_dec_blk: mov %r0,(%ebp) pop %ebp ret +ENDPROC(aes_dec_blk) diff -uprN linux-2.6.32/arch/x86/crypto/aesni-intel_asm.S linux-2.6.32.ovz/arch/x86/crypto/aesni-intel_asm.S --- linux-2.6.32/arch/x86/crypto/aesni-intel_asm.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/aesni-intel_asm.S 2015-03-10 13:24:15.000000000 -0700 @@ -9,6 +9,20 @@ * Vinodh Gopal * Kahraman Akdemir * + * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD + * interface for 64-bit kernels. + * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) + * Aidan O'Mahony (aidan.o.mahony@intel.com) + * Adrian Hoban + * James Guilford (james.guilford@intel.com) + * Gabriele Paoloni + * Tadeusz Struk (tadeusz.struk@intel.com) + * Wajdi Feghali (wajdi.k.feghali@intel.com) + * Copyright (c) 2010, Intel Corporation. + * + * Ported x86_64 version to x86: + * Author: Mathias Krause + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -16,9 +30,80 @@ */ #include +#include + +/* + * The following macros are used to move an (un)aligned 16 byte value to/from + * an XMM register. This can done for either FP or integer values, for FP use + * movaps (move aligned packed single) or integer use movdqa (move double quad + * aligned). It doesn't make a performance difference which instruction is used + * since Nehalem (original Core i7) was released. However, the movaps is a byte + * shorter, so that is the one we'll use for now. (same for unaligned). + */ +#define MOVADQ movaps +#define MOVUDQ movups + +#ifdef __x86_64__ + +.data +.align 16 +.Lgf128mul_x_ble_mask: + .octa 0x00000000000000010000000000000087 +POLY: .octa 0xC2000000000000000000000000000001 +TWOONE: .octa 0x00000001000000000000000000000001 + +# order of these constants should not change. +# more specifically, ALL_F should follow SHIFT_MASK, +# and ZERO should follow ALL_F + +SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F +MASK1: .octa 0x0000000000000000ffffffffffffffff +MASK2: .octa 0xffffffffffffffff0000000000000000 +SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 +ALL_F: .octa 0xffffffffffffffffffffffffffffffff +ZERO: .octa 0x00000000000000000000000000000000 +ONE: .octa 0x00000000000000000000000000000001 +F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 +dec: .octa 0x1 +enc: .octa 0x2 + .text + +#define STACK_OFFSET 8*3 +#define HashKey 16*0 // store HashKey <<1 mod poly here +#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here +#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here +#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here +#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64 + // bits of HashKey <<1 mod poly here + //(for Karatsuba purposes) +#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64 + // bits of HashKey^2 <<1 mod poly here + // (for Karatsuba purposes) +#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64 + // bits of HashKey^3 <<1 mod poly here + // (for Karatsuba purposes) +#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64 + // bits of HashKey^4 <<1 mod poly here + // (for Karatsuba purposes) +#define VARIABLE_OFFSET 16*8 + +#define arg1 rdi +#define arg2 rsi +#define arg3 rdx +#define arg4 rcx +#define arg5 r8 +#define arg6 r9 +#define arg7 STACK_OFFSET+8(%r14) +#define arg8 STACK_OFFSET+16(%r14) +#define arg9 STACK_OFFSET+24(%r14) +#define arg10 STACK_OFFSET+32(%r14) +#define keysize 2*15*16(%arg1) +#endif + + #define STATE1 %xmm0 #define STATE2 %xmm4 #define STATE3 %xmm5 @@ -32,8 +117,17 @@ #define KEY %xmm2 #define IV %xmm3 +#define BSWAP_MASK %xmm10 +#define CTR %xmm11 +#define INC %xmm12 + +#define GF128MUL_MASK %xmm10 + +#ifdef __x86_64__ +#define AREG %rax #define KEYP %rdi #define OUTP %rsi +#define UKEYP OUTP #define INP %rdx #define LEN %rcx #define IVP %r8 @@ -41,7 +135,1594 @@ #define T1 %r10 #define TKEYP T1 #define T2 %r11 +#define TCTR_LOW T2 +#else +#define AREG %eax +#define KEYP %edi +#define OUTP AREG +#define UKEYP OUTP +#define INP %edx +#define LEN %esi +#define IVP %ebp +#define KLEN %ebx +#define T1 %ecx +#define TKEYP T1 +#endif + + +#ifdef __x86_64__ +/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +* +* +* Input: A and B (128-bits each, bit-reflected) +* Output: C = A*B*x mod poly, (i.e. >>1 ) +* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +* +*/ +.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 + movdqa \GH, \TMP1 + pshufd $78, \GH, \TMP2 + pshufd $78, \HK, \TMP3 + pxor \GH, \TMP2 # TMP2 = a1+a0 + pxor \HK, \TMP3 # TMP3 = b1+b0 + PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1 + PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0 + PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) + pxor \GH, \TMP2 + pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) + movdqa \TMP2, \TMP3 + pslldq $8, \TMP3 # left shift TMP3 2 DWs + psrldq $8, \TMP2 # right shift TMP2 2 DWs + pxor \TMP3, \GH + pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK + + # first phase of the reduction + + movdqa \GH, \TMP2 + movdqa \GH, \TMP3 + movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 + # in in order to perform + # independent shifts + pslld $31, \TMP2 # packed right shift <<31 + pslld $30, \TMP3 # packed right shift <<30 + pslld $25, \TMP4 # packed right shift <<25 + pxor \TMP3, \TMP2 # xor the shifted versions + pxor \TMP4, \TMP2 + movdqa \TMP2, \TMP5 + psrldq $4, \TMP5 # right shift TMP5 1 DW + pslldq $12, \TMP2 # left shift TMP2 3 DWs + pxor \TMP2, \GH + + # second phase of the reduction + + movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 + # in in order to perform + # independent shifts + movdqa \GH,\TMP3 + movdqa \GH,\TMP4 + psrld $1,\TMP2 # packed left shift >>1 + psrld $2,\TMP3 # packed left shift >>2 + psrld $7,\TMP4 # packed left shift >>7 + pxor \TMP3,\TMP2 # xor the shifted versions + pxor \TMP4,\TMP2 + pxor \TMP5, \TMP2 + pxor \TMP2, \GH + pxor \TMP1, \GH # result is in TMP1 +.endm +/* +* if a = number of total plaintext bytes +* b = floor(a/16) +* num_initial_blocks = b mod 4 +* encrypt the initial num_initial_blocks blocks and apply ghash on +* the ciphertext +* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers +* are clobbered +* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified +*/ + + +.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ +XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation + MOVADQ SHUF_MASK(%rip), %xmm14 + mov arg7, %r10 # %r10 = AAD + mov arg8, %r12 # %r12 = aadLen + mov %r12, %r11 + pxor %xmm\i, %xmm\i + +_get_AAD_loop\num_initial_blocks\operation: + movd (%r10), \TMP1 + pslldq $12, \TMP1 + psrldq $4, %xmm\i + pxor \TMP1, %xmm\i + add $4, %r10 + sub $4, %r12 + jne _get_AAD_loop\num_initial_blocks\operation + + cmp $16, %r11 + je _get_AAD_loop2_done\num_initial_blocks\operation + + mov $16, %r12 +_get_AAD_loop2\num_initial_blocks\operation: + psrldq $4, %xmm\i + sub $4, %r12 + cmp %r11, %r12 + jne _get_AAD_loop2\num_initial_blocks\operation + +_get_AAD_loop2_done\num_initial_blocks\operation: + PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + + xor %r11, %r11 # initialise the data pointer offset as zero + + # start AES for num_initial_blocks blocks + + mov %arg5, %rax # %rax = *Y0 + movdqu (%rax), \XMM0 # XMM0 = Y0 + PSHUFB_XMM %xmm14, \XMM0 + +.if (\i == 5) || (\i == 6) || (\i == 7) + MOVADQ ONE(%RIP),\TMP1 + MOVADQ (%arg1),\TMP2 +.irpc index, \i_seq + paddd \TMP1, \XMM0 # INCR Y0 + movdqa \XMM0, %xmm\index + PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap + pxor \TMP2, %xmm\index +.endr + lea 0x10(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + add $5,%eax # 128->9, 192->11, 256->13 + +aes_loop_initial_dec\num_initial_blocks: + MOVADQ (%r10),\TMP1 +.irpc index, \i_seq + AESENC \TMP1, %xmm\index +.endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_initial_dec\num_initial_blocks + + MOVADQ (%r10), \TMP1 +.irpc index, \i_seq + AESENCLAST \TMP1, %xmm\index # Last Round +.endr +.irpc index, \i_seq + movdqu (%arg3 , %r11, 1), \TMP1 + pxor \TMP1, %xmm\index + movdqu %xmm\index, (%arg2 , %r11, 1) + # write back plaintext/ciphertext for num_initial_blocks + add $16, %r11 + + movdqa \TMP1, %xmm\index + PSHUFB_XMM %xmm14, %xmm\index + # prepare plaintext/ciphertext for GHASH computation +.endr +.endif + GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + # apply GHASH on num_initial_blocks blocks + +.if \i == 5 + pxor %xmm5, %xmm6 + GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + pxor %xmm6, %xmm7 + GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + pxor %xmm7, %xmm8 + GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +.elseif \i == 6 + pxor %xmm6, %xmm7 + GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + pxor %xmm7, %xmm8 + GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +.elseif \i == 7 + pxor %xmm7, %xmm8 + GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +.endif + cmp $64, %r13 + jl _initial_blocks_done\num_initial_blocks\operation + # no need for precomputed values +/* +* +* Precomputations for HashKey parallel with encryption of first 4 blocks. +* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i +*/ + MOVADQ ONE(%rip), \TMP1 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM1 + PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap + + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM2 + PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap + + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM3 + PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap + + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM4 + PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap + + MOVADQ 0(%arg1),\TMP1 + pxor \TMP1, \XMM1 + pxor \TMP1, \XMM2 + pxor \TMP1, \XMM3 + pxor \TMP1, \XMM4 + movdqa \TMP3, \TMP5 + pshufd $78, \TMP3, \TMP1 + pxor \TMP3, \TMP1 + movdqa \TMP1, HashKey_k(%rsp) + GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +# TMP5 = HashKey^2<<1 (mod poly) + movdqa \TMP5, HashKey_2(%rsp) +# HashKey_2 = HashKey^2<<1 (mod poly) + pshufd $78, \TMP5, \TMP1 + pxor \TMP5, \TMP1 + movdqa \TMP1, HashKey_2_k(%rsp) +.irpc index, 1234 # do 4 rounds + movaps 0x10*\index(%arg1), \TMP1 + AESENC \TMP1, \XMM1 + AESENC \TMP1, \XMM2 + AESENC \TMP1, \XMM3 + AESENC \TMP1, \XMM4 +.endr + GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +# TMP5 = HashKey^3<<1 (mod poly) + movdqa \TMP5, HashKey_3(%rsp) + pshufd $78, \TMP5, \TMP1 + pxor \TMP5, \TMP1 + movdqa \TMP1, HashKey_3_k(%rsp) +.irpc index, 56789 # do next 5 rounds + movaps 0x10*\index(%arg1), \TMP1 + AESENC \TMP1, \XMM1 + AESENC \TMP1, \XMM2 + AESENC \TMP1, \XMM3 + AESENC \TMP1, \XMM4 +.endr + GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +# TMP5 = HashKey^3<<1 (mod poly) + movdqa \TMP5, HashKey_4(%rsp) + pshufd $78, \TMP5, \TMP1 + pxor \TMP5, \TMP1 + movdqa \TMP1, HashKey_4_k(%rsp) + lea 0xa0(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + sub $4,%eax # 128->0, 192->2, 256->4 + jz aes_loop_pre_dec_done\num_initial_blocks + +aes_loop_pre_dec\num_initial_blocks: + MOVADQ (%r10),\TMP2 +.irpc index, 1234 + AESENC \TMP2, %xmm\index +.endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_pre_dec\num_initial_blocks + +aes_loop_pre_dec_done\num_initial_blocks: + MOVADQ (%r10), \TMP2 + AESENCLAST \TMP2, \XMM1 + AESENCLAST \TMP2, \XMM2 + AESENCLAST \TMP2, \XMM3 + AESENCLAST \TMP2, \XMM4 + movdqu 16*0(%arg3 , %r11 , 1), \TMP1 + pxor \TMP1, \XMM1 + movdqu \XMM1, 16*0(%arg2 , %r11 , 1) + movdqa \TMP1, \XMM1 + movdqu 16*1(%arg3 , %r11 , 1), \TMP1 + pxor \TMP1, \XMM2 + movdqu \XMM2, 16*1(%arg2 , %r11 , 1) + movdqa \TMP1, \XMM2 + movdqu 16*2(%arg3 , %r11 , 1), \TMP1 + pxor \TMP1, \XMM3 + movdqu \XMM3, 16*2(%arg2 , %r11 , 1) + movdqa \TMP1, \XMM3 + movdqu 16*3(%arg3 , %r11 , 1), \TMP1 + pxor \TMP1, \XMM4 + movdqu \XMM4, 16*3(%arg2 , %r11 , 1) + movdqa \TMP1, \XMM4 + add $64, %r11 + PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap + pxor \XMMDst, \XMM1 +# combine GHASHed value with the corresponding ciphertext + PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap + PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap + PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap + +_initial_blocks_done\num_initial_blocks\operation: + +.endm + + +/* +* if a = number of total plaintext bytes +* b = floor(a/16) +* num_initial_blocks = b mod 4 +* encrypt the initial num_initial_blocks blocks and apply ghash on +* the ciphertext +* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers +* are clobbered +* arg1, %arg2, %arg3, %r14 are used as a pointer only, not modified +*/ + + +.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ +XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation + MOVADQ SHUF_MASK(%rip), %xmm14 + mov arg7, %r10 # %r10 = AAD + mov arg8, %r12 # %r12 = aadLen + mov %r12, %r11 + pxor %xmm\i, %xmm\i +_get_AAD_loop\num_initial_blocks\operation: + movd (%r10), \TMP1 + pslldq $12, \TMP1 + psrldq $4, %xmm\i + pxor \TMP1, %xmm\i + add $4, %r10 + sub $4, %r12 + jne _get_AAD_loop\num_initial_blocks\operation + cmp $16, %r11 + je _get_AAD_loop2_done\num_initial_blocks\operation + mov $16, %r12 +_get_AAD_loop2\num_initial_blocks\operation: + psrldq $4, %xmm\i + sub $4, %r12 + cmp %r11, %r12 + jne _get_AAD_loop2\num_initial_blocks\operation +_get_AAD_loop2_done\num_initial_blocks\operation: + PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data + + xor %r11, %r11 # initialise the data pointer offset as zero + + # start AES for num_initial_blocks blocks + + mov %arg5, %rax # %rax = *Y0 + movdqu (%rax), \XMM0 # XMM0 = Y0 + PSHUFB_XMM %xmm14, \XMM0 + +.if (\i == 5) || (\i == 6) || (\i == 7) + + MOVADQ ONE(%RIP),\TMP1 + MOVADQ 0(%arg1),\TMP2 +.irpc index, \i_seq + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, %xmm\index + PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap + pxor \TMP2, %xmm\index +.endr + lea 0x10(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + add $5,%eax # 128->9, 192->11, 256->13 + +aes_loop_initial_enc\num_initial_blocks: + MOVADQ (%r10),\TMP1 +.irpc index, \i_seq + AESENC \TMP1, %xmm\index +.endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_initial_enc\num_initial_blocks + + MOVADQ (%r10), \TMP1 +.irpc index, \i_seq + AESENCLAST \TMP1, %xmm\index # Last Round +.endr +.irpc index, \i_seq + movdqu (%arg3 , %r11, 1), \TMP1 + pxor \TMP1, %xmm\index + movdqu %xmm\index, (%arg2 , %r11, 1) + # write back plaintext/ciphertext for num_initial_blocks + add $16, %r11 + PSHUFB_XMM %xmm14, %xmm\index + + # prepare plaintext/ciphertext for GHASH computation +.endr +.endif + GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + # apply GHASH on num_initial_blocks blocks + +.if \i == 5 + pxor %xmm5, %xmm6 + GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + pxor %xmm6, %xmm7 + GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + pxor %xmm7, %xmm8 + GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +.elseif \i == 6 + pxor %xmm6, %xmm7 + GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 + pxor %xmm7, %xmm8 + GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +.elseif \i == 7 + pxor %xmm7, %xmm8 + GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +.endif + cmp $64, %r13 + jl _initial_blocks_done\num_initial_blocks\operation + # no need for precomputed values +/* +* +* Precomputations for HashKey parallel with encryption of first 4 blocks. +* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i +*/ + MOVADQ ONE(%RIP),\TMP1 + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM1 + PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap + + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM2 + PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap + + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM3 + PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap + + paddd \TMP1, \XMM0 # INCR Y0 + MOVADQ \XMM0, \XMM4 + PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap + + MOVADQ 0(%arg1),\TMP1 + pxor \TMP1, \XMM1 + pxor \TMP1, \XMM2 + pxor \TMP1, \XMM3 + pxor \TMP1, \XMM4 + movdqa \TMP3, \TMP5 + pshufd $78, \TMP3, \TMP1 + pxor \TMP3, \TMP1 + movdqa \TMP1, HashKey_k(%rsp) + GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +# TMP5 = HashKey^2<<1 (mod poly) + movdqa \TMP5, HashKey_2(%rsp) +# HashKey_2 = HashKey^2<<1 (mod poly) + pshufd $78, \TMP5, \TMP1 + pxor \TMP5, \TMP1 + movdqa \TMP1, HashKey_2_k(%rsp) +.irpc index, 1234 # do 4 rounds + movaps 0x10*\index(%arg1), \TMP1 + AESENC \TMP1, \XMM1 + AESENC \TMP1, \XMM2 + AESENC \TMP1, \XMM3 + AESENC \TMP1, \XMM4 +.endr + GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +# TMP5 = HashKey^3<<1 (mod poly) + movdqa \TMP5, HashKey_3(%rsp) + pshufd $78, \TMP5, \TMP1 + pxor \TMP5, \TMP1 + movdqa \TMP1, HashKey_3_k(%rsp) +.irpc index, 56789 # do next 5 rounds + movaps 0x10*\index(%arg1), \TMP1 + AESENC \TMP1, \XMM1 + AESENC \TMP1, \XMM2 + AESENC \TMP1, \XMM3 + AESENC \TMP1, \XMM4 +.endr + GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +# TMP5 = HashKey^3<<1 (mod poly) + movdqa \TMP5, HashKey_4(%rsp) + pshufd $78, \TMP5, \TMP1 + pxor \TMP5, \TMP1 + movdqa \TMP1, HashKey_4_k(%rsp) + lea 0xa0(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + sub $4,%eax # 128->0, 192->2, 256->4 + jz aes_loop_pre_enc_done\num_initial_blocks + +aes_loop_pre_enc\num_initial_blocks: + MOVADQ (%r10),\TMP2 +.irpc index, 1234 + AESENC \TMP2, %xmm\index +.endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_pre_enc\num_initial_blocks + +aes_loop_pre_enc_done\num_initial_blocks: + MOVADQ (%r10), \TMP2 + AESENCLAST \TMP2, \XMM1 + AESENCLAST \TMP2, \XMM2 + AESENCLAST \TMP2, \XMM3 + AESENCLAST \TMP2, \XMM4 + movdqu 16*0(%arg3 , %r11 , 1), \TMP1 + pxor \TMP1, \XMM1 + movdqu 16*1(%arg3 , %r11 , 1), \TMP1 + pxor \TMP1, \XMM2 + movdqu 16*2(%arg3 , %r11 , 1), \TMP1 + pxor \TMP1, \XMM3 + movdqu 16*3(%arg3 , %r11 , 1), \TMP1 + pxor \TMP1, \XMM4 + movdqu \XMM1, 16*0(%arg2 , %r11 , 1) + movdqu \XMM2, 16*1(%arg2 , %r11 , 1) + movdqu \XMM3, 16*2(%arg2 , %r11 , 1) + movdqu \XMM4, 16*3(%arg2 , %r11 , 1) + + add $64, %r11 + PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap + pxor \XMMDst, \XMM1 +# combine GHASHed value with the corresponding ciphertext + PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap + PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap + PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap + +_initial_blocks_done\num_initial_blocks\operation: + +.endm + +/* +* encrypt 4 blocks at a time +* ghash the 4 previously encrypted ciphertext blocks +* arg1, %arg2, %arg3 are used as pointers only, not modified +* %r11 is the data offset value +*/ +.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \ +TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation + + movdqa \XMM1, \XMM5 + movdqa \XMM2, \XMM6 + movdqa \XMM3, \XMM7 + movdqa \XMM4, \XMM8 + + movdqa SHUF_MASK(%rip), %xmm15 + # multiply TMP5 * HashKey using karatsuba + + movdqa \XMM5, \TMP4 + pshufd $78, \XMM5, \TMP6 + pxor \XMM5, \TMP6 + paddd ONE(%rip), \XMM0 # INCR CNT + movdqa HashKey_4(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 + movdqa \XMM0, \XMM1 + paddd ONE(%rip), \XMM0 # INCR CNT + movdqa \XMM0, \XMM2 + paddd ONE(%rip), \XMM0 # INCR CNT + movdqa \XMM0, \XMM3 + paddd ONE(%rip), \XMM0 # INCR CNT + movdqa \XMM0, \XMM4 + PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap + PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 + PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + + pxor (%arg1), \XMM1 + pxor (%arg1), \XMM2 + pxor (%arg1), \XMM3 + pxor (%arg1), \XMM4 + movdqa HashKey_4_k(%rsp), \TMP5 + PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) + movaps 0x10(%arg1), \TMP1 + AESENC \TMP1, \XMM1 # Round 1 + AESENC \TMP1, \XMM2 + AESENC \TMP1, \XMM3 + AESENC \TMP1, \XMM4 + movaps 0x20(%arg1), \TMP1 + AESENC \TMP1, \XMM1 # Round 2 + AESENC \TMP1, \XMM2 + AESENC \TMP1, \XMM3 + AESENC \TMP1, \XMM4 + movdqa \XMM6, \TMP1 + pshufd $78, \XMM6, \TMP2 + pxor \XMM6, \TMP2 + movdqa HashKey_3(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 + movaps 0x30(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 3 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 + movaps 0x40(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 4 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + movdqa HashKey_3_k(%rsp), \TMP5 + PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + movaps 0x50(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 5 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + pxor \TMP1, \TMP4 +# accumulate the results in TMP4:XMM5, TMP6 holds the middle part + pxor \XMM6, \XMM5 + pxor \TMP2, \TMP6 + movdqa \XMM7, \TMP1 + pshufd $78, \XMM7, \TMP2 + pxor \XMM7, \TMP2 + movdqa HashKey_2(%rsp ), \TMP5 + + # Multiply TMP5 * HashKey using karatsuba + + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + movaps 0x60(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 6 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 + movaps 0x70(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 7 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + movdqa HashKey_2_k(%rsp), \TMP5 + PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + movaps 0x80(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 8 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + pxor \TMP1, \TMP4 +# accumulate the results in TMP4:XMM5, TMP6 holds the middle part + pxor \XMM7, \XMM5 + pxor \TMP2, \TMP6 + + # Multiply XMM8 * HashKey + # XMM8 and TMP5 hold the values for the two operands + + movdqa \XMM8, \TMP1 + pshufd $78, \XMM8, \TMP2 + pxor \XMM8, \TMP2 + movdqa HashKey(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + movaps 0x90(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 9 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 + lea 0xa0(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + sub $4,%eax # 128->0, 192->2, 256->4 + jz aes_loop_par_enc_done + +aes_loop_par_enc: + MOVADQ (%r10),\TMP3 +.irpc index, 1234 + AESENC \TMP3, %xmm\index +.endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_par_enc + +aes_loop_par_enc_done: + MOVADQ (%r10), \TMP3 + AESENCLAST \TMP3, \XMM1 # Round 10 + AESENCLAST \TMP3, \XMM2 + AESENCLAST \TMP3, \XMM3 + AESENCLAST \TMP3, \XMM4 + movdqa HashKey_k(%rsp), \TMP5 + PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + movdqu (%arg3,%r11,1), \TMP3 + pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK + movdqu 16(%arg3,%r11,1), \TMP3 + pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK + movdqu 32(%arg3,%r11,1), \TMP3 + pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK + movdqu 48(%arg3,%r11,1), \TMP3 + pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK + movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer + movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer + movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer + movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer + PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + + pxor \TMP4, \TMP1 + pxor \XMM8, \XMM5 + pxor \TMP6, \TMP2 + pxor \TMP1, \TMP2 + pxor \XMM5, \TMP2 + movdqa \TMP2, \TMP3 + pslldq $8, \TMP3 # left shift TMP3 2 DWs + psrldq $8, \TMP2 # right shift TMP2 2 DWs + pxor \TMP3, \XMM5 + pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 + + # first phase of reduction + + movdqa \XMM5, \TMP2 + movdqa \XMM5, \TMP3 + movdqa \XMM5, \TMP4 +# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently + pslld $31, \TMP2 # packed right shift << 31 + pslld $30, \TMP3 # packed right shift << 30 + pslld $25, \TMP4 # packed right shift << 25 + pxor \TMP3, \TMP2 # xor the shifted versions + pxor \TMP4, \TMP2 + movdqa \TMP2, \TMP5 + psrldq $4, \TMP5 # right shift T5 1 DW + pslldq $12, \TMP2 # left shift T2 3 DWs + pxor \TMP2, \XMM5 + + # second phase of reduction + + movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 + movdqa \XMM5,\TMP3 + movdqa \XMM5,\TMP4 + psrld $1, \TMP2 # packed left shift >>1 + psrld $2, \TMP3 # packed left shift >>2 + psrld $7, \TMP4 # packed left shift >>7 + pxor \TMP3,\TMP2 # xor the shifted versions + pxor \TMP4,\TMP2 + pxor \TMP5, \TMP2 + pxor \TMP2, \XMM5 + pxor \TMP1, \XMM5 # result is in TMP1 + + pxor \XMM5, \XMM1 +.endm + +/* +* decrypt 4 blocks at a time +* ghash the 4 previously decrypted ciphertext blocks +* arg1, %arg2, %arg3 are used as pointers only, not modified +* %r11 is the data offset value +*/ +.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \ +TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation + + movdqa \XMM1, \XMM5 + movdqa \XMM2, \XMM6 + movdqa \XMM3, \XMM7 + movdqa \XMM4, \XMM8 + + movdqa SHUF_MASK(%rip), %xmm15 + # multiply TMP5 * HashKey using karatsuba + + movdqa \XMM5, \TMP4 + pshufd $78, \XMM5, \TMP6 + pxor \XMM5, \TMP6 + paddd ONE(%rip), \XMM0 # INCR CNT + movdqa HashKey_4(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1 + movdqa \XMM0, \XMM1 + paddd ONE(%rip), \XMM0 # INCR CNT + movdqa \XMM0, \XMM2 + paddd ONE(%rip), \XMM0 # INCR CNT + movdqa \XMM0, \XMM3 + paddd ONE(%rip), \XMM0 # INCR CNT + movdqa \XMM0, \XMM4 + PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap + PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0 + PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + + pxor (%arg1), \XMM1 + pxor (%arg1), \XMM2 + pxor (%arg1), \XMM3 + pxor (%arg1), \XMM4 + movdqa HashKey_4_k(%rsp), \TMP5 + PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) + movaps 0x10(%arg1), \TMP1 + AESENC \TMP1, \XMM1 # Round 1 + AESENC \TMP1, \XMM2 + AESENC \TMP1, \XMM3 + AESENC \TMP1, \XMM4 + movaps 0x20(%arg1), \TMP1 + AESENC \TMP1, \XMM1 # Round 2 + AESENC \TMP1, \XMM2 + AESENC \TMP1, \XMM3 + AESENC \TMP1, \XMM4 + movdqa \XMM6, \TMP1 + pshufd $78, \XMM6, \TMP2 + pxor \XMM6, \TMP2 + movdqa HashKey_3(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 + movaps 0x30(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 3 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0 + movaps 0x40(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 4 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + movdqa HashKey_3_k(%rsp), \TMP5 + PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + movaps 0x50(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 5 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + pxor \TMP1, \TMP4 +# accumulate the results in TMP4:XMM5, TMP6 holds the middle part + pxor \XMM6, \XMM5 + pxor \TMP2, \TMP6 + movdqa \XMM7, \TMP1 + pshufd $78, \XMM7, \TMP2 + pxor \XMM7, \TMP2 + movdqa HashKey_2(%rsp ), \TMP5 + + # Multiply TMP5 * HashKey using karatsuba + + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + movaps 0x60(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 6 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0 + movaps 0x70(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 7 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + movdqa HashKey_2_k(%rsp), \TMP5 + PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + movaps 0x80(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 8 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + pxor \TMP1, \TMP4 +# accumulate the results in TMP4:XMM5, TMP6 holds the middle part + pxor \XMM7, \XMM5 + pxor \TMP2, \TMP6 + + # Multiply XMM8 * HashKey + # XMM8 and TMP5 hold the values for the two operands + + movdqa \XMM8, \TMP1 + pshufd $78, \XMM8, \TMP2 + pxor \XMM8, \TMP2 + movdqa HashKey(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + movaps 0x90(%arg1), \TMP3 + AESENC \TMP3, \XMM1 # Round 9 + AESENC \TMP3, \XMM2 + AESENC \TMP3, \XMM3 + AESENC \TMP3, \XMM4 + PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0 + lea 0xa0(%arg1),%r10 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + sub $4,%eax # 128->0, 192->2, 256->4 + jz aes_loop_par_dec_done + +aes_loop_par_dec: + MOVADQ (%r10),\TMP3 +.irpc index, 1234 + AESENC \TMP3, %xmm\index +.endr + add $16,%r10 + sub $1,%eax + jnz aes_loop_par_dec + +aes_loop_par_dec_done: + MOVADQ (%r10), \TMP3 + AESENCLAST \TMP3, \XMM1 # last round + AESENCLAST \TMP3, \XMM2 + AESENCLAST \TMP3, \XMM3 + AESENCLAST \TMP3, \XMM4 + movdqa HashKey_k(%rsp), \TMP5 + PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + movdqu (%arg3,%r11,1), \TMP3 + pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK + movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer + movdqa \TMP3, \XMM1 + movdqu 16(%arg3,%r11,1), \TMP3 + pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK + movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer + movdqa \TMP3, \XMM2 + movdqu 32(%arg3,%r11,1), \TMP3 + pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK + movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer + movdqa \TMP3, \XMM3 + movdqu 48(%arg3,%r11,1), \TMP3 + pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK + movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer + movdqa \TMP3, \XMM4 + PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap + PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap + + pxor \TMP4, \TMP1 + pxor \XMM8, \XMM5 + pxor \TMP6, \TMP2 + pxor \TMP1, \TMP2 + pxor \XMM5, \TMP2 + movdqa \TMP2, \TMP3 + pslldq $8, \TMP3 # left shift TMP3 2 DWs + psrldq $8, \TMP2 # right shift TMP2 2 DWs + pxor \TMP3, \XMM5 + pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 + + # first phase of reduction + + movdqa \XMM5, \TMP2 + movdqa \XMM5, \TMP3 + movdqa \XMM5, \TMP4 +# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently + pslld $31, \TMP2 # packed right shift << 31 + pslld $30, \TMP3 # packed right shift << 30 + pslld $25, \TMP4 # packed right shift << 25 + pxor \TMP3, \TMP2 # xor the shifted versions + pxor \TMP4, \TMP2 + movdqa \TMP2, \TMP5 + psrldq $4, \TMP5 # right shift T5 1 DW + pslldq $12, \TMP2 # left shift T2 3 DWs + pxor \TMP2, \XMM5 + + # second phase of reduction + + movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 + movdqa \XMM5,\TMP3 + movdqa \XMM5,\TMP4 + psrld $1, \TMP2 # packed left shift >>1 + psrld $2, \TMP3 # packed left shift >>2 + psrld $7, \TMP4 # packed left shift >>7 + pxor \TMP3,\TMP2 # xor the shifted versions + pxor \TMP4,\TMP2 + pxor \TMP5, \TMP2 + pxor \TMP2, \XMM5 + pxor \TMP1, \XMM5 # result is in TMP1 + + pxor \XMM5, \XMM1 +.endm + +/* GHASH the last 4 ciphertext blocks. */ +.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ +TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst + + # Multiply TMP6 * HashKey (using Karatsuba) + + movdqa \XMM1, \TMP6 + pshufd $78, \XMM1, \TMP2 + pxor \XMM1, \TMP2 + movdqa HashKey_4(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1 + PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0 + movdqa HashKey_4_k(%rsp), \TMP4 + PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + movdqa \XMM1, \XMMDst + movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 + + # Multiply TMP1 * HashKey (using Karatsuba) + + movdqa \XMM2, \TMP1 + pshufd $78, \XMM2, \TMP2 + pxor \XMM2, \TMP2 + movdqa HashKey_3(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0 + movdqa HashKey_3_k(%rsp), \TMP4 + PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pxor \TMP1, \TMP6 + pxor \XMM2, \XMMDst + pxor \TMP2, \XMM1 +# results accumulated in TMP6, XMMDst, XMM1 + + # Multiply TMP1 * HashKey (using Karatsuba) + + movdqa \XMM3, \TMP1 + pshufd $78, \XMM3, \TMP2 + pxor \XMM3, \TMP2 + movdqa HashKey_2(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0 + movdqa HashKey_2_k(%rsp), \TMP4 + PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pxor \TMP1, \TMP6 + pxor \XMM3, \XMMDst + pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 + + # Multiply TMP1 * HashKey (using Karatsuba) + movdqa \XMM4, \TMP1 + pshufd $78, \XMM4, \TMP2 + pxor \XMM4, \TMP2 + movdqa HashKey(%rsp), \TMP5 + PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1 + PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0 + movdqa HashKey_k(%rsp), \TMP4 + PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) + pxor \TMP1, \TMP6 + pxor \XMM4, \XMMDst + pxor \XMM1, \TMP2 + pxor \TMP6, \TMP2 + pxor \XMMDst, \TMP2 + # middle section of the temp results combined as in karatsuba algorithm + movdqa \TMP2, \TMP4 + pslldq $8, \TMP4 # left shift TMP4 2 DWs + psrldq $8, \TMP2 # right shift TMP2 2 DWs + pxor \TMP4, \XMMDst + pxor \TMP2, \TMP6 +# TMP6:XMMDst holds the result of the accumulated carry-less multiplications + # first phase of the reduction + movdqa \XMMDst, \TMP2 + movdqa \XMMDst, \TMP3 + movdqa \XMMDst, \TMP4 +# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently + pslld $31, \TMP2 # packed right shifting << 31 + pslld $30, \TMP3 # packed right shifting << 30 + pslld $25, \TMP4 # packed right shifting << 25 + pxor \TMP3, \TMP2 # xor the shifted versions + pxor \TMP4, \TMP2 + movdqa \TMP2, \TMP7 + psrldq $4, \TMP7 # right shift TMP7 1 DW + pslldq $12, \TMP2 # left shift TMP2 3 DWs + pxor \TMP2, \XMMDst + + # second phase of the reduction + movdqa \XMMDst, \TMP2 + # make 3 copies of XMMDst for doing 3 shift operations + movdqa \XMMDst, \TMP3 + movdqa \XMMDst, \TMP4 + psrld $1, \TMP2 # packed left shift >> 1 + psrld $2, \TMP3 # packed left shift >> 2 + psrld $7, \TMP4 # packed left shift >> 7 + pxor \TMP3, \TMP2 # xor the shifted versions + pxor \TMP4, \TMP2 + pxor \TMP7, \TMP2 + pxor \TMP2, \XMMDst + pxor \TMP6, \XMMDst # reduced result is in XMMDst +.endm + + +/* Encryption of a single block +* uses eax & r10 +*/ + +.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 + + pxor (%arg1), \XMM0 + mov keysize,%eax + shr $2,%eax # 128->4, 192->6, 256->8 + add $5,%eax # 128->9, 192->11, 256->13 + lea 16(%arg1), %r10 # get first expanded key address + +_esb_loop_\@: + MOVADQ (%r10),\TMP1 + AESENC \TMP1,\XMM0 + add $16,%r10 + sub $1,%eax + jnz _esb_loop_\@ + + MOVADQ (%r10),\TMP1 + AESENCLAST \TMP1,\XMM0 +.endm +/***************************************************************************** +* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. +* u8 *out, // Plaintext output. Encrypt in-place is allowed. +* const u8 *in, // Ciphertext input +* u64 plaintext_len, // Length of data in bytes for decryption. +* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) +* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) +* // concatenated with 0x00000001. 16-byte aligned pointer. +* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. +* const u8 *aad, // Additional Authentication Data (AAD) +* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes +* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the +* // given authentication tag and only return the plaintext if they match. +* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 +* // (most likely), 12 or 8. +* +* Assumptions: +* +* keys: +* keys are pre-expanded and aligned to 16 bytes. we are using the first +* set of 11 keys in the data structure void *aes_ctx +* +* iv: +* 0 1 2 3 +* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | Salt (From the SA) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | Initialization Vector | +* | (This is the sequence number from IPSec header) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 0x1 | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* +* +* +* AAD: +* AAD padded to 128 bits with 0 +* for example, assume AAD is a u32 vector +* +* if AAD is 8 bytes: +* AAD[3] = {A0, A1}; +* padded AAD in xmm register = {A1 A0 0 0} +* +* 0 1 2 3 +* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | SPI (A1) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 32-bit Sequence Number (A0) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 0x0 | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* +* AAD Format with 32-bit Sequence Number +* +* if AAD is 12 bytes: +* AAD[3] = {A0, A1, A2}; +* padded AAD in xmm register = {A2 A1 A0 0} +* +* 0 1 2 3 +* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | SPI (A2) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 64-bit Extended Sequence Number {A1,A0} | +* | | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 0x0 | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* +* AAD Format with 64-bit Extended Sequence Number +* +* aadLen: +* from the definition of the spec, aadLen can only be 8 or 12 bytes. +* The code supports 16 too but for other sizes, the code will fail. +* +* TLen: +* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +* For other sizes, the code will fail. +* +* poly = x^128 + x^127 + x^126 + x^121 + 1 +* +*****************************************************************************/ +ENTRY(aesni_gcm_dec) + push %r12 + push %r13 + push %r14 + mov %rsp, %r14 +/* +* states of %xmm registers %xmm6:%xmm15 not saved +* all %xmm registers are clobbered +*/ + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp # align rsp to 64 bytes + mov %arg6, %r12 + movdqu (%r12), %xmm13 # %xmm13 = HashKey + movdqa SHUF_MASK(%rip), %xmm2 + PSHUFB_XMM %xmm2, %xmm13 + + +# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH) + + movdqa %xmm13, %xmm2 + psllq $1, %xmm13 + psrlq $63, %xmm2 + movdqa %xmm2, %xmm1 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + por %xmm2, %xmm13 + + # Reduction + + pshufd $0x24, %xmm1, %xmm2 + pcmpeqd TWOONE(%rip), %xmm2 + pand POLY(%rip), %xmm2 + pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly) + + + # Decrypt first few blocks + + movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly) + mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext + and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) + mov %r13, %r12 + and $(3<<4), %r12 + jz _initial_num_blocks_is_0_decrypt + cmp $(2<<4), %r12 + jb _initial_num_blocks_is_1_decrypt + je _initial_num_blocks_is_2_decrypt +_initial_num_blocks_is_3_decrypt: + INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec + sub $48, %r13 + jmp _initial_blocks_decrypted +_initial_num_blocks_is_2_decrypt: + INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec + sub $32, %r13 + jmp _initial_blocks_decrypted +_initial_num_blocks_is_1_decrypt: + INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec + sub $16, %r13 + jmp _initial_blocks_decrypted +_initial_num_blocks_is_0_decrypt: + INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec +_initial_blocks_decrypted: + cmp $0, %r13 + je _zero_cipher_left_decrypt + sub $64, %r13 + je _four_cipher_left_decrypt +_decrypt_by_4: + GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ +%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec + add $64, %r11 + sub $64, %r13 + jne _decrypt_by_4 +_four_cipher_left_decrypt: + GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ +%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 +_zero_cipher_left_decrypt: + mov %arg4, %r13 + and $15, %r13 # %r13 = arg4 (mod 16) + je _multiple_of_16_bytes_decrypt + + # Handle the last <16 byte block seperately + + paddd ONE(%rip), %xmm0 # increment CNT to get Yn + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10, %xmm0 + + ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn) + sub $16, %r11 + add %r13, %r11 + movdqu (%arg3,%r11,1), %xmm1 # recieve the last <16 byte block + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 +# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes +# (%r13 is the number of bytes in plaintext mod 16) + movdqu (%r12), %xmm2 # get the appropriate shuffle mask + PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 butes + + movdqa %xmm1, %xmm2 + pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn) + movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 + # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0 + pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0 + pand %xmm1, %xmm2 + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10 ,%xmm2 + + pxor %xmm2, %xmm8 + GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 + # GHASH computation for the last <16 byte block + sub %r13, %r11 + add $16, %r11 + + # output %r13 bytes + MOVQ_R64_XMM %xmm0, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left_decrypt + mov %rax, (%arg2 , %r11, 1) + add $8, %r11 + psrldq $8, %xmm0 + MOVQ_R64_XMM %xmm0, %rax + sub $8, %r13 +_less_than_8_bytes_left_decrypt: + mov %al, (%arg2, %r11, 1) + add $1, %r11 + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left_decrypt +_multiple_of_16_bytes_decrypt: + mov arg8, %r12 # %r13 = aadLen (number of bytes) + shl $3, %r12 # convert into number of bits + movd %r12d, %xmm15 # len(A) in %xmm15 + shl $3, %arg4 # len(C) in bits (*128) + MOVQ_R64_XMM %arg4, %xmm1 + pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 + pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) + pxor %xmm15, %xmm8 + GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 + # final GHASH computation + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10, %xmm8 + + mov %arg5, %rax # %rax = *Y0 + movdqu (%rax), %xmm0 # %xmm0 = Y0 + ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) + pxor %xmm8, %xmm0 +_return_T_decrypt: + mov arg9, %r10 # %r10 = authTag + mov arg10, %r11 # %r11 = auth_tag_len + cmp $16, %r11 + je _T_16_decrypt + cmp $12, %r11 + je _T_12_decrypt +_T_8_decrypt: + MOVQ_R64_XMM %xmm0, %rax + mov %rax, (%r10) + jmp _return_T_done_decrypt +_T_12_decrypt: + MOVQ_R64_XMM %xmm0, %rax + mov %rax, (%r10) + psrldq $8, %xmm0 + movd %xmm0, %eax + mov %eax, 8(%r10) + jmp _return_T_done_decrypt +_T_16_decrypt: + movdqu %xmm0, (%r10) +_return_T_done_decrypt: + mov %r14, %rsp + pop %r14 + pop %r13 + pop %r12 + ret +ENDPROC(aesni_gcm_dec) + + +/***************************************************************************** +* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. +* u8 *out, // Ciphertext output. Encrypt in-place is allowed. +* const u8 *in, // Plaintext input +* u64 plaintext_len, // Length of data in bytes for encryption. +* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) +* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) +* // concatenated with 0x00000001. 16-byte aligned pointer. +* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. +* const u8 *aad, // Additional Authentication Data (AAD) +* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes +* u8 *auth_tag, // Authenticated Tag output. +* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), +* // 12 or 8. +* +* Assumptions: +* +* keys: +* keys are pre-expanded and aligned to 16 bytes. we are using the +* first set of 11 keys in the data structure void *aes_ctx +* +* +* iv: +* 0 1 2 3 +* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | Salt (From the SA) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | Initialization Vector | +* | (This is the sequence number from IPSec header) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 0x1 | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* +* +* +* AAD: +* AAD padded to 128 bits with 0 +* for example, assume AAD is a u32 vector +* +* if AAD is 8 bytes: +* AAD[3] = {A0, A1}; +* padded AAD in xmm register = {A1 A0 0 0} +* +* 0 1 2 3 +* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | SPI (A1) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 32-bit Sequence Number (A0) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 0x0 | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* +* AAD Format with 32-bit Sequence Number +* +* if AAD is 12 bytes: +* AAD[3] = {A0, A1, A2}; +* padded AAD in xmm register = {A2 A1 A0 0} +* +* 0 1 2 3 +* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | SPI (A2) | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 64-bit Extended Sequence Number {A1,A0} | +* | | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* | 0x0 | +* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +* +* AAD Format with 64-bit Extended Sequence Number +* +* aadLen: +* from the definition of the spec, aadLen can only be 8 or 12 bytes. +* The code supports 16 too but for other sizes, the code will fail. +* +* TLen: +* from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +* For other sizes, the code will fail. +* +* poly = x^128 + x^127 + x^126 + x^121 + 1 +***************************************************************************/ +ENTRY(aesni_gcm_enc) + push %r12 + push %r13 + push %r14 + mov %rsp, %r14 +# +# states of %xmm registers %xmm6:%xmm15 not saved +# all %xmm registers are clobbered +# + sub $VARIABLE_OFFSET, %rsp + and $~63, %rsp + mov %arg6, %r12 + movdqu (%r12), %xmm13 + movdqa SHUF_MASK(%rip), %xmm2 + PSHUFB_XMM %xmm2, %xmm13 + + +# precompute HashKey<<1 mod poly from the HashKey (required for GHASH) + + movdqa %xmm13, %xmm2 + psllq $1, %xmm13 + psrlq $63, %xmm2 + movdqa %xmm2, %xmm1 + pslldq $8, %xmm2 + psrldq $8, %xmm1 + por %xmm2, %xmm13 + + # reduce HashKey<<1 + + pshufd $0x24, %xmm1, %xmm2 + pcmpeqd TWOONE(%rip), %xmm2 + pand POLY(%rip), %xmm2 + pxor %xmm2, %xmm13 + movdqa %xmm13, HashKey(%rsp) + mov %arg4, %r13 # %xmm13 holds HashKey<<1 (mod poly) + and $-16, %r13 + mov %r13, %r12 + + # Encrypt first few blocks + + and $(3<<4), %r12 + jz _initial_num_blocks_is_0_encrypt + cmp $(2<<4), %r12 + jb _initial_num_blocks_is_1_encrypt + je _initial_num_blocks_is_2_encrypt +_initial_num_blocks_is_3_encrypt: + INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc + sub $48, %r13 + jmp _initial_blocks_encrypted +_initial_num_blocks_is_2_encrypt: + INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc + sub $32, %r13 + jmp _initial_blocks_encrypted +_initial_num_blocks_is_1_encrypt: + INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc + sub $16, %r13 + jmp _initial_blocks_encrypted +_initial_num_blocks_is_0_encrypt: + INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc +_initial_blocks_encrypted: + + # Main loop - Encrypt remaining blocks + + cmp $0, %r13 + je _zero_cipher_left_encrypt + sub $64, %r13 + je _four_cipher_left_encrypt +_encrypt_by_4_encrypt: + GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \ +%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc + add $64, %r11 + sub $64, %r13 + jne _encrypt_by_4_encrypt +_four_cipher_left_encrypt: + GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ +%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 +_zero_cipher_left_encrypt: + mov %arg4, %r13 + and $15, %r13 # %r13 = arg4 (mod 16) + je _multiple_of_16_bytes_encrypt + + # Handle the last <16 Byte block seperately + paddd ONE(%rip), %xmm0 # INCR CNT to get Yn + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10, %xmm0 + + + ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) + sub $16, %r11 + add %r13, %r11 + movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks + lea SHIFT_MASK+16(%rip), %r12 + sub %r13, %r12 + # adjust the shuffle mask pointer to be able to shift 16-r13 bytes + # (%r13 is the number of bytes in plaintext mod 16) + movdqu (%r12), %xmm2 # get the appropriate shuffle mask + PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte + pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn) + movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 + # get the appropriate mask to mask out top 16-r13 bytes of xmm0 + pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0 + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10,%xmm0 + + pxor %xmm0, %xmm8 + GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 + # GHASH computation for the last <16 byte block + sub %r13, %r11 + add $16, %r11 + + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10, %xmm0 + + # shuffle xmm0 back to output as ciphertext + + # Output %r13 bytes + MOVQ_R64_XMM %xmm0, %rax + cmp $8, %r13 + jle _less_than_8_bytes_left_encrypt + mov %rax, (%arg2 , %r11, 1) + add $8, %r11 + psrldq $8, %xmm0 + MOVQ_R64_XMM %xmm0, %rax + sub $8, %r13 +_less_than_8_bytes_left_encrypt: + mov %al, (%arg2, %r11, 1) + add $1, %r11 + shr $8, %rax + sub $1, %r13 + jne _less_than_8_bytes_left_encrypt +_multiple_of_16_bytes_encrypt: + mov arg8, %r12 # %r12 = addLen (number of bytes) + shl $3, %r12 + movd %r12d, %xmm15 # len(A) in %xmm15 + shl $3, %arg4 # len(C) in bits (*128) + MOVQ_R64_XMM %arg4, %xmm1 + pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 + pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) + pxor %xmm15, %xmm8 + GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 + # final GHASH computation + movdqa SHUF_MASK(%rip), %xmm10 + PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap + + mov %arg5, %rax # %rax = *Y0 + movdqu (%rax), %xmm0 # %xmm0 = Y0 + ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0) + pxor %xmm8, %xmm0 +_return_T_encrypt: + mov arg9, %r10 # %r10 = authTag + mov arg10, %r11 # %r11 = auth_tag_len + cmp $16, %r11 + je _T_16_encrypt + cmp $12, %r11 + je _T_12_encrypt +_T_8_encrypt: + MOVQ_R64_XMM %xmm0, %rax + mov %rax, (%r10) + jmp _return_T_done_encrypt +_T_12_encrypt: + MOVQ_R64_XMM %xmm0, %rax + mov %rax, (%r10) + psrldq $8, %xmm0 + movd %xmm0, %eax + mov %eax, 8(%r10) + jmp _return_T_done_encrypt +_T_16_encrypt: + movdqu %xmm0, (%r10) +_return_T_done_encrypt: + mov %r14, %rsp + pop %r14 + pop %r13 + pop %r12 + ret +ENDPROC(aesni_gcm_enc) + +#endif + + +.align 4 _key_expansion_128: _key_expansion_256a: pshufd $0b11111111, %xmm1, %xmm1 @@ -50,10 +1731,13 @@ _key_expansion_256a: shufps $0b10001100, %xmm0, %xmm4 pxor %xmm4, %xmm0 pxor %xmm1, %xmm0 - movaps %xmm0, (%rcx) - add $0x10, %rcx + movaps %xmm0, (TKEYP) + add $0x10, TKEYP ret +ENDPROC(_key_expansion_128) +ENDPROC(_key_expansion_256a) +.align 4 _key_expansion_192a: pshufd $0b01010101, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 @@ -71,12 +1755,14 @@ _key_expansion_192a: movaps %xmm0, %xmm1 shufps $0b01000100, %xmm0, %xmm6 - movaps %xmm6, (%rcx) + movaps %xmm6, (TKEYP) shufps $0b01001110, %xmm2, %xmm1 - movaps %xmm1, 16(%rcx) - add $0x20, %rcx + movaps %xmm1, 0x10(TKEYP) + add $0x20, TKEYP ret +ENDPROC(_key_expansion_192a) +.align 4 _key_expansion_192b: pshufd $0b01010101, %xmm1, %xmm1 shufps $0b00010000, %xmm0, %xmm4 @@ -91,10 +1777,12 @@ _key_expansion_192b: pxor %xmm3, %xmm2 pxor %xmm5, %xmm2 - movaps %xmm0, (%rcx) - add $0x10, %rcx + movaps %xmm0, (TKEYP) + add $0x10, TKEYP ret +ENDPROC(_key_expansion_192b) +.align 4 _key_expansion_256b: pshufd $0b10101010, %xmm1, %xmm1 shufps $0b00010000, %xmm2, %xmm4 @@ -102,154 +1790,145 @@ _key_expansion_256b: shufps $0b10001100, %xmm2, %xmm4 pxor %xmm4, %xmm2 pxor %xmm1, %xmm2 - movaps %xmm2, (%rcx) - add $0x10, %rcx + movaps %xmm2, (TKEYP) + add $0x10, TKEYP ret +ENDPROC(_key_expansion_256b) /* * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, * unsigned int key_len) */ ENTRY(aesni_set_key) - movups (%rsi), %xmm0 # user key (first 16 bytes) - movaps %xmm0, (%rdi) - lea 0x10(%rdi), %rcx # key addr - movl %edx, 480(%rdi) +#ifndef __x86_64__ + pushl KEYP + movl 8(%esp), KEYP # ctx + movl 12(%esp), UKEYP # in_key + movl 16(%esp), %edx # key_len +#endif + movups (UKEYP), %xmm0 # user key (first 16 bytes) + movaps %xmm0, (KEYP) + lea 0x10(KEYP), TKEYP # key addr + movl %edx, 480(KEYP) pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x cmp $24, %dl jb .Lenc_key128 je .Lenc_key192 - movups 0x10(%rsi), %xmm2 # other user key - movaps %xmm2, (%rcx) - add $0x10, %rcx - # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01 + movups 0x10(UKEYP), %xmm2 # other user key + movaps %xmm2, (TKEYP) + add $0x10, TKEYP + AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 call _key_expansion_256a - # aeskeygenassist $0x1, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01 + AESKEYGENASSIST 0x1 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02 + AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 call _key_expansion_256a - # aeskeygenassist $0x2, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02 + AESKEYGENASSIST 0x2 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04 + AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 call _key_expansion_256a - # aeskeygenassist $0x4, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04 + AESKEYGENASSIST 0x4 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08 + AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 call _key_expansion_256a - # aeskeygenassist $0x8, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08 + AESKEYGENASSIST 0x8 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10 + AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 call _key_expansion_256a - # aeskeygenassist $0x10, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10 + AESKEYGENASSIST 0x10 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20 + AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 call _key_expansion_256a - # aeskeygenassist $0x20, %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20 + AESKEYGENASSIST 0x20 %xmm0 %xmm1 call _key_expansion_256b - # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40 + AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 call _key_expansion_256a jmp .Ldec_key .Lenc_key192: - movq 0x10(%rsi), %xmm2 # other user key - # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01 + movq 0x10(UKEYP), %xmm2 # other user key + AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 call _key_expansion_192a - # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02 + AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 call _key_expansion_192b - # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04 + AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3 call _key_expansion_192a - # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08 + AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4 call _key_expansion_192b - # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10 + AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5 call _key_expansion_192a - # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20 + AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6 call _key_expansion_192b - # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40 + AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7 call _key_expansion_192a - # aeskeygenassist $0x80, %xmm2, %xmm1 # round 8 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x80 + AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8 call _key_expansion_192b jmp .Ldec_key .Lenc_key128: - # aeskeygenassist $0x1, %xmm0, %xmm1 # round 1 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01 + AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1 call _key_expansion_128 - # aeskeygenassist $0x2, %xmm0, %xmm1 # round 2 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02 + AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2 call _key_expansion_128 - # aeskeygenassist $0x4, %xmm0, %xmm1 # round 3 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04 + AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3 call _key_expansion_128 - # aeskeygenassist $0x8, %xmm0, %xmm1 # round 4 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08 + AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4 call _key_expansion_128 - # aeskeygenassist $0x10, %xmm0, %xmm1 # round 5 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10 + AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5 call _key_expansion_128 - # aeskeygenassist $0x20, %xmm0, %xmm1 # round 6 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20 + AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6 call _key_expansion_128 - # aeskeygenassist $0x40, %xmm0, %xmm1 # round 7 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x40 + AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7 call _key_expansion_128 - # aeskeygenassist $0x80, %xmm0, %xmm1 # round 8 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x80 + AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8 call _key_expansion_128 - # aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x1b + AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9 call _key_expansion_128 - # aeskeygenassist $0x36, %xmm0, %xmm1 # round 10 - .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x36 + AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 call _key_expansion_128 .Ldec_key: - sub $0x10, %rcx - movaps (%rdi), %xmm0 - movaps (%rcx), %xmm1 - movaps %xmm0, 240(%rcx) - movaps %xmm1, 240(%rdi) - add $0x10, %rdi - lea 240-16(%rcx), %rsi + sub $0x10, TKEYP + movaps (KEYP), %xmm0 + movaps (TKEYP), %xmm1 + movaps %xmm0, 240(TKEYP) + movaps %xmm1, 240(KEYP) + add $0x10, KEYP + lea 240-16(TKEYP), UKEYP .align 4 .Ldec_key_loop: - movaps (%rdi), %xmm0 - # aesimc %xmm0, %xmm1 - .byte 0x66, 0x0f, 0x38, 0xdb, 0xc8 - movaps %xmm1, (%rsi) - add $0x10, %rdi - sub $0x10, %rsi - cmp %rcx, %rdi + movaps (KEYP), %xmm0 + AESIMC %xmm0 %xmm1 + movaps %xmm1, (UKEYP) + add $0x10, KEYP + sub $0x10, UKEYP + cmp TKEYP, KEYP jb .Ldec_key_loop - xor %rax, %rax + xor AREG, AREG +#ifndef __x86_64__ + popl KEYP +#endif ret +ENDPROC(aesni_set_key) /* * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) */ ENTRY(aesni_enc) +#ifndef __x86_64__ + pushl KEYP + pushl KLEN + movl 12(%esp), KEYP + movl 16(%esp), OUTP + movl 20(%esp), INP +#endif movl 480(KEYP), KLEN # key length movups (INP), STATE # input call _aesni_enc1 movups STATE, (OUTP) # output +#ifndef __x86_64__ + popl KLEN + popl KEYP +#endif ret +ENDPROC(aesni_enc) /* * _aesni_enc1: internal ABI @@ -263,6 +1942,7 @@ ENTRY(aesni_enc) * KEY * TKEYP (T1) */ +.align 4 _aesni_enc1: movaps (KEYP), KEY # key mov KEYP, TKEYP @@ -274,52 +1954,39 @@ _aesni_enc1: je .Lenc192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps -0x50(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE .align 4 .Lenc192: movaps -0x40(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps -0x30(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE .align 4 .Lenc128: movaps -0x20(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps -0x10(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps (TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x10(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x20(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x30(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x40(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x50(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x60(TKEYP), KEY - # aesenc KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 + AESENC KEY STATE movaps 0x70(TKEYP), KEY - # aesenclast KEY, STATE # last round - .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2 + AESENCLAST KEY STATE ret +ENDPROC(_aesni_enc1) /* * _aesni_enc4: internal ABI @@ -339,6 +2006,7 @@ _aesni_enc1: * KEY * TKEYP (T1) */ +.align 4 _aesni_enc4: movaps (KEYP), KEY # key mov KEYP, TKEYP @@ -353,147 +2021,104 @@ _aesni_enc4: je .L4enc192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps -0x50(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 #.align 4 .L4enc192: movaps -0x40(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps -0x30(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 #.align 4 .L4enc128: movaps -0x20(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps -0x10(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps (TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x10(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x20(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x30(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x40(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x50(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x60(TKEYP), KEY - # aesenc KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 - # aesenc KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 - # aesenc KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xea - # aesenc KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2 + AESENC KEY STATE1 + AESENC KEY STATE2 + AESENC KEY STATE3 + AESENC KEY STATE4 movaps 0x70(TKEYP), KEY - # aesenclast KEY, STATE1 # last round - .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2 - # aesenclast KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdd, 0xe2 - # aesenclast KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdd, 0xea - # aesenclast KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdd, 0xf2 + AESENCLAST KEY STATE1 # last round + AESENCLAST KEY STATE2 + AESENCLAST KEY STATE3 + AESENCLAST KEY STATE4 ret +ENDPROC(_aesni_enc4) /* * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) */ ENTRY(aesni_dec) +#ifndef __x86_64__ + pushl KEYP + pushl KLEN + movl 12(%esp), KEYP + movl 16(%esp), OUTP + movl 20(%esp), INP +#endif mov 480(KEYP), KLEN # key length add $240, KEYP movups (INP), STATE # input call _aesni_dec1 movups STATE, (OUTP) #output +#ifndef __x86_64__ + popl KLEN + popl KEYP +#endif ret +ENDPROC(aesni_dec) /* * _aesni_dec1: internal ABI @@ -507,6 +2132,7 @@ ENTRY(aesni_dec) * KEY * TKEYP (T1) */ +.align 4 _aesni_dec1: movaps (KEYP), KEY # key mov KEYP, TKEYP @@ -518,52 +2144,39 @@ _aesni_dec1: je .Ldec192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps -0x50(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE .align 4 .Ldec192: movaps -0x40(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps -0x30(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE .align 4 .Ldec128: movaps -0x20(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps -0x10(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps (TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x10(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x20(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x30(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x40(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x50(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x60(TKEYP), KEY - # aesdec KEY, STATE - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 + AESDEC KEY STATE movaps 0x70(TKEYP), KEY - # aesdeclast KEY, STATE # last round - .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2 + AESDECLAST KEY STATE ret +ENDPROC(_aesni_dec1) /* * _aesni_dec4: internal ABI @@ -583,6 +2196,7 @@ _aesni_dec1: * KEY * TKEYP (T1) */ +.align 4 _aesni_dec4: movaps (KEYP), KEY # key mov KEYP, TKEYP @@ -597,142 +2211,96 @@ _aesni_dec4: je .L4dec192 add $0x20, TKEYP movaps -0x60(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps -0x50(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 .align 4 .L4dec192: movaps -0x40(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps -0x30(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 .align 4 .L4dec128: movaps -0x20(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps -0x10(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps (TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x10(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x20(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x30(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x40(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x50(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x60(TKEYP), KEY - # aesdec KEY, STATE1 - .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 - # aesdec KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 - # aesdec KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xde, 0xea - # aesdec KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xde, 0xf2 + AESDEC KEY STATE1 + AESDEC KEY STATE2 + AESDEC KEY STATE3 + AESDEC KEY STATE4 movaps 0x70(TKEYP), KEY - # aesdeclast KEY, STATE1 # last round - .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2 - # aesdeclast KEY, STATE2 - .byte 0x66, 0x0f, 0x38, 0xdf, 0xe2 - # aesdeclast KEY, STATE3 - .byte 0x66, 0x0f, 0x38, 0xdf, 0xea - # aesdeclast KEY, STATE4 - .byte 0x66, 0x0f, 0x38, 0xdf, 0xf2 + AESDECLAST KEY STATE1 # last round + AESDECLAST KEY STATE2 + AESDECLAST KEY STATE3 + AESDECLAST KEY STATE4 ret +ENDPROC(_aesni_dec4) /* * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len) */ ENTRY(aesni_ecb_enc) +#ifndef __x86_64__ + pushl LEN + pushl KEYP + pushl KLEN + movl 16(%esp), KEYP + movl 20(%esp), OUTP + movl 24(%esp), INP + movl 28(%esp), LEN +#endif test LEN, LEN # check length jz .Lecb_enc_ret mov 480(KEYP), KLEN @@ -769,13 +2337,28 @@ ENTRY(aesni_ecb_enc) cmp $16, LEN jge .Lecb_enc_loop1 .Lecb_enc_ret: +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN +#endif ret +ENDPROC(aesni_ecb_enc) /* * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len); */ ENTRY(aesni_ecb_dec) +#ifndef __x86_64__ + pushl LEN + pushl KEYP + pushl KLEN + movl 16(%esp), KEYP + movl 20(%esp), OUTP + movl 24(%esp), INP + movl 28(%esp), LEN +#endif test LEN, LEN jz .Lecb_dec_ret mov 480(KEYP), KLEN @@ -813,13 +2396,30 @@ ENTRY(aesni_ecb_dec) cmp $16, LEN jge .Lecb_dec_loop1 .Lecb_dec_ret: +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN +#endif ret +ENDPROC(aesni_ecb_dec) /* * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len, u8 *iv) */ ENTRY(aesni_cbc_enc) +#ifndef __x86_64__ + pushl IVP + pushl LEN + pushl KEYP + pushl KLEN + movl 20(%esp), KEYP + movl 24(%esp), OUTP + movl 28(%esp), INP + movl 32(%esp), LEN + movl 36(%esp), IVP +#endif cmp $16, LEN jb .Lcbc_enc_ret mov 480(KEYP), KLEN @@ -837,13 +2437,31 @@ ENTRY(aesni_cbc_enc) jge .Lcbc_enc_loop movups STATE, (IVP) .Lcbc_enc_ret: +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN + popl IVP +#endif ret +ENDPROC(aesni_cbc_enc) /* * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, * size_t len, u8 *iv) */ ENTRY(aesni_cbc_dec) +#ifndef __x86_64__ + pushl IVP + pushl LEN + pushl KEYP + pushl KLEN + movl 20(%esp), KEYP + movl 24(%esp), OUTP + movl 28(%esp), INP + movl 32(%esp), LEN + movl 36(%esp), IVP +#endif cmp $16, LEN jb .Lcbc_dec_just_ret mov 480(KEYP), KLEN @@ -857,16 +2475,32 @@ ENTRY(aesni_cbc_dec) movaps IN1, STATE1 movups 0x10(INP), IN2 movaps IN2, STATE2 +#ifdef __x86_64__ movups 0x20(INP), IN3 movaps IN3, STATE3 movups 0x30(INP), IN4 movaps IN4, STATE4 +#else + movups 0x20(INP), IN1 + movaps IN1, STATE3 + movups 0x30(INP), IN2 + movaps IN2, STATE4 +#endif call _aesni_dec4 pxor IV, STATE1 +#ifdef __x86_64__ pxor IN1, STATE2 pxor IN2, STATE3 pxor IN3, STATE4 movaps IN4, IV +#else + pxor IN1, STATE4 + movaps IN2, IV + movups (INP), IN1 + pxor IN1, STATE2 + movups 0x10(INP), IN2 + pxor IN2, STATE3 +#endif movups STATE1, (OUTP) movups STATE2, 0x10(OUTP) movups STATE3, 0x20(OUTP) @@ -894,4 +2528,240 @@ ENTRY(aesni_cbc_dec) .Lcbc_dec_ret: movups IV, (IVP) .Lcbc_dec_just_ret: +#ifndef __x86_64__ + popl KLEN + popl KEYP + popl LEN + popl IVP +#endif + ret +ENDPROC(aesni_cbc_dec) + +#ifdef __x86_64__ +.align 16 +.Lbswap_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +/* + * _aesni_inc_init: internal ABI + * setup registers used by _aesni_inc + * input: + * IV + * output: + * CTR: == IV, in little endian + * TCTR_LOW: == lower qword of CTR + * INC: == 1, in little endian + * BSWAP_MASK == endian swapping mask + */ +.align 4 +_aesni_inc_init: + movaps .Lbswap_mask, BSWAP_MASK + movaps IV, CTR + PSHUFB_XMM BSWAP_MASK CTR + mov $1, TCTR_LOW + MOVQ_R64_XMM TCTR_LOW INC + MOVQ_R64_XMM CTR TCTR_LOW + ret +ENDPROC(_aesni_inc_init) + +/* + * _aesni_inc: internal ABI + * Increase IV by 1, IV is in big endian + * input: + * IV + * CTR: == IV, in little endian + * TCTR_LOW: == lower qword of CTR + * INC: == 1, in little endian + * BSWAP_MASK == endian swapping mask + * output: + * IV: Increase by 1 + * changed: + * CTR: == output IV, in little endian + * TCTR_LOW: == lower qword of CTR + */ +.align 4 +_aesni_inc: + paddq INC, CTR + add $1, TCTR_LOW + jnc .Linc_low + pslldq $8, INC + paddq INC, CTR + psrldq $8, INC +.Linc_low: + movaps CTR, IV + PSHUFB_XMM BSWAP_MASK IV ret +ENDPROC(_aesni_inc) + +/* + * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, + * size_t len, u8 *iv) + */ +ENTRY(aesni_ctr_enc) + cmp $16, LEN + jb .Lctr_enc_just_ret + mov 480(KEYP), KLEN + movups (IVP), IV + call _aesni_inc_init + cmp $64, LEN + jb .Lctr_enc_loop1 +.align 4 +.Lctr_enc_loop4: + movaps IV, STATE1 + call _aesni_inc + movups (INP), IN1 + movaps IV, STATE2 + call _aesni_inc + movups 0x10(INP), IN2 + movaps IV, STATE3 + call _aesni_inc + movups 0x20(INP), IN3 + movaps IV, STATE4 + call _aesni_inc + movups 0x30(INP), IN4 + call _aesni_enc4 + pxor IN1, STATE1 + movups STATE1, (OUTP) + pxor IN2, STATE2 + movups STATE2, 0x10(OUTP) + pxor IN3, STATE3 + movups STATE3, 0x20(OUTP) + pxor IN4, STATE4 + movups STATE4, 0x30(OUTP) + sub $64, LEN + add $64, INP + add $64, OUTP + cmp $64, LEN + jge .Lctr_enc_loop4 + cmp $16, LEN + jb .Lctr_enc_ret +.align 4 +.Lctr_enc_loop1: + movaps IV, STATE + call _aesni_inc + movups (INP), IN + call _aesni_enc1 + pxor IN, STATE + movups STATE, (OUTP) + sub $16, LEN + add $16, INP + add $16, OUTP + cmp $16, LEN + jge .Lctr_enc_loop1 +.Lctr_enc_ret: + movups IV, (IVP) +.Lctr_enc_just_ret: + ret +ENDPROC(aesni_ctr_enc) + +/* + * _aesni_gf128mul_x_ble: internal ABI + * Multiply in GF(2^128) for XTS IVs + * input: + * IV: current IV + * GF128MUL_MASK == mask with 0x87 and 0x01 + * output: + * IV: next IV + * changed: + * CTR: == temporary value + */ +#define _aesni_gf128mul_x_ble() \ + pshufd $0x13, IV, CTR; \ + paddq IV, IV; \ + psrad $31, CTR; \ + pand GF128MUL_MASK, CTR; \ + pxor CTR, IV; + +/* + * void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, + * bool enc, u8 *iv) + */ +ENTRY(aesni_xts_crypt8) + cmpb $0, %cl + movl $0, %ecx + movl $240, %r10d + leaq _aesni_enc4, %r11 + leaq _aesni_dec4, %rax + cmovel %r10d, %ecx + cmoveq %rax, %r11 + + movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK + movups (IVP), IV + + mov 480(KEYP), KLEN + addq %rcx, KEYP + + movdqa IV, STATE1 + pxor 0x00(INP), STATE1 + movdqu IV, 0x00(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE2 + pxor 0x10(INP), STATE2 + movdqu IV, 0x10(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE3 + pxor 0x20(INP), STATE3 + movdqu IV, 0x20(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE4 + pxor 0x30(INP), STATE4 + movdqu IV, 0x30(OUTP) + + call *%r11 + + pxor 0x00(OUTP), STATE1 + movdqu STATE1, 0x00(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE1 + pxor 0x40(INP), STATE1 + movdqu IV, 0x40(OUTP) + + pxor 0x10(OUTP), STATE2 + movdqu STATE2, 0x10(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE2 + pxor 0x50(INP), STATE2 + movdqu IV, 0x50(OUTP) + + pxor 0x20(OUTP), STATE3 + movdqu STATE3, 0x20(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE3 + pxor 0x60(INP), STATE3 + movdqu IV, 0x60(OUTP) + + pxor 0x30(OUTP), STATE4 + movdqu STATE4, 0x30(OUTP) + + _aesni_gf128mul_x_ble() + movdqa IV, STATE4 + pxor 0x70(INP), STATE4 + movdqu IV, 0x70(OUTP) + + _aesni_gf128mul_x_ble() + movups IV, (IVP) + + call *%r11 + + pxor 0x40(OUTP), STATE1 + movdqu STATE1, 0x40(OUTP) + + pxor 0x50(OUTP), STATE2 + movdqu STATE2, 0x50(OUTP) + + pxor 0x60(OUTP), STATE3 + movdqu STATE3, 0x60(OUTP) + + pxor 0x70(OUTP), STATE4 + movdqu STATE4, 0x70(OUTP) + + ret +ENDPROC(aesni_xts_crypt8) + +#endif diff -uprN linux-2.6.32/arch/x86/crypto/aesni-intel_glue.c linux-2.6.32.ovz/arch/x86/crypto/aesni-intel_glue.c --- linux-2.6.32/arch/x86/crypto/aesni-intel_glue.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/aesni-intel_glue.c 2015-06-23 04:16:16.000000000 -0700 @@ -5,6 +5,14 @@ * Copyright (C) 2008, Intel Corp. * Author: Huang Ying * + * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD + * interface for 64-bit kernels. + * Authors: Adrian Hoban + * Gabriele Paoloni + * Tadeusz Struk (tadeusz.struk@intel.com) + * Aidan O'Mahony (aidan.o.mahony@intel.com) + * Copyright (c) 2010, Intel Corporation. + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -18,31 +26,61 @@ #include #include #include +#include +#include +#include +#include #include #include - -#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE) -#define HAS_CTR -#endif - -#if defined(CONFIG_CRYPTO_LRW) || defined(CONFIG_CRYPTO_LRW_MODULE) -#define HAS_LRW +#include +#include +#include +#include +#include +#ifdef CONFIG_X86_64 +#include #endif #if defined(CONFIG_CRYPTO_PCBC) || defined(CONFIG_CRYPTO_PCBC_MODULE) #define HAS_PCBC #endif -#if defined(CONFIG_CRYPTO_XTS) || defined(CONFIG_CRYPTO_XTS_MODULE) -#define HAS_XTS -#endif -struct async_aes_ctx { - struct cryptd_ablkcipher *cryptd_tfm; +/* This data is stored at the end of the crypto_tfm struct. + * It's a type of per "session" data storage location. + * This needs to be 16 byte aligned. + */ +struct aesni_rfc4106_gcm_ctx { + u8 hash_subkey[16]; + struct crypto_aes_ctx aes_key_expanded; + u8 nonce[4]; + struct cryptd_aead *cryptd_tfm; +}; + +struct aesni_gcm_set_hash_subkey_result { + int err; + struct completion completion; +}; + +struct aesni_hash_subkey_req_data { + u8 iv[16]; + struct aesni_gcm_set_hash_subkey_result result; + struct scatterlist sg; }; -#define AESNI_ALIGN 16 +#define AESNI_ALIGN (16) #define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) +#define RFC4106_HASH_SUBKEY_SIZE 16 + +struct aesni_lrw_ctx { + struct lrw_table_ctx lrw_table; + u8 raw_aes_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; +}; + +struct aesni_xts_ctx { + u8 raw_tweak_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; + u8 raw_crypt_ctx[sizeof(struct crypto_aes_ctx) + AESNI_ALIGN - 1]; +}; asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, unsigned int key_len); @@ -59,6 +97,68 @@ asmlinkage void aesni_cbc_enc(struct cry asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); +int crypto_fpu_init(void); +void crypto_fpu_exit(void); + +#ifdef CONFIG_X86_64 +asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); + +asmlinkage void aesni_xts_crypt8(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, bool enc, u8 *iv); + +/* asmlinkage void aesni_gcm_enc() + * void *ctx, AES Key schedule. Starts on a 16 byte boundary. + * u8 *out, Ciphertext output. Encrypt in-place is allowed. + * const u8 *in, Plaintext input + * unsigned long plaintext_len, Length of data in bytes for encryption. + * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association) + * concatenated with 8 byte Initialisation Vector (from IPSec ESP + * Payload) concatenated with 0x00000001. 16-byte aligned pointer. + * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. + * const u8 *aad, Additional Authentication Data (AAD) + * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this + * is going to be 8 or 12 bytes + * u8 *auth_tag, Authenticated Tag output. + * unsigned long auth_tag_len), Authenticated Tag Length in bytes. + * Valid values are 16 (most likely), 12 or 8. + */ +asmlinkage void aesni_gcm_enc(void *ctx, u8 *out, + const u8 *in, unsigned long plaintext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +/* asmlinkage void aesni_gcm_dec() + * void *ctx, AES Key schedule. Starts on a 16 byte boundary. + * u8 *out, Plaintext output. Decrypt in-place is allowed. + * const u8 *in, Ciphertext input + * unsigned long ciphertext_len, Length of data in bytes for decryption. + * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association) + * concatenated with 8 byte Initialisation Vector (from IPSec ESP + * Payload) concatenated with 0x00000001. 16-byte aligned pointer. + * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. + * const u8 *aad, Additional Authentication Data (AAD) + * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going + * to be 8 or 12 bytes + * u8 *auth_tag, Authenticated Tag output. + * unsigned long auth_tag_len) Authenticated Tag Length in bytes. + * Valid values are 16 (most likely), 12 or 8. + */ +asmlinkage void aesni_gcm_dec(void *ctx, u8 *out, + const u8 *in, unsigned long ciphertext_len, u8 *iv, + u8 *hash_subkey, const u8 *aad, unsigned long aad_len, + u8 *auth_tag, unsigned long auth_tag_len); + +static inline struct +aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) +{ + return + (struct aesni_rfc4106_gcm_ctx *) + PTR_ALIGN((u8 *) + crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN); +} +#endif + static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) { unsigned long addr = (unsigned long)raw_ctx; @@ -125,27 +225,6 @@ static void aes_decrypt(struct crypto_tf } } -static struct crypto_alg aesni_alg = { - .cra_name = "aes", - .cra_driver_name = "aes-aesni", - .cra_priority = 300, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1, - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(aesni_alg.cra_list), - .cra_u = { - .cipher = { - .cia_min_keysize = AES_MIN_KEY_SIZE, - .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = aes_set_key, - .cia_encrypt = aes_encrypt, - .cia_decrypt = aes_decrypt - } - } -}; - static void __aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); @@ -160,27 +239,6 @@ static void __aes_decrypt(struct crypto_ aesni_dec(ctx, dst, src); } -static struct crypto_alg __aesni_alg = { - .cra_name = "__aes-aesni", - .cra_driver_name = "__driver-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_CIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1, - .cra_alignmask = 0, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(__aesni_alg.cra_list), - .cra_u = { - .cipher = { - .cia_min_keysize = AES_MIN_KEY_SIZE, - .cia_max_keysize = AES_MAX_KEY_SIZE, - .cia_setkey = aes_set_key, - .cia_encrypt = __aes_encrypt, - .cia_decrypt = __aes_decrypt - } - } -}; - static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) @@ -229,28 +287,6 @@ static int ecb_decrypt(struct blkcipher_ return err; } -static struct crypto_alg blk_ecb_alg = { - .cra_name = "__ecb-aes-aesni", - .cra_driver_name = "__driver-ecb-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1, - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = aes_set_key, - .encrypt = ecb_encrypt, - .decrypt = ecb_decrypt, - }, - }, -}; - static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, struct scatterlist *src, unsigned int nbytes) @@ -299,277 +335,927 @@ static int cbc_decrypt(struct blkcipher_ return err; } -static struct crypto_alg blk_cbc_alg = { - .cra_name = "__cbc-aes-aesni", - .cra_driver_name = "__driver-cbc-aes-aesni", - .cra_priority = 0, - .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct crypto_aes_ctx)+AESNI_ALIGN-1, - .cra_alignmask = 0, - .cra_type = &crypto_blkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list), - .cra_u = { - .blkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = aes_set_key, - .encrypt = cbc_encrypt, - .decrypt = cbc_decrypt, - }, - }, -}; - -static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, - unsigned int key_len) +#ifdef CONFIG_X86_64 +static void ctr_crypt_final(struct crypto_aes_ctx *ctx, + struct blkcipher_walk *walk) +{ + u8 *ctrblk = walk->iv; + u8 keystream[AES_BLOCK_SIZE]; + u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + + aesni_enc(ctx, keystream, ctrblk); + crypto_xor(keystream, src, nbytes); + memcpy(dst, keystream, nbytes); + crypto_inc(ctrblk, AES_BLOCK_SIZE); +} + +static int ctr_crypt(struct blkcipher_desc *desc, + struct scatterlist *dst, struct scatterlist *src, + unsigned int nbytes) { - struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); - struct crypto_ablkcipher *child = &ctx->cryptd_tfm->base; + struct crypto_aes_ctx *ctx = aes_ctx(crypto_blkcipher_ctx(desc->tfm)); + struct blkcipher_walk walk; int err; - crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); - crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(tfm) - & CRYPTO_TFM_REQ_MASK); - err = crypto_ablkcipher_setkey(child, key, key_len); - crypto_ablkcipher_set_flags(tfm, crypto_ablkcipher_get_flags(child) - & CRYPTO_TFM_RES_MASK); + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, AES_BLOCK_SIZE); + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + kernel_fpu_begin(); + while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { + aesni_ctr_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, + nbytes & AES_BLOCK_MASK, walk.iv); + nbytes &= AES_BLOCK_SIZE - 1; + err = blkcipher_walk_done(desc, &walk, nbytes); + } + if (walk.nbytes) { + ctr_crypt_final(ctx, &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + kernel_fpu_end(); + return err; } +#endif -static int ablk_encrypt(struct ablkcipher_request *req) +static int ablk_ecb_init(struct crypto_tfm *tfm) { - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); + return ablk_init_common(tfm, "__driver-ecb-aes-aesni"); +} - if (!irq_fpu_usable()) { - struct ablkcipher_request *cryptd_req = - ablkcipher_request_ctx(req); - memcpy(cryptd_req, req, sizeof(*req)); - ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - return crypto_ablkcipher_encrypt(cryptd_req); - } else { - struct blkcipher_desc desc; - desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); - desc.info = req->info; - desc.flags = 0; - return crypto_blkcipher_crt(desc.tfm)->encrypt( - &desc, req->dst, req->src, req->nbytes); - } +static int ablk_cbc_init(struct crypto_tfm *tfm) +{ + return ablk_init_common(tfm, "__driver-cbc-aes-aesni"); } -static int ablk_decrypt(struct ablkcipher_request *req) +#ifdef CONFIG_X86_64 +static int ablk_ctr_init(struct crypto_tfm *tfm) { - struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); - struct async_aes_ctx *ctx = crypto_ablkcipher_ctx(tfm); + return ablk_init_common(tfm, "__driver-ctr-aes-aesni"); +} - if (!irq_fpu_usable()) { - struct ablkcipher_request *cryptd_req = - ablkcipher_request_ctx(req); - memcpy(cryptd_req, req, sizeof(*req)); - ablkcipher_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); - return crypto_ablkcipher_decrypt(cryptd_req); - } else { - struct blkcipher_desc desc; - desc.tfm = cryptd_ablkcipher_child(ctx->cryptd_tfm); - desc.info = req->info; - desc.flags = 0; - return crypto_blkcipher_crt(desc.tfm)->decrypt( - &desc, req->dst, req->src, req->nbytes); - } +#endif + +#ifdef HAS_PCBC +static int ablk_pcbc_init(struct crypto_tfm *tfm) +{ + return ablk_init_common(tfm, "fpu(pcbc(__driver-aes-aesni))"); } +#endif -static void ablk_exit(struct crypto_tfm *tfm) +static void lrw_xts_encrypt_callback(void *ctx, u8 *blks, unsigned int nbytes) { - struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm); + aesni_ecb_enc(ctx, blks, blks, nbytes); +} - cryptd_free_ablkcipher(ctx->cryptd_tfm); +static void lrw_xts_decrypt_callback(void *ctx, u8 *blks, unsigned int nbytes) +{ + aesni_ecb_dec(ctx, blks, blks, nbytes); } -static void ablk_init_common(struct crypto_tfm *tfm, - struct cryptd_ablkcipher *cryptd_tfm) +static int lrw_aesni_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) { - struct async_aes_ctx *ctx = crypto_tfm_ctx(tfm); + struct aesni_lrw_ctx *ctx = crypto_tfm_ctx(tfm); + int err; - ctx->cryptd_tfm = cryptd_tfm; - tfm->crt_ablkcipher.reqsize = sizeof(struct ablkcipher_request) + - crypto_ablkcipher_reqsize(&cryptd_tfm->base); + err = aes_set_key_common(tfm, ctx->raw_aes_ctx, key, + keylen - AES_BLOCK_SIZE); + if (err) + return err; + + return lrw_init_table(&ctx->lrw_table, key + keylen - AES_BLOCK_SIZE); } -static int ablk_ecb_init(struct crypto_tfm *tfm) +static void lrw_aesni_exit_tfm(struct crypto_tfm *tfm) { - struct cryptd_ablkcipher *cryptd_tfm; + struct aesni_lrw_ctx *ctx = crypto_tfm_ctx(tfm); - cryptd_tfm = cryptd_alloc_ablkcipher("__driver-ecb-aes-aesni", 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; + lrw_free_table(&ctx->lrw_table); } -static struct crypto_alg ablk_ecb_alg = { - .cra_name = "ecb(aes)", - .cra_driver_name = "ecb-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_ecb_alg.cra_list), - .cra_init = ablk_ecb_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}; +static int lrw_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesni_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[8]; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), -static int ablk_cbc_init(struct crypto_tfm *tfm) + .table_ctx = &ctx->lrw_table, + .crypt_ctx = aes_ctx(ctx->raw_aes_ctx), + .crypt_fn = lrw_xts_encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + kernel_fpu_begin(); + ret = lrw_crypt(desc, dst, src, nbytes, &req); + kernel_fpu_end(); + + return ret; +} + +static int lrw_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - struct cryptd_ablkcipher *cryptd_tfm; + struct aesni_lrw_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[8]; + struct lrw_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .table_ctx = &ctx->lrw_table, + .crypt_ctx = aes_ctx(ctx->raw_aes_ctx), + .crypt_fn = lrw_xts_decrypt_callback, + }; + int ret; - cryptd_tfm = cryptd_alloc_ablkcipher("__driver-cbc-aes-aesni", 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + kernel_fpu_begin(); + ret = lrw_crypt(desc, dst, src, nbytes, &req); + kernel_fpu_end(); + + return ret; } -static struct crypto_alg ablk_cbc_alg = { - .cra_name = "cbc(aes)", - .cra_driver_name = "cbc-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_cbc_alg.cra_list), - .cra_init = ablk_cbc_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - }, - }, -}; +static int xts_aesni_setkey(struct crypto_tfm *tfm, const u8 *key, + unsigned int keylen) +{ + struct aesni_xts_ctx *ctx = crypto_tfm_ctx(tfm); + u32 *flags = &tfm->crt_flags; + int err; -#ifdef HAS_CTR -static int ablk_ctr_init(struct crypto_tfm *tfm) + /* key consists of keys of equal size concatenated, therefore + * the length must be even + */ + if (keylen % 2) { + *flags |= CRYPTO_TFM_RES_BAD_KEY_LEN; + return -EINVAL; + } + + /* first half of xts-key is for crypt */ + err = aes_set_key_common(tfm, ctx->raw_crypt_ctx, key, keylen / 2); + if (err) + return err; + + /* second half of xts-key is for tweak */ + return aes_set_key_common(tfm, ctx->raw_tweak_ctx, key + keylen / 2, + keylen / 2); +} + + +static void aesni_xts_tweak(void *ctx, u8 *out, const u8 *in) { - struct cryptd_ablkcipher *cryptd_tfm; + aesni_enc(ctx, out, in); +} - cryptd_tfm = cryptd_alloc_ablkcipher("fpu(ctr(__driver-aes-aesni))", - 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; +#ifdef CONFIG_X86_64 + +static void aesni_xts_enc(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_enc)); } -static struct crypto_alg ablk_ctr_alg = { - .cra_name = "ctr(aes)", - .cra_driver_name = "ctr-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct async_aes_ctx), - .cra_alignmask = 0, - .cra_type = &crypto_ablkcipher_type, - .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_ctr_alg.cra_list), - .cra_init = ablk_ctr_init, - .cra_exit = ablk_exit, - .cra_u = { - .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE, - .max_keysize = AES_MAX_KEY_SIZE, - .ivsize = AES_BLOCK_SIZE, - .setkey = ablk_set_key, - .encrypt = ablk_encrypt, - .decrypt = ablk_decrypt, - .geniv = "chainiv", - }, - }, +static void aesni_xts_dec(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + glue_xts_crypt_128bit_one(ctx, dst, src, iv, GLUE_FUNC_CAST(aesni_dec)); +} + +static void aesni_xts_enc8(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, true, (u8 *)iv); +} + +static void aesni_xts_dec8(void *ctx, u128 *dst, const u128 *src, le128 *iv) +{ + aesni_xts_crypt8(ctx, (u8 *)dst, (const u8 *)src, false, (u8 *)iv); +} + +static const struct common_glue_ctx aesni_enc_xts = { + .num_funcs = 2, + .fpu_blocks_limit = 1, + + .funcs = { { + .num_blocks = 8, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc8) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_enc) } + } } +}; + +static const struct common_glue_ctx aesni_dec_xts = { + .num_funcs = 2, + .fpu_blocks_limit = 1, + + .funcs = { { + .num_blocks = 8, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec8) } + }, { + .num_blocks = 1, + .fn_u = { .xts = GLUE_XTS_FUNC_CAST(aesni_xts_dec) } + } } }; -#endif -#ifdef HAS_LRW -static int ablk_lrw_init(struct crypto_tfm *tfm) +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) { - struct cryptd_ablkcipher *cryptd_tfm; + struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); - cryptd_tfm = cryptd_alloc_ablkcipher("fpu(lrw(__driver-aes-aesni))", - 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; + return glue_xts_crypt_128bit(&aesni_enc_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(aesni_xts_tweak), + aes_ctx(ctx->raw_tweak_ctx), + aes_ctx(ctx->raw_crypt_ctx)); } -static struct crypto_alg ablk_lrw_alg = { - .cra_name = "lrw(aes)", - .cra_driver_name = "lrw-aes-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, - .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + + return glue_xts_crypt_128bit(&aesni_dec_xts, desc, dst, src, nbytes, + XTS_TWEAK_CAST(aesni_xts_tweak), + aes_ctx(ctx->raw_tweak_ctx), + aes_ctx(ctx->raw_crypt_ctx)); +} + +#else + +static int xts_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[8]; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx), + .tweak_fn = aesni_xts_tweak, + .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx), + .crypt_fn = lrw_xts_encrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + kernel_fpu_begin(); + ret = xts_crypt(desc, dst, src, nbytes, &req); + kernel_fpu_end(); + + return ret; +} + +static int xts_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct aesni_xts_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + be128 buf[8]; + struct xts_crypt_req req = { + .tbuf = buf, + .tbuflen = sizeof(buf), + + .tweak_ctx = aes_ctx(ctx->raw_tweak_ctx), + .tweak_fn = aesni_xts_tweak, + .crypt_ctx = aes_ctx(ctx->raw_crypt_ctx), + .crypt_fn = lrw_xts_decrypt_callback, + }; + int ret; + + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + + kernel_fpu_begin(); + ret = xts_crypt(desc, dst, src, nbytes, &req); + kernel_fpu_end(); + + return ret; +} + +#endif + +#ifdef CONFIG_X86_64 +static int rfc4106_init(struct crypto_tfm *tfm) +{ + struct cryptd_aead *cryptd_tfm; + struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *) + PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); + struct crypto_aead *cryptd_child; + struct aesni_rfc4106_gcm_ctx *child_ctx; + cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + + cryptd_child = cryptd_aead_child(cryptd_tfm); + child_ctx = aesni_rfc4106_gcm_ctx_get(cryptd_child); + memcpy(child_ctx, ctx, sizeof(*ctx)); + ctx->cryptd_tfm = cryptd_tfm; + tfm->crt_aead.reqsize = sizeof(struct aead_request) + + crypto_aead_reqsize(&cryptd_tfm->base); + return 0; +} + +static void rfc4106_exit(struct crypto_tfm *tfm) +{ + struct aesni_rfc4106_gcm_ctx *ctx = + (struct aesni_rfc4106_gcm_ctx *) + PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN); + if (!IS_ERR(ctx->cryptd_tfm)) + cryptd_free_aead(ctx->cryptd_tfm); + return; +} + +static void +rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err) +{ + struct aesni_gcm_set_hash_subkey_result *result = req->data; + + if (err == -EINPROGRESS) + return; + result->err = err; + complete(&result->completion); +} + +static int +rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) +{ + struct crypto_ablkcipher *ctr_tfm; + struct ablkcipher_request *req; + int ret = -EINVAL; + struct aesni_hash_subkey_req_data *req_data; + + ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0); + if (IS_ERR(ctr_tfm)) + return PTR_ERR(ctr_tfm); + + crypto_ablkcipher_clear_flags(ctr_tfm, ~0); + + ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len); + if (ret) + goto out_free_ablkcipher; + + ret = -ENOMEM; + req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL); + if (!req) + goto out_free_ablkcipher; + + req_data = kmalloc(sizeof(*req_data), GFP_KERNEL); + if (!req_data) + goto out_free_request; + + memset(req_data->iv, 0, sizeof(req_data->iv)); + + /* Clear the data in the hash sub key container to zero.*/ + /* We want to cipher all zeros to create the hash sub key. */ + memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE); + + init_completion(&req_data->result.completion); + sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE); + ablkcipher_request_set_tfm(req, ctr_tfm); + ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP | + CRYPTO_TFM_REQ_MAY_BACKLOG, + rfc4106_set_hash_subkey_done, + &req_data->result); + + ablkcipher_request_set_crypt(req, &req_data->sg, + &req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv); + + ret = crypto_ablkcipher_encrypt(req); + if (ret == -EINPROGRESS || ret == -EBUSY) { + ret = wait_for_completion_interruptible + (&req_data->result.completion); + if (!ret) + ret = req_data->result.err; + } + kfree(req_data); +out_free_request: + ablkcipher_request_free(req); +out_free_ablkcipher: + crypto_free_ablkcipher(ctr_tfm); + return ret; +} + +static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key, + unsigned int key_len) +{ + int ret = 0; + struct crypto_tfm *tfm = crypto_aead_tfm(parent); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); + struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); + struct aesni_rfc4106_gcm_ctx *child_ctx = + aesni_rfc4106_gcm_ctx_get(cryptd_child); + u8 *new_key_align, *new_key_mem = NULL; + + if (key_len < 4) { + crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + /*Account for 4 byte nonce at the end.*/ + key_len -= 4; + if (key_len != AES_KEYSIZE_128 && key_len != AES_KEYSIZE_192 && + key_len != AES_KEYSIZE_256) { + crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce)); + /*This must be on a 16 byte boundary!*/ + if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN) + return -EINVAL; + + if ((unsigned long)key % AESNI_ALIGN) { + /*key is not aligned: use an auxuliar aligned pointer*/ + new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL); + if (!new_key_mem) + return -ENOMEM; + + new_key_align = PTR_ALIGN(new_key_mem, AESNI_ALIGN); + memcpy(new_key_align, key, key_len); + key = new_key_align; + } + + if (!irq_fpu_usable()) + ret = crypto_aes_expand_key(&(ctx->aes_key_expanded), + key, key_len); + else { + kernel_fpu_begin(); + ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len); + kernel_fpu_end(); + } + /*This must be on a 16 byte boundary!*/ + if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) { + ret = -EINVAL; + goto exit; + } + ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); + memcpy(child_ctx, ctx, sizeof(*ctx)); +exit: + kfree(new_key_mem); + return ret; +} + +/* This is the Integrity Check Value (aka the authentication tag length and can + * be 8, 12 or 16 bytes long. */ +static int rfc4106_set_authsize(struct crypto_aead *parent, + unsigned int authsize) +{ + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent); + struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); + + switch (authsize) { + case 8: + case 12: + case 16: + break; + default: + return -EINVAL; + } + crypto_aead_crt(parent)->authsize = authsize; + crypto_aead_crt(cryptd_child)->authsize = authsize; + return 0; +} + +static int rfc4106_encrypt(struct aead_request *req) +{ + int ret; + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + + if (!irq_fpu_usable()) { + struct aead_request *cryptd_req = + (struct aead_request *) aead_request_ctx(req); + memcpy(cryptd_req, req, sizeof(*req)); + aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + return crypto_aead_encrypt(cryptd_req); + } else { + struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); + kernel_fpu_begin(); + ret = cryptd_child->base.crt_aead.encrypt(req); + kernel_fpu_end(); + return ret; + } +} + +static int rfc4106_decrypt(struct aead_request *req) +{ + int ret; + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + + if (!irq_fpu_usable()) { + struct aead_request *cryptd_req = + (struct aead_request *) aead_request_ctx(req); + memcpy(cryptd_req, req, sizeof(*req)); + aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base); + return crypto_aead_decrypt(cryptd_req); + } else { + struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm); + kernel_fpu_begin(); + ret = cryptd_child->base.crt_aead.decrypt(req); + kernel_fpu_end(); + return ret; + } +} + +static int __driver_rfc4106_encrypt(struct aead_request *req) +{ + u8 one_entry_in_sg = 0; + u8 *src, *dst, *assoc; + __be32 counter = cpu_to_be32(1); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + u32 key_len = ctx->aes_key_expanded.key_length; + void *aes_ctx = &(ctx->aes_key_expanded); + unsigned long auth_tag_len = crypto_aead_authsize(tfm); + u8 iv_tab[16+AESNI_ALIGN]; + u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN); + struct scatter_walk src_sg_walk; + struct scatter_walk assoc_sg_walk; + struct scatter_walk dst_sg_walk; + unsigned int i; + + /* Assuming we are supporting rfc4106 64-bit extended */ + /* sequence numbers We need to have the AAD length equal */ + /* to 8 or 12 bytes */ + if (unlikely(req->assoclen != 8 && req->assoclen != 12)) + return -EINVAL; + if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16)) + return -EINVAL; + if (unlikely(key_len != AES_KEYSIZE_128 && + key_len != AES_KEYSIZE_192 && + key_len != AES_KEYSIZE_256)) + return -EINVAL; + + /* IV below built */ + for (i = 0; i < 4; i++) + *(iv+i) = ctx->nonce[i]; + for (i = 0; i < 8; i++) + *(iv+4+i) = req->iv[i]; + *((__be32 *)(iv+12)) = counter; + + if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) { + one_entry_in_sg = 1; + scatterwalk_start(&src_sg_walk, req->src); + scatterwalk_start(&assoc_sg_walk, req->assoc); + src = scatterwalk_map(&src_sg_walk, 0); + assoc = scatterwalk_map(&assoc_sg_walk, 0); + dst = src; + if (unlikely(req->src != req->dst)) { + scatterwalk_start(&dst_sg_walk, req->dst); + dst = scatterwalk_map(&dst_sg_walk, 0); + } + + } else { + /* Allocate memory for src, dst, assoc */ + src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen, + GFP_ATOMIC); + if (unlikely(!src)) + return -ENOMEM; + assoc = (src + req->cryptlen + auth_tag_len); + scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0); + scatterwalk_map_and_copy(assoc, req->assoc, 0, + req->assoclen, 0); + dst = src; + } + + aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv, + ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst + + ((unsigned long)req->cryptlen), auth_tag_len); + + /* The authTag (aka the Integrity Check Value) needs to be written + * back to the packet. */ + if (one_entry_in_sg) { + if (unlikely(req->src != req->dst)) { + scatterwalk_unmap(dst, 0); + scatterwalk_done(&dst_sg_walk, 0, 0); + } + scatterwalk_unmap(src, 0); + scatterwalk_unmap(assoc, 0); + scatterwalk_done(&src_sg_walk, 0, 0); + scatterwalk_done(&assoc_sg_walk, 0, 0); + } else { + scatterwalk_map_and_copy(dst, req->dst, 0, + req->cryptlen + auth_tag_len, 1); + kfree(src); + } + return 0; +} + +static int __driver_rfc4106_decrypt(struct aead_request *req) +{ + u8 one_entry_in_sg = 0; + u8 *src, *dst, *assoc; + unsigned long tempCipherLen = 0; + __be32 counter = cpu_to_be32(1); + int retval = 0; + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); + u32 key_len = ctx->aes_key_expanded.key_length; + void *aes_ctx = &(ctx->aes_key_expanded); + unsigned long auth_tag_len = crypto_aead_authsize(tfm); + u8 iv_and_authTag[32+AESNI_ALIGN]; + u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN); + u8 *authTag = iv + 16; + struct scatter_walk src_sg_walk; + struct scatter_walk assoc_sg_walk; + struct scatter_walk dst_sg_walk; + unsigned int i; + + if (unlikely((req->cryptlen < auth_tag_len) || + (req->assoclen != 8 && req->assoclen != 12))) + return -EINVAL; + if (unlikely(auth_tag_len != 8 && auth_tag_len != 12 && auth_tag_len != 16)) + return -EINVAL; + if (unlikely(key_len != AES_KEYSIZE_128 && + key_len != AES_KEYSIZE_192 && + key_len != AES_KEYSIZE_256)) + return -EINVAL; + + /* Assuming we are supporting rfc4106 64-bit extended */ + /* sequence numbers We need to have the AAD length */ + /* equal to 8 or 12 bytes */ + + tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len); + /* IV below built */ + for (i = 0; i < 4; i++) + *(iv+i) = ctx->nonce[i]; + for (i = 0; i < 8; i++) + *(iv+4+i) = req->iv[i]; + *((__be32 *)(iv+12)) = counter; + + if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) { + one_entry_in_sg = 1; + scatterwalk_start(&src_sg_walk, req->src); + scatterwalk_start(&assoc_sg_walk, req->assoc); + src = scatterwalk_map(&src_sg_walk, 0); + assoc = scatterwalk_map(&assoc_sg_walk, 0); + dst = src; + if (unlikely(req->src != req->dst)) { + scatterwalk_start(&dst_sg_walk, req->dst); + dst = scatterwalk_map(&dst_sg_walk, 0); + } + + } else { + /* Allocate memory for src, dst, assoc */ + src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC); + if (!src) + return -ENOMEM; + assoc = (src + req->cryptlen); + scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0); + scatterwalk_map_and_copy(assoc, req->assoc, 0, + req->assoclen, 0); + dst = src; + } + + aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv, + ctx->hash_subkey, assoc, (unsigned long)req->assoclen, + authTag, auth_tag_len); + + /* Compare generated tag with passed in tag. */ + retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ? + -EBADMSG : 0; + + if (one_entry_in_sg) { + if (unlikely(req->src != req->dst)) { + scatterwalk_unmap(dst, 0); + scatterwalk_done(&dst_sg_walk, 0, 0); + } + scatterwalk_unmap(src, 0); + scatterwalk_unmap(assoc, 0); + scatterwalk_done(&src_sg_walk, 0, 0); + scatterwalk_done(&assoc_sg_walk, 0, 0); + } else { + scatterwalk_map_and_copy(dst, req->dst, 0, tempCipherLen, 1); + kfree(src); + } + return retval; +} +#endif + +static struct crypto_alg aesni_algs[] = { { + .cra_name = "aes", + .cra_driver_name = "aes-aesni", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx) + + AESNI_ALIGN - 1, + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + .cra_u = { + .cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = aes_set_key, + .cia_encrypt = aes_encrypt, + .cia_decrypt = aes_decrypt + } + } +}, { + .cra_name = "__aes-aesni", + .cra_driver_name = "__driver-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx) + + AESNI_ALIGN - 1, + .cra_alignmask = 0, + .cra_module = THIS_MODULE, + .cra_u = { + .cipher = { + .cia_min_keysize = AES_MIN_KEY_SIZE, + .cia_max_keysize = AES_MAX_KEY_SIZE, + .cia_setkey = aes_set_key, + .cia_encrypt = __aes_encrypt, + .cia_decrypt = __aes_decrypt + } + } +}, { + .cra_name = "__ecb-aes-aesni", + .cra_driver_name = "__driver-ecb-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx) + + AESNI_ALIGN - 1, + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = aes_set_key, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}, { + .cra_name = "__cbc-aes-aesni", + .cra_driver_name = "__driver-cbc-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct crypto_aes_ctx) + + AESNI_ALIGN - 1, + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = aes_set_key, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}, { + .cra_name = "ecb(aes)", + .cra_driver_name = "ecb-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_lrw_alg.cra_list), - .cra_init = ablk_lrw_init, + .cra_init = ablk_ecb_init, .cra_exit = ablk_exit, .cra_u = { .ablkcipher = { - .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, - .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { + .cra_name = "cbc(aes)", + .cra_driver_name = "cbc-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_cbc_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, .ivsize = AES_BLOCK_SIZE, .setkey = ablk_set_key, .encrypt = ablk_encrypt, .decrypt = ablk_decrypt, }, }, -}; +#ifdef CONFIG_X86_64 +}, { + .cra_name = "__ctr-aes-aesni", + .cra_driver_name = "__driver-ctr-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct crypto_aes_ctx) + + AESNI_ALIGN - 1, + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = aes_set_key, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}, { + .cra_name = "ctr(aes)", + .cra_driver_name = "ctr-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_ctr_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE, + .max_keysize = AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_encrypt, + .geniv = "chainiv", + }, + }, +}, { + .cra_name = "__gcm-aes-aesni", + .cra_driver_name = "__driver-gcm-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_AEAD, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + + AESNI_ALIGN, + .cra_alignmask = 0, + .cra_type = &crypto_aead_type, + .cra_module = THIS_MODULE, + .cra_u = { + .aead = { + .encrypt = __driver_rfc4106_encrypt, + .decrypt = __driver_rfc4106_decrypt, + }, + }, +}, { + .cra_name = "rfc4106(gcm(aes))", + .cra_driver_name = "rfc4106-gcm-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + + AESNI_ALIGN, + .cra_alignmask = 0, + .cra_type = &crypto_nivaead_type, + .cra_module = THIS_MODULE, + .cra_init = rfc4106_init, + .cra_exit = rfc4106_exit, + .cra_u = { + .aead = { + .setkey = rfc4106_set_key, + .setauthsize = rfc4106_set_authsize, + .encrypt = rfc4106_encrypt, + .decrypt = rfc4106_decrypt, + .geniv = "seqiv", + .ivsize = 8, + .maxauthsize = 16, + }, + }, #endif - #ifdef HAS_PCBC -static int ablk_pcbc_init(struct crypto_tfm *tfm) -{ - struct cryptd_ablkcipher *cryptd_tfm; - - cryptd_tfm = cryptd_alloc_ablkcipher("fpu(pcbc(__driver-aes-aesni))", - 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; -} - -static struct crypto_alg ablk_pcbc_alg = { +}, { .cra_name = "pcbc(aes)", .cra_driver_name = "pcbc-aes-aesni", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_pcbc_alg.cra_list), .cra_init = ablk_pcbc_init, .cra_exit = ablk_exit, .cra_u = { @@ -582,34 +1268,81 @@ static struct crypto_alg ablk_pcbc_alg = .decrypt = ablk_decrypt, }, }, -}; #endif - -#ifdef HAS_XTS -static int ablk_xts_init(struct crypto_tfm *tfm) -{ - struct cryptd_ablkcipher *cryptd_tfm; - - cryptd_tfm = cryptd_alloc_ablkcipher("fpu(xts(__driver-aes-aesni))", - 0, 0); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - ablk_init_common(tfm, cryptd_tfm); - return 0; -} - -static struct crypto_alg ablk_xts_alg = { +}, { + .cra_name = "__lrw-aes-aesni", + .cra_driver_name = "__driver-lrw-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct aesni_lrw_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_exit = lrw_aesni_exit_tfm, + .cra_u = { + .blkcipher = { + .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, + .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = lrw_aesni_setkey, + .encrypt = lrw_encrypt, + .decrypt = lrw_decrypt, + }, + }, +}, { + .cra_name = "__xts-aes-aesni", + .cra_driver_name = "__driver-xts-aes-aesni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct aesni_xts_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_u = { + .blkcipher = { + .min_keysize = 2 * AES_MIN_KEY_SIZE, + .max_keysize = 2 * AES_MAX_KEY_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = xts_aesni_setkey, + .encrypt = xts_encrypt, + .decrypt = xts_decrypt, + }, + }, +}, { + .cra_name = "lrw(aes)", + .cra_driver_name = "lrw-aes-aesni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, + .cra_blocksize = AES_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct async_helper_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_ablkcipher_type, + .cra_module = THIS_MODULE, + .cra_init = ablk_init, + .cra_exit = ablk_exit, + .cra_u = { + .ablkcipher = { + .min_keysize = AES_MIN_KEY_SIZE + AES_BLOCK_SIZE, + .max_keysize = AES_MAX_KEY_SIZE + AES_BLOCK_SIZE, + .ivsize = AES_BLOCK_SIZE, + .setkey = ablk_set_key, + .encrypt = ablk_encrypt, + .decrypt = ablk_decrypt, + }, + }, +}, { .cra_name = "xts(aes)", .cra_driver_name = "xts-aes-aesni", .cra_priority = 400, - .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER|CRYPTO_ALG_ASYNC, + .cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | CRYPTO_ALG_ASYNC, .cra_blocksize = AES_BLOCK_SIZE, - .cra_ctxsize = sizeof(struct async_aes_ctx), + .cra_ctxsize = sizeof(struct async_helper_ctx), .cra_alignmask = 0, .cra_type = &crypto_ablkcipher_type, .cra_module = THIS_MODULE, - .cra_list = LIST_HEAD_INIT(ablk_xts_alg.cra_list), - .cra_init = ablk_xts_init, + .cra_init = ablk_init, .cra_exit = ablk_exit, .cra_u = { .ablkcipher = { @@ -621,98 +1354,32 @@ static struct crypto_alg ablk_xts_alg = .decrypt = ablk_decrypt, }, }, -}; -#endif +} }; static int __init aesni_init(void) { - int err; + int err, i; if (!cpu_has_aes) { printk(KERN_INFO "Intel AES-NI instructions are not detected.\n"); return -ENODEV; } - if ((err = crypto_register_alg(&aesni_alg))) - goto aes_err; - if ((err = crypto_register_alg(&__aesni_alg))) - goto __aes_err; - if ((err = crypto_register_alg(&blk_ecb_alg))) - goto blk_ecb_err; - if ((err = crypto_register_alg(&blk_cbc_alg))) - goto blk_cbc_err; - if ((err = crypto_register_alg(&ablk_ecb_alg))) - goto ablk_ecb_err; - if ((err = crypto_register_alg(&ablk_cbc_alg))) - goto ablk_cbc_err; -#ifdef HAS_CTR - if ((err = crypto_register_alg(&ablk_ctr_alg))) - goto ablk_ctr_err; -#endif -#ifdef HAS_LRW - if ((err = crypto_register_alg(&ablk_lrw_alg))) - goto ablk_lrw_err; -#endif -#ifdef HAS_PCBC - if ((err = crypto_register_alg(&ablk_pcbc_alg))) - goto ablk_pcbc_err; -#endif -#ifdef HAS_XTS - if ((err = crypto_register_alg(&ablk_xts_alg))) - goto ablk_xts_err; -#endif - return err; + err = crypto_fpu_init(); + if (err) + return err; -#ifdef HAS_XTS -ablk_xts_err: -#endif -#ifdef HAS_PCBC - crypto_unregister_alg(&ablk_pcbc_alg); -ablk_pcbc_err: -#endif -#ifdef HAS_LRW - crypto_unregister_alg(&ablk_lrw_alg); -ablk_lrw_err: -#endif -#ifdef HAS_CTR - crypto_unregister_alg(&ablk_ctr_alg); -ablk_ctr_err: -#endif - crypto_unregister_alg(&ablk_cbc_alg); -ablk_cbc_err: - crypto_unregister_alg(&ablk_ecb_alg); -ablk_ecb_err: - crypto_unregister_alg(&blk_cbc_alg); -blk_cbc_err: - crypto_unregister_alg(&blk_ecb_alg); -blk_ecb_err: - crypto_unregister_alg(&__aesni_alg); -__aes_err: - crypto_unregister_alg(&aesni_alg); -aes_err: - return err; + for (i = 0; i < ARRAY_SIZE(aesni_algs); i++) + INIT_LIST_HEAD(&aesni_algs[i].cra_list); + + return crypto_register_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); } static void __exit aesni_exit(void) { -#ifdef HAS_XTS - crypto_unregister_alg(&ablk_xts_alg); -#endif -#ifdef HAS_PCBC - crypto_unregister_alg(&ablk_pcbc_alg); -#endif -#ifdef HAS_LRW - crypto_unregister_alg(&ablk_lrw_alg); -#endif -#ifdef HAS_CTR - crypto_unregister_alg(&ablk_ctr_alg); -#endif - crypto_unregister_alg(&ablk_cbc_alg); - crypto_unregister_alg(&ablk_ecb_alg); - crypto_unregister_alg(&blk_cbc_alg); - crypto_unregister_alg(&blk_ecb_alg); - crypto_unregister_alg(&__aesni_alg); - crypto_unregister_alg(&aesni_alg); + crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); + + crypto_fpu_exit(); } module_init(aesni_init); diff -uprN linux-2.6.32/arch/x86/crypto/aes-x86_64-asm_64.S linux-2.6.32.ovz/arch/x86/crypto/aes-x86_64-asm_64.S --- linux-2.6.32/arch/x86/crypto/aes-x86_64-asm_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/aes-x86_64-asm_64.S 2015-03-10 13:23:55.000000000 -0700 @@ -15,6 +15,7 @@ .text +#include #include #define R1 %rax @@ -49,10 +50,8 @@ #define R11 %r11 #define prologue(FUNC,KEY,B128,B192,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11) \ - .global FUNC; \ - .type FUNC,@function; \ - .align 8; \ -FUNC: movq r1,r2; \ + ENTRY(FUNC); \ + movq r1,r2; \ movq r3,r4; \ leaq KEY+48(r8),r9; \ movq r10,r11; \ @@ -71,14 +70,15 @@ FUNC: movq r1,r2; \ je B192; \ leaq 32(r9),r9; -#define epilogue(r1,r2,r3,r4,r5,r6,r7,r8,r9) \ +#define epilogue(FUNC,r1,r2,r3,r4,r5,r6,r7,r8,r9) \ movq r1,r2; \ movq r3,r4; \ movl r5 ## E,(r9); \ movl r6 ## E,4(r9); \ movl r7 ## E,8(r9); \ movl r8 ## E,12(r9); \ - ret; + ret; \ + ENDPROC(FUNC); #define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \ movzbl r2 ## H,r5 ## E; \ @@ -133,7 +133,7 @@ FUNC: movq r1,r2; \ #define entry(FUNC,KEY,B128,B192) \ prologue(FUNC,KEY,B128,B192,R2,R8,R7,R9,R1,R3,R4,R6,R10,R5,R11) -#define return epilogue(R8,R2,R9,R7,R5,R6,R3,R4,R11) +#define return(FUNC) epilogue(FUNC,R8,R2,R9,R7,R5,R6,R3,R4,R11) #define encrypt_round(TAB,OFFSET) \ round(TAB,OFFSET,R1,R2,R3,R4,R5,R6,R7,R10,R5,R6,R3,R4) \ @@ -151,12 +151,12 @@ FUNC: movq r1,r2; \ /* void aes_enc_blk(stuct crypto_tfm *tfm, u8 *out, const u8 *in) */ - entry(aes_enc_blk,0,enc128,enc192) + entry(aes_enc_blk,0,.Le128,.Le192) encrypt_round(crypto_ft_tab,-96) encrypt_round(crypto_ft_tab,-80) -enc192: encrypt_round(crypto_ft_tab,-64) +.Le192: encrypt_round(crypto_ft_tab,-64) encrypt_round(crypto_ft_tab,-48) -enc128: encrypt_round(crypto_ft_tab,-32) +.Le128: encrypt_round(crypto_ft_tab,-32) encrypt_round(crypto_ft_tab,-16) encrypt_round(crypto_ft_tab, 0) encrypt_round(crypto_ft_tab, 16) @@ -166,16 +166,16 @@ enc128: encrypt_round(crypto_ft_tab,-32) encrypt_round(crypto_ft_tab, 80) encrypt_round(crypto_ft_tab, 96) encrypt_final(crypto_fl_tab,112) - return + return(aes_enc_blk) /* void aes_dec_blk(struct crypto_tfm *tfm, u8 *out, const u8 *in) */ - entry(aes_dec_blk,240,dec128,dec192) + entry(aes_dec_blk,240,.Ld128,.Ld192) decrypt_round(crypto_it_tab,-96) decrypt_round(crypto_it_tab,-80) -dec192: decrypt_round(crypto_it_tab,-64) +.Ld192: decrypt_round(crypto_it_tab,-64) decrypt_round(crypto_it_tab,-48) -dec128: decrypt_round(crypto_it_tab,-32) +.Ld128: decrypt_round(crypto_it_tab,-32) decrypt_round(crypto_it_tab,-16) decrypt_round(crypto_it_tab, 0) decrypt_round(crypto_it_tab, 16) @@ -185,4 +185,4 @@ dec128: decrypt_round(crypto_it_tab,-32) decrypt_round(crypto_it_tab, 80) decrypt_round(crypto_it_tab, 96) decrypt_final(crypto_il_tab,112) - return + return(aes_dec_blk) diff -uprN linux-2.6.32/arch/x86/crypto/crc32c-intel.c linux-2.6.32.ovz/arch/x86/crypto/crc32c-intel.c --- linux-2.6.32/arch/x86/crypto/crc32c-intel.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/crc32c-intel.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,198 +0,0 @@ -/* - * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal. - * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE) - * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at: - * http://www.intel.com/products/processor/manuals/ - * Intel(R) 64 and IA-32 Architectures Software Developer's Manual - * Volume 2A: Instruction Set Reference, A-M - * - * Copyright (C) 2008 Intel Corporation - * Authors: Austin Zhang - * Kent Liu - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * - */ -#include -#include -#include -#include -#include - -#include - -#define CHKSUM_BLOCK_SIZE 1 -#define CHKSUM_DIGEST_SIZE 4 - -#define SCALE_F sizeof(unsigned long) - -#ifdef CONFIG_X86_64 -#define REX_PRE "0x48, " -#else -#define REX_PRE -#endif - -static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) -{ - while (length--) { - __asm__ __volatile__( - ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1" - :"=S"(crc) - :"0"(crc), "c"(*data) - ); - data++; - } - - return crc; -} - -static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len) -{ - unsigned int iquotient = len / SCALE_F; - unsigned int iremainder = len % SCALE_F; - unsigned long *ptmp = (unsigned long *)p; - - while (iquotient--) { - __asm__ __volatile__( - ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;" - :"=S"(crc) - :"0"(crc), "c"(*ptmp) - ); - ptmp++; - } - - if (iremainder) - crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp, - iremainder); - - return crc; -} - -/* - * Setting the seed allows arbitrary accumulators and flexible XOR policy - * If your algorithm starts with ~0, then XOR with ~0 before you set - * the seed. - */ -static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key, - unsigned int keylen) -{ - u32 *mctx = crypto_shash_ctx(hash); - - if (keylen != sizeof(u32)) { - crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); - return -EINVAL; - } - *mctx = le32_to_cpup((__le32 *)key); - return 0; -} - -static int crc32c_intel_init(struct shash_desc *desc) -{ - u32 *mctx = crypto_shash_ctx(desc->tfm); - u32 *crcp = shash_desc_ctx(desc); - - *crcp = *mctx; - - return 0; -} - -static int crc32c_intel_update(struct shash_desc *desc, const u8 *data, - unsigned int len) -{ - u32 *crcp = shash_desc_ctx(desc); - - *crcp = crc32c_intel_le_hw(*crcp, data, len); - return 0; -} - -static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len, - u8 *out) -{ - *(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len)); - return 0; -} - -static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out); -} - -static int crc32c_intel_final(struct shash_desc *desc, u8 *out) -{ - u32 *crcp = shash_desc_ctx(desc); - - *(__le32 *)out = ~cpu_to_le32p(crcp); - return 0; -} - -static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *out) -{ - return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len, - out); -} - -static int crc32c_intel_cra_init(struct crypto_tfm *tfm) -{ - u32 *key = crypto_tfm_ctx(tfm); - - *key = ~0; - - return 0; -} - -static struct shash_alg alg = { - .setkey = crc32c_intel_setkey, - .init = crc32c_intel_init, - .update = crc32c_intel_update, - .final = crc32c_intel_final, - .finup = crc32c_intel_finup, - .digest = crc32c_intel_digest, - .descsize = sizeof(u32), - .digestsize = CHKSUM_DIGEST_SIZE, - .base = { - .cra_name = "crc32c", - .cra_driver_name = "crc32c-intel", - .cra_priority = 200, - .cra_blocksize = CHKSUM_BLOCK_SIZE, - .cra_ctxsize = sizeof(u32), - .cra_module = THIS_MODULE, - .cra_init = crc32c_intel_cra_init, - } -}; - - -static int __init crc32c_intel_mod_init(void) -{ - if (cpu_has_xmm4_2) - return crypto_register_shash(&alg); - else - return -ENODEV; -} - -static void __exit crc32c_intel_mod_fini(void) -{ - crypto_unregister_shash(&alg); -} - -module_init(crc32c_intel_mod_init); -module_exit(crc32c_intel_mod_fini); - -MODULE_AUTHOR("Austin Zhang , Kent Liu "); -MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware."); -MODULE_LICENSE("GPL"); - -MODULE_ALIAS("crc32c"); -MODULE_ALIAS("crc32c-intel"); diff -uprN linux-2.6.32/arch/x86/crypto/crc32c-intel_glue.c linux-2.6.32.ovz/arch/x86/crypto/crc32c-intel_glue.c --- linux-2.6.32/arch/x86/crypto/crc32c-intel_glue.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/crc32c-intel_glue.c 2015-03-10 13:24:15.000000000 -0700 @@ -0,0 +1,269 @@ +/* + * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal. + * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE) + * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at: + * http://www.intel.com/products/processor/manuals/ + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual + * Volume 2A: Instruction Set Reference, A-M + * + * Copyright (C) 2008 Intel Corporation + * Authors: Austin Zhang + * Kent Liu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ +#include +#include +#include +#include +#include + +#include +#include + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +#define SCALE_F sizeof(unsigned long) + +#ifdef CONFIG_X86_64 +#define REX_PRE "0x48, " +#else +#define REX_PRE +#endif + +#ifdef CONFIG_X86_64 +/* + * use carryless multiply version of crc32c when buffer + * size is >= 512 (when eager fpu is enabled) or + * >= 1024 (when eager fpu is disabled) to account + * for fpu state save/restore overhead. + */ +#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512 +#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU 1024 + +asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, + unsigned int crc_init); +static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU; +#define set_pcl_breakeven_point() \ + (crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU) +#endif /* CONFIG_X86_64 */ + +static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) +{ + while (length--) { + __asm__ __volatile__( + ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1" + :"=S"(crc) + :"0"(crc), "c"(*data) + ); + data++; + } + + return crc; +} + +static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len) +{ + unsigned int iquotient = len / SCALE_F; + unsigned int iremainder = len % SCALE_F; + unsigned long *ptmp = (unsigned long *)p; + + while (iquotient--) { + __asm__ __volatile__( + ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;" + :"=S"(crc) + :"0"(crc), "c"(*ptmp) + ); + ptmp++; + } + + if (iremainder) + crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp, + iremainder); + + return crc; +} + +/* + * Setting the seed allows arbitrary accumulators and flexible XOR policy + * If your algorithm starts with ~0, then XOR with ~0 before you set + * the seed. + */ +static int crc32c_intel_setkey(struct crypto_shash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_shash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_shash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32c_intel_init(struct shash_desc *desc) +{ + u32 *mctx = crypto_shash_ctx(desc->tfm); + u32 *crcp = shash_desc_ctx(desc); + + *crcp = *mctx; + + return 0; +} + +static int crc32c_intel_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + *crcp = crc32c_intel_le_hw(*crcp, data, len); + return 0; +} + +static int __crc32c_intel_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + *(__le32 *)out = ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len)); + return 0; +} + +static int crc32c_intel_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_intel_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32c_intel_final(struct shash_desc *desc, u8 *out) +{ + u32 *crcp = shash_desc_ctx(desc); + + *(__le32 *)out = ~cpu_to_le32p(crcp); + return 0; +} + +static int crc32c_intel_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_intel_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} + +static int crc32c_intel_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = ~0; + + return 0; +} + +#ifdef CONFIG_X86_64 +static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + u32 *crcp = shash_desc_ctx(desc); + + /* + * use faster PCL version if datasize is large enough to + * overcome kernel fpu state save/restore overhead + */ + if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + kernel_fpu_begin(); + *crcp = crc_pcl(data, len, *crcp); + kernel_fpu_end(); + } else + *crcp = crc32c_intel_le_hw(*crcp, data, len); + return 0; +} + +static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + kernel_fpu_begin(); + *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); + kernel_fpu_end(); + } else + *(__le32 *)out = + ~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len)); + return 0; +} + +static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out); +} + +static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len, + out); +} +#endif /* CONFIG_X86_64 */ + +static struct shash_alg alg = { + .setkey = crc32c_intel_setkey, + .init = crc32c_intel_init, + .update = crc32c_intel_update, + .final = crc32c_intel_final, + .finup = crc32c_intel_finup, + .digest = crc32c_intel_digest, + .descsize = sizeof(u32), + .digestsize = CHKSUM_DIGEST_SIZE, + .base = { + .cra_name = "crc32c", + .cra_driver_name = "crc32c-intel", + .cra_priority = 200, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_init = crc32c_intel_cra_init, + } +}; + + +static int __init crc32c_intel_mod_init(void) +{ + if (!cpu_has_xmm4_2) + return -ENODEV; +#ifdef CONFIG_X86_64 + if (cpu_has_pclmulqdq) { + alg.update = crc32c_pcl_intel_update; + alg.finup = crc32c_pcl_intel_finup; + alg.digest = crc32c_pcl_intel_digest; + set_pcl_breakeven_point(); + } +#endif + return crypto_register_shash(&alg); +} + +static void __exit crc32c_intel_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(crc32c_intel_mod_init); +module_exit(crc32c_intel_mod_fini); + +MODULE_AUTHOR("Austin Zhang , Kent Liu "); +MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware."); +MODULE_LICENSE("GPL"); + +MODULE_ALIAS("crc32c"); +MODULE_ALIAS("crc32c-intel"); diff -uprN linux-2.6.32/arch/x86/crypto/crc32c-pcl-intel-asm_64.S linux-2.6.32.ovz/arch/x86/crypto/crc32c-pcl-intel-asm_64.S --- linux-2.6.32/arch/x86/crypto/crc32c-pcl-intel-asm_64.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/crc32c-pcl-intel-asm_64.S 2015-03-10 13:24:15.000000000 -0700 @@ -0,0 +1,460 @@ +/* + * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64) + * + * The white paper on CRC32C calculations with PCLMULQDQ instruction can be + * downloaded from: + * http://download.intel.com/design/intarch/papers/323405.pdf + * + * Copyright (C) 2012 Intel Corporation. + * + * Authors: + * Wajdi Feghali + * James Guilford + * David Cote + * Tim Chen + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction + +.macro LABEL prefix n +\prefix\n\(): +.endm + +.macro JMPTBL_ENTRY i +.word crc_\i - crc_array +.endm + +.macro JNC_LESS_THAN j + jnc less_than_\j +.endm + +# Define threshold where buffers are considered "small" and routed to more +# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so +# SMALL_SIZE can be no larger than 255. + +#define SMALL_SIZE 200 + +.if (SMALL_SIZE > 255) +.error "SMALL_ SIZE must be < 256" +.endif + +# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); + +.global crc_pcl +crc_pcl: +#define bufp %rdi +#define bufp_dw %edi +#define bufp_w %di +#define bufp_b %dil +#define bufptmp %rcx +#define block_0 %rcx +#define block_1 %rdx +#define block_2 %r11 +#define len %rsi +#define len_dw %esi +#define len_w %si +#define len_b %sil +#define crc_init_arg %rdx +#define tmp %rbx +#define crc_init %r8 +#define crc_init_dw %r8d +#define crc1 %r9 +#define crc2 %r10 + + pushq %rbx + pushq %rdi + pushq %rsi + + ## Move crc_init for Linux to a different + mov crc_init_arg, crc_init + + ################################################################ + ## 1) ALIGN: + ################################################################ + + mov bufp, bufptmp # rdi = *buf + neg bufp + and $7, bufp # calculate the unalignment amount of + # the address + je proc_block # Skip if aligned + + ## If len is less than 8 and we're unaligned, we need to jump + ## to special code to avoid reading beyond the end of the buffer + cmp $8, len + jae do_align + # less_than_8 expects length in upper 3 bits of len_dw + # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] + shl $32-3+1, len_dw + jmp less_than_8_post_shl1 + +do_align: + #### Calculate CRC of unaligned bytes of the buffer (if any) + movq (bufptmp), tmp # load a quadward from the buffer + add bufp, bufptmp # align buffer pointer for quadword + # processing + sub bufp, len # update buffer length +align_loop: + crc32b %bl, crc_init_dw # compute crc32 of 1-byte + shr $8, tmp # get next byte + dec bufp + jne align_loop + +proc_block: + + ################################################################ + ## 2) PROCESS BLOCKS: + ################################################################ + + ## compute num of bytes to be processed + movq len, tmp # save num bytes in tmp + + cmpq $128*24, len + jae full_block + +continue_block: + cmpq $SMALL_SIZE, len + jb small + + ## len < 128*24 + movq $2731, %rax # 2731 = ceil(2^16 / 24) + mul len_dw + shrq $16, %rax + + ## eax contains floor(bytes / 24) = num 24-byte chunks to do + + ## process rax 24-byte chunks (128 >= rax >= 0) + + ## compute end address of each block + ## block 0 (base addr + RAX * 8) + ## block 1 (base addr + RAX * 16) + ## block 2 (base addr + RAX * 24) + lea (bufptmp, %rax, 8), block_0 + lea (block_0, %rax, 8), block_1 + lea (block_1, %rax, 8), block_2 + + xor crc1, crc1 + xor crc2, crc2 + + ## branch into array + lea jump_table(%rip), bufp + movzxw (bufp, %rax, 2), len + offset=crc_array-jump_table + lea offset(bufp, len, 1), bufp + jmp *bufp + + ################################################################ + ## 2a) PROCESS FULL BLOCKS: + ################################################################ +full_block: + movq $128,%rax + lea 128*8*2(block_0), block_1 + lea 128*8*3(block_0), block_2 + add $128*8*1, block_0 + + xor crc1,crc1 + xor crc2,crc2 + + # Fall thruogh into top of crc array (crc_128) + + ################################################################ + ## 3) CRC Array: + ################################################################ + +crc_array: + i=128 +.rept 128-1 +.altmacro +LABEL crc_ %i +.noaltmacro + crc32q -i*8(block_0), crc_init + crc32q -i*8(block_1), crc1 + crc32q -i*8(block_2), crc2 + i=(i-1) +.endr + +.altmacro +LABEL crc_ %i +.noaltmacro + crc32q -i*8(block_0), crc_init + crc32q -i*8(block_1), crc1 +# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet + + mov block_2, block_0 + + ################################################################ + ## 4) Combine three results: + ################################################################ + + lea (K_table-16)(%rip), bufp # first entry is for idx 1 + shlq $3, %rax # rax *= 8 + subq %rax, tmp # tmp -= rax*8 + shlq $1, %rax + subq %rax, tmp # tmp -= rax*16 + # (total tmp -= rax*24) + addq %rax, bufp + + movdqa (bufp), %xmm0 # 2 consts: K1:K2 + + movq crc_init, %xmm1 # CRC for block 1 + pclmulqdq $0x00,%xmm0,%xmm1 # Multiply by K2 + + movq crc1, %xmm2 # CRC for block 2 + pclmulqdq $0x10, %xmm0, %xmm2 # Multiply by K1 + + pxor %xmm2,%xmm1 + movq %xmm1, %rax + xor -i*8(block_2), %rax + mov crc2, crc_init + crc32 %rax, crc_init + +################################################################ +## 5) Check for end: +################################################################ + +LABEL crc_ 0 + mov tmp, len + cmp $128*24, tmp + jae full_block + cmp $24, tmp + jae continue_block + +less_than_24: + shl $32-4, len_dw # less_than_16 expects length + # in upper 4 bits of len_dw + jnc less_than_16 + crc32q (bufptmp), crc_init + crc32q 8(bufptmp), crc_init + jz do_return + add $16, bufptmp + # len is less than 8 if we got here + # less_than_8 expects length in upper 3 bits of len_dw + # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] + shl $2, len_dw + jmp less_than_8_post_shl1 + + ####################################################################### + ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) + ####################################################################### +small: + shl $32-8, len_dw # Prepare len_dw for less_than_256 + j=256 +.rept 5 # j = {256, 128, 64, 32, 16} +.altmacro +LABEL less_than_ %j # less_than_j: Length should be in + # upper lg(j) bits of len_dw + j=(j/2) + shl $1, len_dw # Get next MSB + JNC_LESS_THAN %j +.noaltmacro + i=0 +.rept (j/8) + crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data + i=i+8 +.endr + jz do_return # Return if remaining length is zero + add $j, bufptmp # Advance buf +.endr + +less_than_8: # Length should be stored in + # upper 3 bits of len_dw + shl $1, len_dw +less_than_8_post_shl1: + jnc less_than_4 + crc32l (bufptmp), crc_init_dw # CRC of 4 bytes + jz do_return # return if remaining data is zero + add $4, bufptmp +less_than_4: # Length should be stored in + # upper 2 bits of len_dw + shl $1, len_dw + jnc less_than_2 + crc32w (bufptmp), crc_init_dw # CRC of 2 bytes + jz do_return # return if remaining data is zero + add $2, bufptmp +less_than_2: # Length should be stored in the MSB + # of len_dw + shl $1, len_dw + jnc less_than_1 + crc32b (bufptmp), crc_init_dw # CRC of 1 byte +less_than_1: # Length should be zero +do_return: + movq crc_init, %rax + popq %rsi + popq %rdi + popq %rbx + ret + + ################################################################ + ## jump table Table is 129 entries x 2 bytes each + ################################################################ +.align 4 +jump_table: + i=0 +.rept 129 +.altmacro +JMPTBL_ENTRY %i +.noaltmacro + i=i+1 +.endr + ################################################################ + ## PCLMULQDQ tables + ## Table is 128 entries x 2 quad words each + ################################################################ +.data +.align 64 +K_table: + .quad 0x14cd00bd6,0x105ec76f0 + .quad 0x0ba4fc28e,0x14cd00bd6 + .quad 0x1d82c63da,0x0f20c0dfe + .quad 0x09e4addf8,0x0ba4fc28e + .quad 0x039d3b296,0x1384aa63a + .quad 0x102f9b8a2,0x1d82c63da + .quad 0x14237f5e6,0x01c291d04 + .quad 0x00d3b6092,0x09e4addf8 + .quad 0x0c96cfdc0,0x0740eef02 + .quad 0x18266e456,0x039d3b296 + .quad 0x0daece73e,0x0083a6eec + .quad 0x0ab7aff2a,0x102f9b8a2 + .quad 0x1248ea574,0x1c1733996 + .quad 0x083348832,0x14237f5e6 + .quad 0x12c743124,0x02ad91c30 + .quad 0x0b9e02b86,0x00d3b6092 + .quad 0x018b33a4e,0x06992cea2 + .quad 0x1b331e26a,0x0c96cfdc0 + .quad 0x17d35ba46,0x07e908048 + .quad 0x1bf2e8b8a,0x18266e456 + .quad 0x1a3e0968a,0x11ed1f9d8 + .quad 0x0ce7f39f4,0x0daece73e + .quad 0x061d82e56,0x0f1d0f55e + .quad 0x0d270f1a2,0x0ab7aff2a + .quad 0x1c3f5f66c,0x0a87ab8a8 + .quad 0x12ed0daac,0x1248ea574 + .quad 0x065863b64,0x08462d800 + .quad 0x11eef4f8e,0x083348832 + .quad 0x1ee54f54c,0x071d111a8 + .quad 0x0b3e32c28,0x12c743124 + .quad 0x0064f7f26,0x0ffd852c6 + .quad 0x0dd7e3b0c,0x0b9e02b86 + .quad 0x0f285651c,0x0dcb17aa4 + .quad 0x010746f3c,0x018b33a4e + .quad 0x1c24afea4,0x0f37c5aee + .quad 0x0271d9844,0x1b331e26a + .quad 0x08e766a0c,0x06051d5a2 + .quad 0x093a5f730,0x17d35ba46 + .quad 0x06cb08e5c,0x11d5ca20e + .quad 0x06b749fb2,0x1bf2e8b8a + .quad 0x1167f94f2,0x021f3d99c + .quad 0x0cec3662e,0x1a3e0968a + .quad 0x19329634a,0x08f158014 + .quad 0x0e6fc4e6a,0x0ce7f39f4 + .quad 0x08227bb8a,0x1a5e82106 + .quad 0x0b0cd4768,0x061d82e56 + .quad 0x13c2b89c4,0x188815ab2 + .quad 0x0d7a4825c,0x0d270f1a2 + .quad 0x10f5ff2ba,0x105405f3e + .quad 0x00167d312,0x1c3f5f66c + .quad 0x0f6076544,0x0e9adf796 + .quad 0x026f6a60a,0x12ed0daac + .quad 0x1a2adb74e,0x096638b34 + .quad 0x19d34af3a,0x065863b64 + .quad 0x049c3cc9c,0x1e50585a0 + .quad 0x068bce87a,0x11eef4f8e + .quad 0x1524fa6c6,0x19f1c69dc + .quad 0x16cba8aca,0x1ee54f54c + .quad 0x042d98888,0x12913343e + .quad 0x1329d9f7e,0x0b3e32c28 + .quad 0x1b1c69528,0x088f25a3a + .quad 0x02178513a,0x0064f7f26 + .quad 0x0e0ac139e,0x04e36f0b0 + .quad 0x0170076fa,0x0dd7e3b0c + .quad 0x141a1a2e2,0x0bd6f81f8 + .quad 0x16ad828b4,0x0f285651c + .quad 0x041d17b64,0x19425cbba + .quad 0x1fae1cc66,0x010746f3c + .quad 0x1a75b4b00,0x18db37e8a + .quad 0x0f872e54c,0x1c24afea4 + .quad 0x01e41e9fc,0x04c144932 + .quad 0x086d8e4d2,0x0271d9844 + .quad 0x160f7af7a,0x052148f02 + .quad 0x05bb8f1bc,0x08e766a0c + .quad 0x0a90fd27a,0x0a3c6f37a + .quad 0x0b3af077a,0x093a5f730 + .quad 0x04984d782,0x1d22c238e + .quad 0x0ca6ef3ac,0x06cb08e5c + .quad 0x0234e0b26,0x063ded06a + .quad 0x1d88abd4a,0x06b749fb2 + .quad 0x04597456a,0x04d56973c + .quad 0x0e9e28eb4,0x1167f94f2 + .quad 0x07b3ff57a,0x19385bf2e + .quad 0x0c9c8b782,0x0cec3662e + .quad 0x13a9cba9e,0x0e417f38a + .quad 0x093e106a4,0x19329634a + .quad 0x167001a9c,0x14e727980 + .quad 0x1ddffc5d4,0x0e6fc4e6a + .quad 0x00df04680,0x0d104b8fc + .quad 0x02342001e,0x08227bb8a + .quad 0x00a2a8d7e,0x05b397730 + .quad 0x168763fa6,0x0b0cd4768 + .quad 0x1ed5a407a,0x0e78eb416 + .quad 0x0d2c3ed1a,0x13c2b89c4 + .quad 0x0995a5724,0x1641378f0 + .quad 0x19b1afbc4,0x0d7a4825c + .quad 0x109ffedc0,0x08d96551c + .quad 0x0f2271e60,0x10f5ff2ba + .quad 0x00b0bf8ca,0x00bf80dd2 + .quad 0x123888b7a,0x00167d312 + .quad 0x1e888f7dc,0x18dcddd1c + .quad 0x002ee03b2,0x0f6076544 + .quad 0x183e8d8fe,0x06a45d2b2 + .quad 0x133d7a042,0x026f6a60a + .quad 0x116b0f50c,0x1dd3e10e8 + .quad 0x05fabe670,0x1a2adb74e + .quad 0x130004488,0x0de87806c + .quad 0x000bcf5f6,0x19d34af3a + .quad 0x18f0c7078,0x014338754 + .quad 0x017f27698,0x049c3cc9c + .quad 0x058ca5f00,0x15e3e77ee + .quad 0x1af900c24,0x068bce87a + .quad 0x0b5cfca28,0x0dd07448e + .quad 0x0ded288f8,0x1524fa6c6 + .quad 0x059f229bc,0x1d8048348 + .quad 0x06d390dec,0x16cba8aca + .quad 0x037170390,0x0a3e3e02c + .quad 0x06353c1cc,0x042d98888 + .quad 0x0c4584f5c,0x0d73c7bea + .quad 0x1f16a3418,0x1329d9f7e + .quad 0x0531377e2,0x185137662 + .quad 0x1d8d9ca7c,0x1b1c69528 + .quad 0x0b25b29f2,0x18a08b5bc + .quad 0x19fb2a8b0,0x02178513a + .quad 0x1a08fe6ac,0x1da758ae0 + .quad 0x045cddf4e,0x0e0ac139e + .quad 0x1a91647f2,0x169cf9eb0 + .quad 0x1a0f717c4,0x0170076fa diff -uprN linux-2.6.32/arch/x86/crypto/crct10dif-pcl-asm_64.S linux-2.6.32.ovz/arch/x86/crypto/crct10dif-pcl-asm_64.S --- linux-2.6.32/arch/x86/crypto/crct10dif-pcl-asm_64.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/crct10dif-pcl-asm_64.S 2015-03-10 13:24:13.000000000 -0700 @@ -0,0 +1,643 @@ +######################################################################## +# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions +# +# Copyright (c) 2013, Intel Corporation +# +# Authors: +# Erdinc Ozturk +# Vinodh Gopal +# James Guilford +# Tim Chen +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# OpenIB.org BSD license below: +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of the Intel Corporation nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# +# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +######################################################################## +# Function API: +# UINT16 crc_t10dif_pcl( +# UINT16 init_crc, //initial CRC value, 16 bits +# const unsigned char *buf, //buffer pointer to calculate CRC on +# UINT64 len //buffer length in bytes (64-bit data) +# ); +# +# Reference paper titled "Fast CRC Computation for Generic +# Polynomials Using PCLMULQDQ Instruction" +# URL: http://www.intel.com/content/dam/www/public/us/en/documents +# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +# +# + +#include + +.text + +#define arg1 %rdi +#define arg2 %rsi +#define arg3 %rdx + +#define arg1_low32 %edi + +ENTRY(crc_t10dif_pcl) +.align 16 + + # adjust the 16-bit initial_crc value, scale it to 32 bits + shl $16, arg1_low32 + + # Allocate Stack Space + mov %rsp, %rcx + sub $16*2, %rsp + # align stack to 16 byte boundary + and $~(0x10 - 1), %rsp + + # check if smaller than 256 + cmp $256, arg3 + + # for sizes less than 128, we can't fold 64B at a time... + jl _less_than_128 + + + # load the initial crc value + movd arg1_low32, %xmm10 # initial crc + + # crc value does not need to be byte-reflected, but it needs + # to be moved to the high part of the register. + # because data will be byte-reflected and will align with + # initial crc at correct place. + pslldq $12, %xmm10 + + movdqa SHUF_MASK(%rip), %xmm11 + # receive the initial 64B data, xor the initial crc value + movdqu 16*0(arg2), %xmm0 + movdqu 16*1(arg2), %xmm1 + movdqu 16*2(arg2), %xmm2 + movdqu 16*3(arg2), %xmm3 + movdqu 16*4(arg2), %xmm4 + movdqu 16*5(arg2), %xmm5 + movdqu 16*6(arg2), %xmm6 + movdqu 16*7(arg2), %xmm7 + + pshufb %xmm11, %xmm0 + # XOR the initial_crc value + pxor %xmm10, %xmm0 + pshufb %xmm11, %xmm1 + pshufb %xmm11, %xmm2 + pshufb %xmm11, %xmm3 + pshufb %xmm11, %xmm4 + pshufb %xmm11, %xmm5 + pshufb %xmm11, %xmm6 + pshufb %xmm11, %xmm7 + + movdqa rk3(%rip), %xmm10 #xmm10 has rk3 and rk4 + #imm value of pclmulqdq instruction + #will determine which constant to use + + ################################################################# + # we subtract 256 instead of 128 to save one instruction from the loop + sub $256, arg3 + + # at this section of the code, there is 64*x+y (0<=y<64) bytes of + # buffer. The _fold_64_B_loop will fold 64B at a time + # until we have 64+y Bytes of buffer + + + # fold 64B at a time. This section of the code folds 4 xmm + # registers in parallel +_fold_64_B_loop: + + # update the buffer pointer + add $128, arg2 # buf += 64# + + movdqu 16*0(arg2), %xmm9 + movdqu 16*1(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm0, %xmm8 + movdqa %xmm1, %xmm13 + pclmulqdq $0x0 , %xmm10, %xmm0 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0 , %xmm10, %xmm1 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm0 + xorps %xmm8 , %xmm0 + pxor %xmm12, %xmm1 + xorps %xmm13, %xmm1 + + movdqu 16*2(arg2), %xmm9 + movdqu 16*3(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm2, %xmm8 + movdqa %xmm3, %xmm13 + pclmulqdq $0x0, %xmm10, %xmm2 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0, %xmm10, %xmm3 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm2 + xorps %xmm8 , %xmm2 + pxor %xmm12, %xmm3 + xorps %xmm13, %xmm3 + + movdqu 16*4(arg2), %xmm9 + movdqu 16*5(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm4, %xmm8 + movdqa %xmm5, %xmm13 + pclmulqdq $0x0, %xmm10, %xmm4 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0, %xmm10, %xmm5 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm4 + xorps %xmm8 , %xmm4 + pxor %xmm12, %xmm5 + xorps %xmm13, %xmm5 + + movdqu 16*6(arg2), %xmm9 + movdqu 16*7(arg2), %xmm12 + pshufb %xmm11, %xmm9 + pshufb %xmm11, %xmm12 + movdqa %xmm6 , %xmm8 + movdqa %xmm7 , %xmm13 + pclmulqdq $0x0 , %xmm10, %xmm6 + pclmulqdq $0x11, %xmm10, %xmm8 + pclmulqdq $0x0 , %xmm10, %xmm7 + pclmulqdq $0x11, %xmm10, %xmm13 + pxor %xmm9 , %xmm6 + xorps %xmm8 , %xmm6 + pxor %xmm12, %xmm7 + xorps %xmm13, %xmm7 + + sub $128, arg3 + + # check if there is another 64B in the buffer to be able to fold + jge _fold_64_B_loop + ################################################################## + + + add $128, arg2 + # at this point, the buffer pointer is pointing at the last y Bytes + # of the buffer the 64B of folded data is in 4 of the xmm + # registers: xmm0, xmm1, xmm2, xmm3 + + + # fold the 8 xmm registers to 1 xmm register with different constants + + movdqa rk9(%rip), %xmm10 + movdqa %xmm0, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm0 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm0, %xmm7 + + movdqa rk11(%rip), %xmm10 + movdqa %xmm1, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm1 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm1, %xmm7 + + movdqa rk13(%rip), %xmm10 + movdqa %xmm2, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm2 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm2, %xmm7 + + movdqa rk15(%rip), %xmm10 + movdqa %xmm3, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm3 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm3, %xmm7 + + movdqa rk17(%rip), %xmm10 + movdqa %xmm4, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm4 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm4, %xmm7 + + movdqa rk19(%rip), %xmm10 + movdqa %xmm5, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm5 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + xorps %xmm5, %xmm7 + + movdqa rk1(%rip), %xmm10 #xmm10 has rk1 and rk2 + #imm value of pclmulqdq instruction + #will determine which constant to use + movdqa %xmm6, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm6 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm6, %xmm7 + + + # instead of 64, we add 48 to the loop counter to save 1 instruction + # from the loop instead of a cmp instruction, we use the negative + # flag with the jl instruction + add $128-16, arg3 + jl _final_reduction_for_128 + + # now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 + # and the rest is in memory. We can fold 16 bytes at a time if y>=16 + # continue folding 16B at a time + +_16B_reduction_loop: + movdqa %xmm7, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm7 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + movdqu (arg2), %xmm0 + pshufb %xmm11, %xmm0 + pxor %xmm0 , %xmm7 + add $16, arg2 + sub $16, arg3 + # instead of a cmp instruction, we utilize the flags with the + # jge instruction equivalent of: cmp arg3, 16-16 + # check if there is any more 16B in the buffer to be able to fold + jge _16B_reduction_loop + + #now we have 16+z bytes left to reduce, where 0<= z < 16. + #first, we reduce the data in the xmm7 register + + +_final_reduction_for_128: + # check if any more data to fold. If not, compute the CRC of + # the final 128 bits + add $16, arg3 + je _128_done + + # here we are getting data that is less than 16 bytes. + # since we know that there was data before the pointer, we can + # offset the input pointer before the actual point, to receive + # exactly 16 bytes. after that the registers need to be adjusted. +_get_last_two_xmms: + movdqa %xmm7, %xmm2 + + movdqu -16(arg2, arg3), %xmm1 + pshufb %xmm11, %xmm1 + + # get rid of the extra data that was loaded before + # load the shift constant + lea pshufb_shf_table+16(%rip), %rax + sub arg3, %rax + movdqu (%rax), %xmm0 + + # shift xmm2 to the left by arg3 bytes + pshufb %xmm0, %xmm2 + + # shift xmm7 to the right by 16-arg3 bytes + pxor mask1(%rip), %xmm0 + pshufb %xmm0, %xmm7 + pblendvb %xmm2, %xmm1 #xmm0 is implicit + + # fold 16 Bytes + movdqa %xmm1, %xmm2 + movdqa %xmm7, %xmm8 + pclmulqdq $0x11, %xmm10, %xmm7 + pclmulqdq $0x0 , %xmm10, %xmm8 + pxor %xmm8, %xmm7 + pxor %xmm2, %xmm7 + +_128_done: + # compute crc of a 128-bit value + movdqa rk5(%rip), %xmm10 # rk5 and rk6 in xmm10 + movdqa %xmm7, %xmm0 + + #64b fold + pclmulqdq $0x1, %xmm10, %xmm7 + pslldq $8 , %xmm0 + pxor %xmm0, %xmm7 + + #32b fold + movdqa %xmm7, %xmm0 + + pand mask2(%rip), %xmm0 + + psrldq $12, %xmm7 + pclmulqdq $0x10, %xmm10, %xmm7 + pxor %xmm0, %xmm7 + + #barrett reduction +_barrett: + movdqa rk7(%rip), %xmm10 # rk7 and rk8 in xmm10 + movdqa %xmm7, %xmm0 + pclmulqdq $0x01, %xmm10, %xmm7 + pslldq $4, %xmm7 + pclmulqdq $0x11, %xmm10, %xmm7 + + pslldq $4, %xmm7 + pxor %xmm0, %xmm7 + pextrd $1, %xmm7, %eax + +_cleanup: + # scale the result back to 16 bits + shr $16, %eax + mov %rcx, %rsp + ret + +######################################################################## + +.align 16 +_less_than_128: + + # check if there is enough buffer to be able to fold 16B at a time + cmp $32, arg3 + jl _less_than_32 + movdqa SHUF_MASK(%rip), %xmm11 + + # now if there is, load the constants + movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 + + movd arg1_low32, %xmm0 # get the initial crc value + pslldq $12, %xmm0 # align it to its correct place + movdqu (arg2), %xmm7 # load the plaintext + pshufb %xmm11, %xmm7 # byte-reflect the plaintext + pxor %xmm0, %xmm7 + + + # update the buffer pointer + add $16, arg2 + + # update the counter. subtract 32 instead of 16 to save one + # instruction from the loop + sub $32, arg3 + + jmp _16B_reduction_loop + + +.align 16 +_less_than_32: + # mov initial crc to the return value. this is necessary for + # zero-length buffers. + mov arg1_low32, %eax + test arg3, arg3 + je _cleanup + + movdqa SHUF_MASK(%rip), %xmm11 + + movd arg1_low32, %xmm0 # get the initial crc value + pslldq $12, %xmm0 # align it to its correct place + + cmp $16, arg3 + je _exact_16_left + jl _less_than_16_left + + movdqu (arg2), %xmm7 # load the plaintext + pshufb %xmm11, %xmm7 # byte-reflect the plaintext + pxor %xmm0 , %xmm7 # xor the initial crc value + add $16, arg2 + sub $16, arg3 + movdqa rk1(%rip), %xmm10 # rk1 and rk2 in xmm10 + jmp _get_last_two_xmms + + +.align 16 +_less_than_16_left: + # use stack space to load data less than 16 bytes, zero-out + # the 16B in memory first. + + pxor %xmm1, %xmm1 + mov %rsp, %r11 + movdqa %xmm1, (%r11) + + cmp $4, arg3 + jl _only_less_than_4 + + # backup the counter value + mov arg3, %r9 + cmp $8, arg3 + jl _less_than_8_left + + # load 8 Bytes + mov (arg2), %rax + mov %rax, (%r11) + add $8, %r11 + sub $8, arg3 + add $8, arg2 +_less_than_8_left: + + cmp $4, arg3 + jl _less_than_4_left + + # load 4 Bytes + mov (arg2), %eax + mov %eax, (%r11) + add $4, %r11 + sub $4, arg3 + add $4, arg2 +_less_than_4_left: + + cmp $2, arg3 + jl _less_than_2_left + + # load 2 Bytes + mov (arg2), %ax + mov %ax, (%r11) + add $2, %r11 + sub $2, arg3 + add $2, arg2 +_less_than_2_left: + cmp $1, arg3 + jl _zero_left + + # load 1 Byte + mov (arg2), %al + mov %al, (%r11) +_zero_left: + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + # shl r9, 4 + lea pshufb_shf_table+16(%rip), %rax + sub %r9, %rax + movdqu (%rax), %xmm0 + pxor mask1(%rip), %xmm0 + + pshufb %xmm0, %xmm7 + jmp _128_done + +.align 16 +_exact_16_left: + movdqu (arg2), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + jmp _128_done + +_only_less_than_4: + cmp $3, arg3 + jl _only_less_than_3 + + # load 3 Bytes + mov (arg2), %al + mov %al, (%r11) + + mov 1(arg2), %al + mov %al, 1(%r11) + + mov 2(arg2), %al + mov %al, 2(%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $5, %xmm7 + + jmp _barrett +_only_less_than_3: + cmp $2, arg3 + jl _only_less_than_2 + + # load 2 Bytes + mov (arg2), %al + mov %al, (%r11) + + mov 1(arg2), %al + mov %al, 1(%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $6, %xmm7 + + jmp _barrett +_only_less_than_2: + + # load 1 Byte + mov (arg2), %al + mov %al, (%r11) + + movdqa (%rsp), %xmm7 + pshufb %xmm11, %xmm7 + pxor %xmm0 , %xmm7 # xor the initial crc value + + psrldq $7, %xmm7 + + jmp _barrett + +ENDPROC(crc_t10dif_pcl) + +.data + +# precomputed constants +# these constants are precomputed from the poly: +# 0x8bb70000 (0x8bb7 scaled to 32 bits) +.align 16 +# Q = 0x18BB70000 +# rk1 = 2^(32*3) mod Q << 32 +# rk2 = 2^(32*5) mod Q << 32 +# rk3 = 2^(32*15) mod Q << 32 +# rk4 = 2^(32*17) mod Q << 32 +# rk5 = 2^(32*3) mod Q << 32 +# rk6 = 2^(32*2) mod Q << 32 +# rk7 = floor(2^64/Q) +# rk8 = Q +rk1: +.quad 0x2d56000000000000 +rk2: +.quad 0x06df000000000000 +rk3: +.quad 0x9d9d000000000000 +rk4: +.quad 0x7cf5000000000000 +rk5: +.quad 0x2d56000000000000 +rk6: +.quad 0x1368000000000000 +rk7: +.quad 0x00000001f65a57f8 +rk8: +.quad 0x000000018bb70000 + +rk9: +.quad 0xceae000000000000 +rk10: +.quad 0xbfd6000000000000 +rk11: +.quad 0x1e16000000000000 +rk12: +.quad 0x713c000000000000 +rk13: +.quad 0xf7f9000000000000 +rk14: +.quad 0x80a6000000000000 +rk15: +.quad 0x044c000000000000 +rk16: +.quad 0xe658000000000000 +rk17: +.quad 0xad18000000000000 +rk18: +.quad 0xa497000000000000 +rk19: +.quad 0x6ee3000000000000 +rk20: +.quad 0xe7b5000000000000 + + + +mask1: +.octa 0x80808080808080808080808080808080 +mask2: +.octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF + +SHUF_MASK: +.octa 0x000102030405060708090A0B0C0D0E0F + +pshufb_shf_table: +# use these values for shift constants for the pshufb instruction +# different alignments result in values as shown: +# DDQ 0x008f8e8d8c8b8a898887868584838281 # shl 15 (16-1) / shr1 +# DDQ 0x01008f8e8d8c8b8a8988878685848382 # shl 14 (16-3) / shr2 +# DDQ 0x0201008f8e8d8c8b8a89888786858483 # shl 13 (16-4) / shr3 +# DDQ 0x030201008f8e8d8c8b8a898887868584 # shl 12 (16-4) / shr4 +# DDQ 0x04030201008f8e8d8c8b8a8988878685 # shl 11 (16-5) / shr5 +# DDQ 0x0504030201008f8e8d8c8b8a89888786 # shl 10 (16-6) / shr6 +# DDQ 0x060504030201008f8e8d8c8b8a898887 # shl 9 (16-7) / shr7 +# DDQ 0x07060504030201008f8e8d8c8b8a8988 # shl 8 (16-8) / shr8 +# DDQ 0x0807060504030201008f8e8d8c8b8a89 # shl 7 (16-9) / shr9 +# DDQ 0x090807060504030201008f8e8d8c8b8a # shl 6 (16-10) / shr10 +# DDQ 0x0a090807060504030201008f8e8d8c8b # shl 5 (16-11) / shr11 +# DDQ 0x0b0a090807060504030201008f8e8d8c # shl 4 (16-12) / shr12 +# DDQ 0x0c0b0a090807060504030201008f8e8d # shl 3 (16-13) / shr13 +# DDQ 0x0d0c0b0a090807060504030201008f8e # shl 2 (16-14) / shr14 +# DDQ 0x0e0d0c0b0a090807060504030201008f # shl 1 (16-15) / shr15 +.octa 0x8f8e8d8c8b8a89888786858483828100 +.octa 0x000e0d0c0b0a09080706050403020100 diff -uprN linux-2.6.32/arch/x86/crypto/crct10dif-pclmul_glue.c linux-2.6.32.ovz/arch/x86/crypto/crct10dif-pclmul_glue.c --- linux-2.6.32/arch/x86/crypto/crct10dif-pclmul_glue.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/crct10dif-pclmul_glue.c 2015-03-10 13:24:13.000000000 -0700 @@ -0,0 +1,151 @@ +/* + * Cryptographic API. + * + * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions + * + * Copyright (C) 2013 Intel Corporation + * Author: Tim Chen + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage __u16 crc_t10dif_pcl(__u16 crc, const unsigned char *buf, + size_t len); + +struct chksum_desc_ctx { + __u16 crc; +}; + +/* + * Steps through buffer one byte at at time, calculates reflected + * crc using table. + */ + +static int chksum_init(struct shash_desc *desc) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = 0; + + return 0; +} + +static int chksum_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + if (irq_fpu_usable()) { + kernel_fpu_begin(); + ctx->crc = crc_t10dif_pcl(ctx->crc, data, length); + kernel_fpu_end(); + } else + ctx->crc = crc_t10dif_generic(ctx->crc, data, length); + return 0; +} + +static int chksum_final(struct shash_desc *desc, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + *(__u16 *)out = ctx->crc; + return 0; +} + +static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + if (irq_fpu_usable()) { + kernel_fpu_begin(); + *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len); + kernel_fpu_end(); + } else + *(__u16 *)out = crc_t10dif_generic(*crcp, data, len); + return 0; +} + +static int chksum_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, len, out); +} + +static int chksum_digest(struct shash_desc *desc, const u8 *data, + unsigned int length, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, length, out); +} + +static struct shash_alg alg = { + .digestsize = CRC_T10DIF_DIGEST_SIZE, + .init = chksum_init, + .update = chksum_update, + .final = chksum_final, + .finup = chksum_finup, + .digest = chksum_digest, + .descsize = sizeof(struct chksum_desc_ctx), + .base = { + .cra_name = "crct10dif", + .cra_driver_name = "crct10dif-pclmul", + .cra_priority = 200, + .cra_blocksize = CRC_T10DIF_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static const struct x86_cpu_id crct10dif_cpu_id[] = { + X86_FEATURE_MATCH(X86_FEATURE_PCLMULQDQ), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id); + +static int __init crct10dif_intel_mod_init(void) +{ + if (!x86_match_cpu(crct10dif_cpu_id)) + return -ENODEV; + + return crypto_register_shash(&alg); +} + +static void __exit crct10dif_intel_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(crct10dif_intel_mod_init); +module_exit(crct10dif_intel_mod_fini); + +MODULE_AUTHOR("Tim Chen "); +MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ."); +MODULE_LICENSE("GPL"); + +MODULE_ALIAS("crct10dif"); +MODULE_ALIAS("crct10dif-pclmul"); diff -uprN linux-2.6.32/arch/x86/crypto/fpu.c linux-2.6.32.ovz/arch/x86/crypto/fpu.c --- linux-2.6.32/arch/x86/crypto/fpu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/fpu.c 2015-03-10 13:23:55.000000000 -0700 @@ -149,18 +149,12 @@ static struct crypto_template crypto_fpu .module = THIS_MODULE, }; -static int __init crypto_fpu_module_init(void) +int __init crypto_fpu_init(void) { return crypto_register_template(&crypto_fpu_tmpl); } -static void __exit crypto_fpu_module_exit(void) +void __exit crypto_fpu_exit(void) { crypto_unregister_template(&crypto_fpu_tmpl); } - -module_init(crypto_fpu_module_init); -module_exit(crypto_fpu_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("FPU block cipher wrapper"); diff -uprN linux-2.6.32/arch/x86/crypto/ghash-clmulni-intel_asm.S linux-2.6.32.ovz/arch/x86/crypto/ghash-clmulni-intel_asm.S --- linux-2.6.32/arch/x86/crypto/ghash-clmulni-intel_asm.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/ghash-clmulni-intel_asm.S 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,161 @@ +/* + * Accelerated GHASH implementation with Intel PCLMULQDQ-NI + * instructions. This file contains accelerated part of ghash + * implementation. More information about PCLMULQDQ can be found at: + * + * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ + * + * Copyright (c) 2009 Intel Corp. + * Author: Huang Ying + * Vinodh Gopal + * Erdinc Ozturk + * Deniz Karakoyunlu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include +#include + +.data + +.align 16 +.Lbswap_mask: + .octa 0x000102030405060708090a0b0c0d0e0f +.Lpoly: + .octa 0xc2000000000000000000000000000001 +.Ltwo_one: + .octa 0x00000001000000000000000000000001 + +#define DATA %xmm0 +#define SHASH %xmm1 +#define T1 %xmm2 +#define T2 %xmm3 +#define T3 %xmm4 +#define BSWAP %xmm5 +#define IN1 %xmm6 + +.text + +/* + * __clmul_gf128mul_ble: internal ABI + * input: + * DATA: operand1 + * SHASH: operand2, hash_key << 1 mod poly + * output: + * DATA: operand1 * operand2 mod poly + * changed: + * T1 + * T2 + * T3 + */ +__clmul_gf128mul_ble: + movaps DATA, T1 + pshufd $0b01001110, DATA, T2 + pshufd $0b01001110, SHASH, T3 + pxor DATA, T2 + pxor SHASH, T3 + + PCLMULQDQ 0x00 SHASH DATA # DATA = a0 * b0 + PCLMULQDQ 0x11 SHASH T1 # T1 = a1 * b1 + PCLMULQDQ 0x00 T3 T2 # T2 = (a1 + a0) * (b1 + b0) + pxor DATA, T2 + pxor T1, T2 # T2 = a0 * b1 + a1 * b0 + + movaps T2, T3 + pslldq $8, T3 + psrldq $8, T2 + pxor T3, DATA + pxor T2, T1 # is result of + # carry-less multiplication + + # first phase of the reduction + movaps DATA, T3 + psllq $1, T3 + pxor DATA, T3 + psllq $5, T3 + pxor DATA, T3 + psllq $57, T3 + movaps T3, T2 + pslldq $8, T2 + psrldq $8, T3 + pxor T2, DATA + pxor T3, T1 + + # second phase of the reduction + movaps DATA, T2 + psrlq $5, T2 + pxor DATA, T2 + psrlq $1, T2 + pxor DATA, T2 + psrlq $1, T2 + pxor T2, T1 + pxor T1, DATA + ret +ENDPROC(__clmul_gf128mul_ble) + +/* void clmul_ghash_mul(char *dst, const be128 *shash) */ +ENTRY(clmul_ghash_mul) + movups (%rdi), DATA + movups (%rsi), SHASH + movaps .Lbswap_mask, BSWAP + PSHUFB_XMM BSWAP DATA + call __clmul_gf128mul_ble + PSHUFB_XMM BSWAP DATA + movups DATA, (%rdi) + ret +ENDPROC(clmul_ghash_mul) + +/* + * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, + * const be128 *shash); + */ +ENTRY(clmul_ghash_update) + cmp $16, %rdx + jb .Lupdate_just_ret # check length + movaps .Lbswap_mask, BSWAP + movups (%rdi), DATA + movups (%rcx), SHASH + PSHUFB_XMM BSWAP DATA +.align 4 +.Lupdate_loop: + movups (%rsi), IN1 + PSHUFB_XMM BSWAP IN1 + pxor IN1, DATA + call __clmul_gf128mul_ble + sub $16, %rdx + add $16, %rsi + cmp $16, %rdx + jge .Lupdate_loop + PSHUFB_XMM BSWAP DATA + movups DATA, (%rdi) +.Lupdate_just_ret: + ret +ENDPROC(clmul_ghash_update) + +/* + * void clmul_ghash_setkey(be128 *shash, const u8 *key); + * + * Calculate hash_key << 1 mod poly + */ +ENTRY(clmul_ghash_setkey) + movaps .Lbswap_mask, BSWAP + movups (%rsi), %xmm0 + PSHUFB_XMM BSWAP %xmm0 + movaps %xmm0, %xmm1 + psllq $1, %xmm0 + psrlq $63, %xmm1 + movaps %xmm1, %xmm2 + pslldq $8, %xmm1 + psrldq $8, %xmm2 + por %xmm1, %xmm0 + # reduction + pshufd $0b00100100, %xmm2, %xmm1 + pcmpeqd .Ltwo_one, %xmm1 + pand .Lpoly, %xmm1 + pxor %xmm1, %xmm0 + movups %xmm0, (%rdi) + ret +ENDPROC(clmul_ghash_setkey) diff -uprN linux-2.6.32/arch/x86/crypto/ghash-clmulni-intel_glue.c linux-2.6.32.ovz/arch/x86/crypto/ghash-clmulni-intel_glue.c --- linux-2.6.32/arch/x86/crypto/ghash-clmulni-intel_glue.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/ghash-clmulni-intel_glue.c 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,334 @@ +/* + * Accelerated GHASH implementation with Intel PCLMULQDQ-NI + * instructions. This file contains glue code. + * + * Copyright (c) 2009 Intel Corp. + * Author: Huang Ying + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define GHASH_BLOCK_SIZE 16 +#define GHASH_DIGEST_SIZE 16 + +void clmul_ghash_mul(char *dst, const be128 *shash); + +void clmul_ghash_update(char *dst, const char *src, unsigned int srclen, + const be128 *shash); + +void clmul_ghash_setkey(be128 *shash, const u8 *key); + +struct ghash_async_ctx { + struct cryptd_ahash *cryptd_tfm; +}; + +struct ghash_ctx { + be128 shash; +}; + +struct ghash_desc_ctx { + u8 buffer[GHASH_BLOCK_SIZE]; + u32 bytes; +}; + +static int ghash_init(struct shash_desc *desc) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + + memset(dctx, 0, sizeof(*dctx)); + + return 0; +} + +static int ghash_setkey(struct crypto_shash *tfm, + const u8 *key, unsigned int keylen) +{ + struct ghash_ctx *ctx = crypto_shash_ctx(tfm); + + if (keylen != GHASH_BLOCK_SIZE) { + crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + + clmul_ghash_setkey(&ctx->shash, key); + + return 0; +} + +static int ghash_update(struct shash_desc *desc, + const u8 *src, unsigned int srclen) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); + u8 *dst = dctx->buffer; + + kernel_fpu_begin(); + if (dctx->bytes) { + int n = min(srclen, dctx->bytes); + u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes); + + dctx->bytes -= n; + srclen -= n; + + while (n--) + *pos++ ^= *src++; + + if (!dctx->bytes) + clmul_ghash_mul(dst, &ctx->shash); + } + + clmul_ghash_update(dst, src, srclen, &ctx->shash); + kernel_fpu_end(); + + if (srclen & 0xf) { + src += srclen - (srclen & 0xf); + srclen &= 0xf; + dctx->bytes = GHASH_BLOCK_SIZE - srclen; + while (srclen--) + *dst++ ^= *src++; + } + + return 0; +} + +static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx) +{ + u8 *dst = dctx->buffer; + + if (dctx->bytes) { + u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes); + + while (dctx->bytes--) + *tmp++ ^= 0; + + kernel_fpu_begin(); + clmul_ghash_mul(dst, &ctx->shash); + kernel_fpu_end(); + } + + dctx->bytes = 0; +} + +static int ghash_final(struct shash_desc *desc, u8 *dst) +{ + struct ghash_desc_ctx *dctx = shash_desc_ctx(desc); + struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); + u8 *buf = dctx->buffer; + + ghash_flush(ctx, dctx); + memcpy(dst, buf, GHASH_BLOCK_SIZE); + + return 0; +} + +static struct shash_alg ghash_alg = { + .digestsize = GHASH_DIGEST_SIZE, + .init = ghash_init, + .update = ghash_update, + .final = ghash_final, + .setkey = ghash_setkey, + .descsize = sizeof(struct ghash_desc_ctx), + .base = { + .cra_name = "__ghash", + .cra_driver_name = "__ghash-pclmulqdqni", + .cra_priority = 0, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = GHASH_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct ghash_ctx), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ghash_alg.base.cra_list), + }, +}; + +static int ghash_async_init(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + if (!irq_fpu_usable()) { + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_init(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); + + desc->tfm = child; + desc->flags = req->base.flags; + return crypto_shash_init(desc); + } +} + +static int ghash_async_update(struct ahash_request *req) +{ + struct ahash_request *cryptd_req = ahash_request_ctx(req); + + if (!irq_fpu_usable()) { + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_update(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + return shash_ahash_update(req, desc); + } +} + +static int ghash_async_final(struct ahash_request *req) +{ + struct ahash_request *cryptd_req = ahash_request_ctx(req); + + if (!irq_fpu_usable()) { + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_final(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + return crypto_shash_final(desc, req->result); + } +} + +static int ghash_async_digest(struct ahash_request *req) +{ + struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct ahash_request *cryptd_req = ahash_request_ctx(req); + struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; + + if (!irq_fpu_usable()) { + memcpy(cryptd_req, req, sizeof(*req)); + ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); + return crypto_ahash_digest(cryptd_req); + } else { + struct shash_desc *desc = cryptd_shash_desc(cryptd_req); + struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); + + desc->tfm = child; + desc->flags = req->base.flags; + return shash_ahash_digest(req, desc); + } +} + +static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key, + unsigned int keylen) +{ + struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); + struct crypto_ahash *child = &ctx->cryptd_tfm->base; + int err; + + crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm) + & CRYPTO_TFM_REQ_MASK); + err = crypto_ahash_setkey(child, key, keylen); + crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child) + & CRYPTO_TFM_RES_MASK); + + return err; +} + +static int ghash_async_init_tfm(struct crypto_tfm *tfm) +{ + struct cryptd_ahash *cryptd_tfm; + struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); + + cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0); + if (IS_ERR(cryptd_tfm)) + return PTR_ERR(cryptd_tfm); + ctx->cryptd_tfm = cryptd_tfm; + crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm), + sizeof(struct ahash_request) + + crypto_ahash_reqsize(&cryptd_tfm->base)); + + return 0; +} + +static void ghash_async_exit_tfm(struct crypto_tfm *tfm) +{ + struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm); + + cryptd_free_ahash(ctx->cryptd_tfm); +} + +static struct ahash_alg ghash_async_alg = { + .init = ghash_async_init, + .update = ghash_async_update, + .final = ghash_async_final, + .setkey = ghash_async_setkey, + .digest = ghash_async_digest, + .halg = { + .digestsize = GHASH_DIGEST_SIZE, + .base = { + .cra_name = "ghash", + .cra_driver_name = "ghash-clmulni", + .cra_priority = 400, + .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC, + .cra_blocksize = GHASH_BLOCK_SIZE, + .cra_type = &crypto_ahash_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(ghash_async_alg.halg.base.cra_list), + .cra_init = ghash_async_init_tfm, + .cra_exit = ghash_async_exit_tfm, + }, + }, +}; + +static int __init ghash_pclmulqdqni_mod_init(void) +{ + int err; + + if (!cpu_has_pclmulqdq) { + printk(KERN_INFO "Intel PCLMULQDQ-NI instructions are not" + " detected.\n"); + return -ENODEV; + } + + err = crypto_register_shash(&ghash_alg); + if (err) + goto err_out; + err = crypto_register_ahash(&ghash_async_alg); + if (err) + goto err_shash; + + return 0; + +err_shash: + crypto_unregister_shash(&ghash_alg); +err_out: + return err; +} + +static void __exit ghash_pclmulqdqni_mod_exit(void) +{ + crypto_unregister_ahash(&ghash_async_alg); + crypto_unregister_shash(&ghash_alg); +} + +module_init(ghash_pclmulqdqni_mod_init); +module_exit(ghash_pclmulqdqni_mod_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("GHASH Message Digest Algorithm, " + "acclerated by PCLMULQDQ-NI"); +MODULE_ALIAS("ghash"); diff -uprN linux-2.6.32/arch/x86/crypto/glue_helper-asm-avx.S linux-2.6.32.ovz/arch/x86/crypto/glue_helper-asm-avx.S --- linux-2.6.32/arch/x86/crypto/glue_helper-asm-avx.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/glue_helper-asm-avx.S 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,150 @@ +/* + * Shared glue code for 128bit block ciphers, AVX assembler macros + * + * Copyright © 2012-2013 Jussi Kivilinna + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#define load_8way(src, x0, x1, x2, x3, x4, x5, x6, x7) \ + vmovdqu (0*16)(src), x0; \ + vmovdqu (1*16)(src), x1; \ + vmovdqu (2*16)(src), x2; \ + vmovdqu (3*16)(src), x3; \ + vmovdqu (4*16)(src), x4; \ + vmovdqu (5*16)(src), x5; \ + vmovdqu (6*16)(src), x6; \ + vmovdqu (7*16)(src), x7; + +#define store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ + vmovdqu x0, (0*16)(dst); \ + vmovdqu x1, (1*16)(dst); \ + vmovdqu x2, (2*16)(dst); \ + vmovdqu x3, (3*16)(dst); \ + vmovdqu x4, (4*16)(dst); \ + vmovdqu x5, (5*16)(dst); \ + vmovdqu x6, (6*16)(dst); \ + vmovdqu x7, (7*16)(dst); + +#define store_cbc_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ + vpxor (0*16)(src), x1, x1; \ + vpxor (1*16)(src), x2, x2; \ + vpxor (2*16)(src), x3, x3; \ + vpxor (3*16)(src), x4, x4; \ + vpxor (4*16)(src), x5, x5; \ + vpxor (5*16)(src), x6, x6; \ + vpxor (6*16)(src), x7, x7; \ + store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); + +#define inc_le128(x, minus_one, tmp) \ + vpcmpeqq minus_one, x, tmp; \ + vpsubq minus_one, x, x; \ + vpslldq $8, tmp, tmp; \ + vpsubq tmp, x, x; + +#define load_ctr_8way(iv, bswap, x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2) \ + vpcmpeqd t0, t0, t0; \ + vpsrldq $8, t0, t0; /* low: -1, high: 0 */ \ + vmovdqa bswap, t1; \ + \ + /* load IV and byteswap */ \ + vmovdqu (iv), x7; \ + vpshufb t1, x7, x0; \ + \ + /* construct IVs */ \ + inc_le128(x7, t0, t2); \ + vpshufb t1, x7, x1; \ + inc_le128(x7, t0, t2); \ + vpshufb t1, x7, x2; \ + inc_le128(x7, t0, t2); \ + vpshufb t1, x7, x3; \ + inc_le128(x7, t0, t2); \ + vpshufb t1, x7, x4; \ + inc_le128(x7, t0, t2); \ + vpshufb t1, x7, x5; \ + inc_le128(x7, t0, t2); \ + vpshufb t1, x7, x6; \ + inc_le128(x7, t0, t2); \ + vmovdqa x7, t2; \ + vpshufb t1, x7, x7; \ + inc_le128(t2, t0, t1); \ + vmovdqu t2, (iv); + +#define store_ctr_8way(src, dst, x0, x1, x2, x3, x4, x5, x6, x7) \ + vpxor (0*16)(src), x0, x0; \ + vpxor (1*16)(src), x1, x1; \ + vpxor (2*16)(src), x2, x2; \ + vpxor (3*16)(src), x3, x3; \ + vpxor (4*16)(src), x4, x4; \ + vpxor (5*16)(src), x5, x5; \ + vpxor (6*16)(src), x6, x6; \ + vpxor (7*16)(src), x7, x7; \ + store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); + +#define gf128mul_x_ble(iv, mask, tmp) \ + vpsrad $31, iv, tmp; \ + vpaddq iv, iv, iv; \ + vpshufd $0x13, tmp, tmp; \ + vpand mask, tmp, tmp; \ + vpxor tmp, iv, iv; + +#define load_xts_8way(iv, src, dst, x0, x1, x2, x3, x4, x5, x6, x7, tiv, t0, \ + t1, xts_gf128mul_and_shl1_mask) \ + vmovdqa xts_gf128mul_and_shl1_mask, t0; \ + \ + /* load IV */ \ + vmovdqu (iv), tiv; \ + vpxor (0*16)(src), tiv, x0; \ + vmovdqu tiv, (0*16)(dst); \ + \ + /* construct and store IVs, also xor with source */ \ + gf128mul_x_ble(tiv, t0, t1); \ + vpxor (1*16)(src), tiv, x1; \ + vmovdqu tiv, (1*16)(dst); \ + \ + gf128mul_x_ble(tiv, t0, t1); \ + vpxor (2*16)(src), tiv, x2; \ + vmovdqu tiv, (2*16)(dst); \ + \ + gf128mul_x_ble(tiv, t0, t1); \ + vpxor (3*16)(src), tiv, x3; \ + vmovdqu tiv, (3*16)(dst); \ + \ + gf128mul_x_ble(tiv, t0, t1); \ + vpxor (4*16)(src), tiv, x4; \ + vmovdqu tiv, (4*16)(dst); \ + \ + gf128mul_x_ble(tiv, t0, t1); \ + vpxor (5*16)(src), tiv, x5; \ + vmovdqu tiv, (5*16)(dst); \ + \ + gf128mul_x_ble(tiv, t0, t1); \ + vpxor (6*16)(src), tiv, x6; \ + vmovdqu tiv, (6*16)(dst); \ + \ + gf128mul_x_ble(tiv, t0, t1); \ + vpxor (7*16)(src), tiv, x7; \ + vmovdqu tiv, (7*16)(dst); \ + \ + gf128mul_x_ble(tiv, t0, t1); \ + vmovdqu tiv, (iv); + +#define store_xts_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7) \ + vpxor (0*16)(dst), x0, x0; \ + vpxor (1*16)(dst), x1, x1; \ + vpxor (2*16)(dst), x2, x2; \ + vpxor (3*16)(dst), x3, x3; \ + vpxor (4*16)(dst), x4, x4; \ + vpxor (5*16)(dst), x5, x5; \ + vpxor (6*16)(dst), x6, x6; \ + vpxor (7*16)(dst), x7, x7; \ + store_8way(dst, x0, x1, x2, x3, x4, x5, x6, x7); diff -uprN linux-2.6.32/arch/x86/crypto/glue_helper.c linux-2.6.32.ovz/arch/x86/crypto/glue_helper.c --- linux-2.6.32/arch/x86/crypto/glue_helper.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/glue_helper.c 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,402 @@ +/* + * Shared glue code for 128bit block ciphers + * + * Copyright © 2012-2013 Jussi Kivilinna + * + * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: + * Copyright (c) 2006 Herbert Xu + * CTR part based on code (crypto/ctr.c) by: + * (C) Copyright IBM Corp. 2007 - Joy Latten + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include +#include +#include +#include +#include +#include + +static int __glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + void *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = 128 / 8; + unsigned int nbytes, i, func_bytes; + bool fpu_enabled = false; + int err; + + err = blkcipher_walk_virt(desc, walk); + + while ((nbytes = walk->nbytes)) { + u8 *wsrc = walk->src.virt.addr; + u8 *wdst = walk->dst.virt.addr; + + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + desc, fpu_enabled, nbytes); + + for (i = 0; i < gctx->num_funcs; i++) { + func_bytes = bsize * gctx->funcs[i].num_blocks; + + /* Process multi-block batch */ + if (nbytes >= func_bytes) { + do { + gctx->funcs[i].fn_u.ecb(ctx, wdst, + wsrc); + + wsrc += func_bytes; + wdst += func_bytes; + nbytes -= func_bytes; + } while (nbytes >= func_bytes); + + if (nbytes < bsize) + goto done; + } + } + +done: + err = blkcipher_walk_done(desc, walk, nbytes); + } + + glue_fpu_end(fpu_enabled); + return err; +} + +int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return __glue_ecb_crypt_128bit(gctx, desc, &walk); +} +EXPORT_SYMBOL_GPL(glue_ecb_crypt_128bit); + +static unsigned int __glue_cbc_encrypt_128bit(const common_glue_func_t fn, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + void *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = 128 / 8; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 *iv = (u128 *)walk->iv; + + do { + u128_xor(dst, src, iv); + fn(ctx, (u8 *)dst, (u8 *)dst); + iv = dst; + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + + u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); + return nbytes; +} + +int glue_cbc_encrypt_128bit(const common_glue_func_t fn, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __glue_cbc_encrypt_128bit(fn, desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} +EXPORT_SYMBOL_GPL(glue_cbc_encrypt_128bit); + +static unsigned int +__glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + void *ctx = crypto_blkcipher_ctx(desc->tfm); + const unsigned int bsize = 128 / 8; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 last_iv; + unsigned int num_blocks, func_bytes; + unsigned int i; + + /* Start of the last block. */ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; + + last_iv = *src; + + for (i = 0; i < gctx->num_funcs; i++) { + num_blocks = gctx->funcs[i].num_blocks; + func_bytes = bsize * num_blocks; + + /* Process multi-block batch */ + if (nbytes >= func_bytes) { + do { + nbytes -= func_bytes - bsize; + src -= num_blocks - 1; + dst -= num_blocks - 1; + + gctx->funcs[i].fn_u.cbc(ctx, dst, src); + + nbytes -= bsize; + if (nbytes < bsize) + goto done; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } while (nbytes >= func_bytes); + + if (nbytes < bsize) + goto done; + } + } + +done: + u128_xor(dst, dst, (u128 *)walk->iv); + *(u128 *)walk->iv = last_iv; + + return nbytes; +} + +int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + const unsigned int bsize = 128 / 8; + bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + desc, fpu_enabled, nbytes); + nbytes = __glue_cbc_decrypt_128bit(gctx, desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + glue_fpu_end(fpu_enabled); + return err; +} +EXPORT_SYMBOL_GPL(glue_cbc_decrypt_128bit); + +static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + void *ctx = crypto_blkcipher_ctx(desc->tfm); + u8 *src = (u8 *)walk->src.virt.addr; + u8 *dst = (u8 *)walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + le128 ctrblk; + u128 tmp; + + be128_to_le128(&ctrblk, (be128 *)walk->iv); + + memcpy(&tmp, src, nbytes); + fn_ctr(ctx, &tmp, &tmp, &ctrblk); + memcpy(dst, &tmp, nbytes); + + le128_to_be128((be128 *)walk->iv, &ctrblk); +} +EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit); + +static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + const unsigned int bsize = 128 / 8; + void *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + le128 ctrblk; + unsigned int num_blocks, func_bytes; + unsigned int i; + + be128_to_le128(&ctrblk, (be128 *)walk->iv); + + /* Process multi-block batch */ + for (i = 0; i < gctx->num_funcs; i++) { + num_blocks = gctx->funcs[i].num_blocks; + func_bytes = bsize * num_blocks; + + if (nbytes >= func_bytes) { + do { + gctx->funcs[i].fn_u.ctr(ctx, dst, src, &ctrblk); + + src += num_blocks; + dst += num_blocks; + nbytes -= func_bytes; + } while (nbytes >= func_bytes); + + if (nbytes < bsize) + goto done; + } + } + +done: + le128_to_be128((be128 *)walk->iv, &ctrblk); + return nbytes; +} + +int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + const unsigned int bsize = 128 / 8; + bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, bsize); + + while ((nbytes = walk.nbytes) >= bsize) { + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + desc, fpu_enabled, nbytes); + nbytes = __glue_ctr_crypt_128bit(gctx, desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + glue_fpu_end(fpu_enabled); + + if (walk.nbytes) { + glue_ctr_crypt_final_128bit( + gctx->funcs[gctx->num_funcs - 1].fn_u.ctr, desc, &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + + return err; +} +EXPORT_SYMBOL_GPL(glue_ctr_crypt_128bit); + +static unsigned int __glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, + void *ctx, + struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + const unsigned int bsize = 128 / 8; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + unsigned int num_blocks, func_bytes; + unsigned int i; + + /* Process multi-block batch */ + for (i = 0; i < gctx->num_funcs; i++) { + num_blocks = gctx->funcs[i].num_blocks; + func_bytes = bsize * num_blocks; + + if (nbytes >= func_bytes) { + do { + gctx->funcs[i].fn_u.xts(ctx, dst, src, + (le128 *)walk->iv); + + src += num_blocks; + dst += num_blocks; + nbytes -= func_bytes; + } while (nbytes >= func_bytes); + + if (nbytes < bsize) + goto done; + } + } + +done: + return nbytes; +} + +/* for implementations implementing faster XTS IV generator */ +int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes, + void (*tweak_fn)(void *ctx, u8 *dst, const u8 *src), + void *tweak_ctx, void *crypt_ctx) +{ + const unsigned int bsize = 128 / 8; + bool fpu_enabled = false; + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + + err = blkcipher_walk_virt(desc, &walk); + nbytes = walk.nbytes; + if (!nbytes) + return err; + + /* set minimum length to bsize, for tweak_fn */ + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + desc, fpu_enabled, + nbytes < bsize ? bsize : nbytes); + + /* calculate first value of T */ + tweak_fn(tweak_ctx, walk.iv, walk.iv); + + while (nbytes) { + nbytes = __glue_xts_crypt_128bit(gctx, crypt_ctx, desc, &walk); + + err = blkcipher_walk_done(desc, &walk, nbytes); + nbytes = walk.nbytes; + } + + glue_fpu_end(fpu_enabled); + + return err; +} +EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit); + +void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, le128 *iv, + common_glue_func_t fn) +{ + le128 ivblk = *iv; + + /* generate next IV */ + le128_gf128mul_x_ble(iv, &ivblk); + + /* CC <- T xor C */ + u128_xor(dst, src, (u128 *)&ivblk); + + /* PP <- D(Key2,CC) */ + fn(ctx, (u8 *)dst, (u8 *)dst); + + /* P <- T xor PP */ + u128_xor(dst, dst, (u128 *)&ivblk); +} +EXPORT_SYMBOL_GPL(glue_xts_crypt_128bit_one); + +MODULE_LICENSE("GPL"); diff -uprN linux-2.6.32/arch/x86/crypto/Makefile linux-2.6.32.ovz/arch/x86/crypto/Makefile --- linux-2.6.32/arch/x86/crypto/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/Makefile 2015-06-23 04:16:16.000000000 -0700 @@ -2,7 +2,8 @@ # Arch-specific CryptoAPI modules. # -obj-$(CONFIG_CRYPTO_FPU) += fpu.o +obj-$(CONFIG_CRYPTO_ABLK_HELPER_X86) += ablk_helper.o +obj-$(CONFIG_CRYPTO_GLUE_HELPER_X86) += glue_helper.o obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o @@ -12,8 +13,12 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o +obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o +obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o + +obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o aes-i586-y := aes-i586-asm_32.o aes_glue.o twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o @@ -23,4 +28,18 @@ aes-x86_64-y := aes-x86_64-asm_64.o aes_ twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o -aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o +aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o + +ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o +crc32c-intel-y := crc32c-intel_glue.o +crc32c-intel-$(CONFIG_64BIT) += crc32c-pcl-intel-asm_64.o + +# enable AVX support only when $(AS) can actually assemble the instructions +ifeq ($(call as-instr,vpxor %xmm0$(comma)%xmm1$(comma)%xmm2,yes,no),yes) +AFLAGS_sha1_ssse3_asm.o += -DSHA1_ENABLE_AVX_SUPPORT +CFLAGS_sha1_ssse3_glue.o += -DSHA1_ENABLE_AVX_SUPPORT +endif +sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o + +crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o +crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o diff -uprN linux-2.6.32/arch/x86/crypto/sha1_ssse3_asm.S linux-2.6.32.ovz/arch/x86/crypto/sha1_ssse3_asm.S --- linux-2.6.32/arch/x86/crypto/sha1_ssse3_asm.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/sha1_ssse3_asm.S 2015-06-23 04:16:16.000000000 -0700 @@ -0,0 +1,558 @@ +/* + * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental + * SSE3 instruction set extensions introduced in Intel Core Microarchitecture + * processors. CPUs supporting Intel(R) AVX extensions will get an additional + * boost. + * + * This work was inspired by the vectorized implementation of Dean Gaudet. + * Additional information on it can be found at: + * http://www.arctic.org/~dean/crypto/sha1.html + * + * It was improved upon with more efficient vectorization of the message + * scheduling. This implementation has also been optimized for all current and + * several future generations of Intel CPUs. + * + * See this article for more information about the implementation details: + * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ + * + * Copyright (C) 2010, Intel Corp. + * Authors: Maxim Locktyukhin + * Ronen Zohar + * + * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: + * Author: Mathias Krause + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#define CTX %rdi // arg1 +#define BUF %rsi // arg2 +#define CNT %rdx // arg3 + +#define REG_A %ecx +#define REG_B %esi +#define REG_C %edi +#define REG_D %ebp +#define REG_E %edx + +#define REG_T1 %eax +#define REG_T2 %ebx + +#define K_BASE %r8 +#define HASH_PTR %r9 +#define BUFFER_PTR %r10 +#define BUFFER_END %r11 + +#define W_TMP1 %xmm0 +#define W_TMP2 %xmm9 + +#define W0 %xmm1 +#define W4 %xmm2 +#define W8 %xmm3 +#define W12 %xmm4 +#define W16 %xmm5 +#define W20 %xmm6 +#define W24 %xmm7 +#define W28 %xmm8 + +#define XMM_SHUFB_BSWAP %xmm10 + +/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */ +#define WK(t) (((t) & 15) * 4)(%rsp) +#define W_PRECALC_AHEAD 16 + +/* + * This macro implements the SHA-1 function's body for single 64-byte block + * param: function's name + */ +.macro SHA1_VECTOR_ASM name + .global \name + .type \name, @function + .align 32 +\name: + push %rbx + push %rbp + push %r12 + + mov %rsp, %r12 + sub $64, %rsp # allocate workspace + and $~15, %rsp # align stack + + mov CTX, HASH_PTR + mov BUF, BUFFER_PTR + + shl $6, CNT # multiply by 64 + add BUF, CNT + mov CNT, BUFFER_END + + lea K_XMM_AR(%rip), K_BASE + xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP + + SHA1_PIPELINED_MAIN_BODY + + # cleanup workspace + mov $8, %ecx + mov %rsp, %rdi + xor %rax, %rax + rep stosq + + mov %r12, %rsp # deallocate workspace + + pop %r12 + pop %rbp + pop %rbx + ret + + .size \name, .-\name +.endm + +/* + * This macro implements 80 rounds of SHA-1 for one 64-byte block + */ +.macro SHA1_PIPELINED_MAIN_BODY + INIT_REGALLOC + + mov (HASH_PTR), A + mov 4(HASH_PTR), B + mov 8(HASH_PTR), C + mov 12(HASH_PTR), D + mov 16(HASH_PTR), E + + .set i, 0 + .rept W_PRECALC_AHEAD + W_PRECALC i + .set i, (i+1) + .endr + +.align 4 +1: + RR F1,A,B,C,D,E,0 + RR F1,D,E,A,B,C,2 + RR F1,B,C,D,E,A,4 + RR F1,E,A,B,C,D,6 + RR F1,C,D,E,A,B,8 + + RR F1,A,B,C,D,E,10 + RR F1,D,E,A,B,C,12 + RR F1,B,C,D,E,A,14 + RR F1,E,A,B,C,D,16 + RR F1,C,D,E,A,B,18 + + RR F2,A,B,C,D,E,20 + RR F2,D,E,A,B,C,22 + RR F2,B,C,D,E,A,24 + RR F2,E,A,B,C,D,26 + RR F2,C,D,E,A,B,28 + + RR F2,A,B,C,D,E,30 + RR F2,D,E,A,B,C,32 + RR F2,B,C,D,E,A,34 + RR F2,E,A,B,C,D,36 + RR F2,C,D,E,A,B,38 + + RR F3,A,B,C,D,E,40 + RR F3,D,E,A,B,C,42 + RR F3,B,C,D,E,A,44 + RR F3,E,A,B,C,D,46 + RR F3,C,D,E,A,B,48 + + RR F3,A,B,C,D,E,50 + RR F3,D,E,A,B,C,52 + RR F3,B,C,D,E,A,54 + RR F3,E,A,B,C,D,56 + RR F3,C,D,E,A,B,58 + + add $64, BUFFER_PTR # move to the next 64-byte block + cmp BUFFER_END, BUFFER_PTR # if the current is the last one use + cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun + + RR F4,A,B,C,D,E,60 + RR F4,D,E,A,B,C,62 + RR F4,B,C,D,E,A,64 + RR F4,E,A,B,C,D,66 + RR F4,C,D,E,A,B,68 + + RR F4,A,B,C,D,E,70 + RR F4,D,E,A,B,C,72 + RR F4,B,C,D,E,A,74 + RR F4,E,A,B,C,D,76 + RR F4,C,D,E,A,B,78 + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), B + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + RESTORE_RENAMED_REGS + cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end + jne 1b +.endm + +.macro INIT_REGALLOC + .set A, REG_A + .set B, REG_B + .set C, REG_C + .set D, REG_D + .set E, REG_E + .set T1, REG_T1 + .set T2, REG_T2 +.endm + +.macro RESTORE_RENAMED_REGS + # order is important (REG_C is where it should be) + mov B, REG_B + mov D, REG_D + mov A, REG_A + mov E, REG_E +.endm + +.macro SWAP_REG_NAMES a, b + .set _T, \a + .set \a, \b + .set \b, _T +.endm + +.macro F1 b, c, d + mov \c, T1 + SWAP_REG_NAMES \c, T1 + xor \d, T1 + and \b, T1 + xor \d, T1 +.endm + +.macro F2 b, c, d + mov \d, T1 + SWAP_REG_NAMES \d, T1 + xor \c, T1 + xor \b, T1 +.endm + +.macro F3 b, c ,d + mov \c, T1 + SWAP_REG_NAMES \c, T1 + mov \b, T2 + or \b, T1 + and \c, T2 + and \d, T1 + or T2, T1 +.endm + +.macro F4 b, c, d + F2 \b, \c, \d +.endm + +.macro UPDATE_HASH hash, val + add \hash, \val + mov \val, \hash +.endm + +/* + * RR does two rounds of SHA-1 back to back with W[] pre-calc + * t1 = F(b, c, d); e += w(i) + * e += t1; b <<= 30; d += w(i+1); + * t1 = F(a, b, c); + * d += t1; a <<= 5; + * e += a; + * t1 = e; a >>= 7; + * t1 <<= 5; + * d += t1; + */ +.macro RR F, a, b, c, d, e, round + add WK(\round), \e + \F \b, \c, \d # t1 = F(b, c, d); + W_PRECALC (\round + W_PRECALC_AHEAD) + rol $30, \b + add T1, \e + add WK(\round + 1), \d + + \F \a, \b, \c + W_PRECALC (\round + W_PRECALC_AHEAD + 1) + rol $5, \a + add \a, \e + add T1, \d + ror $7, \a # (a <>r 7) => a <= 80) && (i < (80 + W_PRECALC_AHEAD)))) + .set i, ((\r) % 80) # pre-compute for the next iteration + .if (i == 0) + W_PRECALC_RESET + .endif + W_PRECALC_00_15 + .elseif (i<32) + W_PRECALC_16_31 + .elseif (i < 80) // rounds 32-79 + W_PRECALC_32_79 + .endif +.endm + +.macro W_PRECALC_RESET + .set W, W0 + .set W_minus_04, W4 + .set W_minus_08, W8 + .set W_minus_12, W12 + .set W_minus_16, W16 + .set W_minus_20, W20 + .set W_minus_24, W24 + .set W_minus_28, W28 + .set W_minus_32, W +.endm + +.macro W_PRECALC_ROTATE + .set W_minus_32, W_minus_28 + .set W_minus_28, W_minus_24 + .set W_minus_24, W_minus_20 + .set W_minus_20, W_minus_16 + .set W_minus_16, W_minus_12 + .set W_minus_12, W_minus_08 + .set W_minus_08, W_minus_04 + .set W_minus_04, W + .set W, W_minus_32 +.endm + +.macro W_PRECALC_SSSE3 + +.macro W_PRECALC_00_15 + W_PRECALC_00_15_SSSE3 +.endm +.macro W_PRECALC_16_31 + W_PRECALC_16_31_SSSE3 +.endm +.macro W_PRECALC_32_79 + W_PRECALC_32_79_SSSE3 +.endm + +/* message scheduling pre-compute for rounds 0-15 */ +.macro W_PRECALC_00_15_SSSE3 + .if ((i & 3) == 0) + movdqu (i*4)(BUFFER_PTR), W_TMP1 + .elseif ((i & 3) == 1) + pshufb XMM_SHUFB_BSWAP, W_TMP1 + movdqa W_TMP1, W + .elseif ((i & 3) == 2) + paddd (K_BASE), W_TMP1 + .elseif ((i & 3) == 3) + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +/* message scheduling pre-compute for rounds 16-31 + * + * - calculating last 32 w[i] values in 8 XMM registers + * - pre-calculate K+w[i] values and store to mem, for later load by ALU add + * instruction + * + * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3] + * dependency, but improves for 32-79 + */ +.macro W_PRECALC_16_31_SSSE3 + # blended scheduling of vector and scalar instruction streams, one 4-wide + # vector iteration / 4 scalar rounds + .if ((i & 3) == 0) + movdqa W_minus_12, W + palignr $8, W_minus_16, W # w[i-14] + movdqa W_minus_04, W_TMP1 + psrldq $4, W_TMP1 # w[i-3] + pxor W_minus_08, W + .elseif ((i & 3) == 1) + pxor W_minus_16, W_TMP1 + pxor W_TMP1, W + movdqa W, W_TMP2 + movdqa W, W_TMP1 + pslldq $12, W_TMP2 + .elseif ((i & 3) == 2) + psrld $31, W + pslld $1, W_TMP1 + por W, W_TMP1 + movdqa W_TMP2, W + psrld $30, W_TMP2 + pslld $2, W + .elseif ((i & 3) == 3) + pxor W, W_TMP1 + pxor W_TMP2, W_TMP1 + movdqa W_TMP1, W + paddd K_XMM(K_BASE), W_TMP1 + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +/* message scheduling pre-compute for rounds 32-79 + * + * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 + * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 + * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken + */ +.macro W_PRECALC_32_79_SSSE3 + .if ((i & 3) == 0) + movdqa W_minus_04, W_TMP1 + pxor W_minus_28, W # W is W_minus_32 before xor + palignr $8, W_minus_08, W_TMP1 + .elseif ((i & 3) == 1) + pxor W_minus_16, W + pxor W_TMP1, W + movdqa W, W_TMP1 + .elseif ((i & 3) == 2) + psrld $30, W + pslld $2, W_TMP1 + por W, W_TMP1 + .elseif ((i & 3) == 3) + movdqa W_TMP1, W + paddd K_XMM(K_BASE), W_TMP1 + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.endm // W_PRECALC_SSSE3 + + +#define K1 0x5a827999 +#define K2 0x6ed9eba1 +#define K3 0x8f1bbcdc +#define K4 0xca62c1d6 + +.section .rodata +.align 16 + +K_XMM_AR: + .long K1, K1, K1, K1 + .long K2, K2, K2, K2 + .long K3, K3, K3, K3 + .long K4, K4, K4, K4 + +BSWAP_SHUFB_CTL: + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + + +.section .text + +W_PRECALC_SSSE3 +.macro xmm_mov a, b + movdqu \a,\b +.endm + +/* SSSE3 optimized implementation: + * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws, + * unsigned int rounds); + */ +SHA1_VECTOR_ASM sha1_transform_ssse3 + +#ifdef SHA1_ENABLE_AVX_SUPPORT + +.macro W_PRECALC_AVX + +.purgem W_PRECALC_00_15 +.macro W_PRECALC_00_15 + W_PRECALC_00_15_AVX +.endm +.purgem W_PRECALC_16_31 +.macro W_PRECALC_16_31 + W_PRECALC_16_31_AVX +.endm +.purgem W_PRECALC_32_79 +.macro W_PRECALC_32_79 + W_PRECALC_32_79_AVX +.endm + +.macro W_PRECALC_00_15_AVX + .if ((i & 3) == 0) + vmovdqu (i*4)(BUFFER_PTR), W_TMP1 + .elseif ((i & 3) == 1) + vpshufb XMM_SHUFB_BSWAP, W_TMP1, W + .elseif ((i & 3) == 2) + vpaddd (K_BASE), W, W_TMP1 + .elseif ((i & 3) == 3) + vmovdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.macro W_PRECALC_16_31_AVX + .if ((i & 3) == 0) + vpalignr $8, W_minus_16, W_minus_12, W # w[i-14] + vpsrldq $4, W_minus_04, W_TMP1 # w[i-3] + vpxor W_minus_08, W, W + vpxor W_minus_16, W_TMP1, W_TMP1 + .elseif ((i & 3) == 1) + vpxor W_TMP1, W, W + vpslldq $12, W, W_TMP2 + vpslld $1, W, W_TMP1 + .elseif ((i & 3) == 2) + vpsrld $31, W, W + vpor W, W_TMP1, W_TMP1 + vpslld $2, W_TMP2, W + vpsrld $30, W_TMP2, W_TMP2 + .elseif ((i & 3) == 3) + vpxor W, W_TMP1, W_TMP1 + vpxor W_TMP2, W_TMP1, W + vpaddd K_XMM(K_BASE), W, W_TMP1 + vmovdqu W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.macro W_PRECALC_32_79_AVX + .if ((i & 3) == 0) + vpalignr $8, W_minus_08, W_minus_04, W_TMP1 + vpxor W_minus_28, W, W # W is W_minus_32 before xor + .elseif ((i & 3) == 1) + vpxor W_minus_16, W_TMP1, W_TMP1 + vpxor W_TMP1, W, W + .elseif ((i & 3) == 2) + vpslld $2, W, W_TMP1 + vpsrld $30, W, W + vpor W, W_TMP1, W + .elseif ((i & 3) == 3) + vpaddd K_XMM(K_BASE), W, W_TMP1 + vmovdqu W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.endm // W_PRECALC_AVX + +W_PRECALC_AVX +.purgem xmm_mov +.macro xmm_mov a, b + vmovdqu \a,\b +.endm + + +/* AVX optimized implementation: + * extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws, + * unsigned int rounds); + */ +SHA1_VECTOR_ASM sha1_transform_avx + +#endif diff -uprN linux-2.6.32/arch/x86/crypto/sha1_ssse3_glue.c linux-2.6.32.ovz/arch/x86/crypto/sha1_ssse3_glue.c --- linux-2.6.32/arch/x86/crypto/sha1_ssse3_glue.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/crypto/sha1_ssse3_glue.c 2015-06-23 04:16:16.000000000 -0700 @@ -0,0 +1,258 @@ +/* + * Cryptographic API. + * + * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using + * Supplemental SSE3 instructions. + * + * This file is based on sha1_generic.c + * + * Copyright (c) Alan Smithee. + * Copyright (c) Andrew McDonald + * Copyright (c) Jean-Francois Dive + * Copyright (c) Mathias Krause + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, + unsigned int rounds); +#ifdef SHA1_ENABLE_AVX_SUPPORT +asmlinkage void sha1_transform_avx(u32 *digest, const char *data, + unsigned int rounds); +#endif + +static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int); + + +static int sha1_ssse3_init(struct shash_desc *desc) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + *sctx = (struct sha1_state){ + .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, + }; + + return 0; +} + +static int __sha1_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len, unsigned int partial) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + unsigned int done = 0; + + sctx->count += len; + + if (partial) { + done = SHA1_BLOCK_SIZE - partial; + memcpy(sctx->buffer + partial, data, done); + sha1_transform_asm(sctx->state, sctx->buffer, 1); + } + + if (len - done >= SHA1_BLOCK_SIZE) { + const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE; + + sha1_transform_asm(sctx->state, data + done, rounds); + done += rounds * SHA1_BLOCK_SIZE; + } + + memcpy(sctx->buffer, data + done, len - done); + + return 0; +} + +static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; + int res; + + /* Handle the fast case right here */ + if (partial + len < SHA1_BLOCK_SIZE) { + sctx->count += len; + memcpy(sctx->buffer + partial, data, len); + + return 0; + } + + if (!irq_fpu_usable()) { + res = crypto_sha1_update(desc, data, len); + } else { + kernel_fpu_begin(); + res = __sha1_ssse3_update(desc, data, len, partial); + kernel_fpu_end(); + } + + return res; +} + + +/* Add padding and return the message digest. */ +static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + unsigned int i, index, padlen; + __be32 *dst = (__be32 *)out; + __be64 bits; + static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; + + bits = cpu_to_be64(sctx->count << 3); + + /* Pad out to 56 mod 64 and append length */ + index = sctx->count % SHA1_BLOCK_SIZE; + padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index); + if (!irq_fpu_usable()) { + crypto_sha1_update(desc, padding, padlen); + crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits)); + } else { + kernel_fpu_begin(); + /* We need to fill a whole block for __sha1_ssse3_update() */ + if (padlen <= 56) { + sctx->count += padlen; + memcpy(sctx->buffer + index, padding, padlen); + } else { + __sha1_ssse3_update(desc, padding, padlen, index); + } + __sha1_ssse3_update(desc, (const u8 *)&bits, sizeof(bits), 56); + kernel_fpu_end(); + } + + /* Store state in digest */ + for (i = 0; i < 5; i++) + dst[i] = cpu_to_be32(sctx->state[i]); + + /* Wipe context */ + memset(sctx, 0, sizeof(*sctx)); + + return 0; +} + +static int sha1_ssse3_export(struct shash_desc *desc, void *out) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + memcpy(out, sctx, sizeof(*sctx)); + + return 0; +} + +static int sha1_ssse3_import(struct shash_desc *desc, const void *in) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + memcpy(sctx, in, sizeof(*sctx)); + + return 0; +} + +static struct shash_alg alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_ssse3_init, + .update = sha1_ssse3_update, + .final = sha1_ssse3_final, + .export = sha1_ssse3_export, + .import = sha1_ssse3_import, + .descsize = sizeof(struct sha1_state), + .statesize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name= "sha1-ssse3", + .cra_priority = 150, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +#ifdef SHA1_ENABLE_AVX_SUPPORT +static bool __init avx_usable(void) +{ + u64 xcr0; + + if (!cpu_has_avx || !cpu_has_osxsave) + return false; + + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { + pr_info("AVX detected but unusable.\n"); + + return false; + } + + return true; +} +#endif + +static void sha1_batch_ssse3(u32 *digest, const char *data, unsigned rounds) +{ + sha1_transform_ssse3(digest, data, rounds); +} + +#ifdef SHA1_ENABLE_AVX_SUPPORT +static void sha1_batch_avx(u32 *digest, const char *data, unsigned rounds) +{ + sha1_transform_avx(digest, data, rounds); +} +#endif + +static int __init sha1_ssse3_mod_init(void) +{ + int ret; + + /* test for SSSE3 first */ + if (cpu_has_ssse3) + sha1_transform_asm = sha1_transform_ssse3; + +#ifdef SHA1_ENABLE_AVX_SUPPORT + /* allow AVX to override SSSE3, it's a little faster */ + if (avx_usable()) + sha1_transform_asm = sha1_transform_avx; +#endif + + if (sha1_transform_asm) { + pr_info("Using %s optimized SHA-1 implementation\n", + sha1_transform_asm == sha1_transform_ssse3 ? "SSSE3" + : "AVX"); + ret = crypto_register_shash(&alg); + if (ret) + return ret; + + if (sha1_transform_asm == sha1_transform_ssse3) + sha_batch_transform = sha1_batch_ssse3; +#ifdef SHA1_ENABLE_AVX_SUPPORT + else + sha_batch_transform = sha1_batch_avx; +#endif + return 0; + } + pr_info("Neither AVX nor SSSE3 is available/usable.\n"); + + return -ENODEV; +} + +module_init(sha1_ssse3_mod_init); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated"); + +MODULE_ALIAS("sha1"); diff -uprN linux-2.6.32/arch/x86/ia32/ia32_aout.c linux-2.6.32.ovz/arch/x86/ia32/ia32_aout.c --- linux-2.6.32/arch/x86/ia32/ia32_aout.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/ia32/ia32_aout.c 2015-03-10 13:22:46.000000000 -0700 @@ -308,14 +308,16 @@ static int load_aout_binary(struct linux if (retval) return retval; - regs->cs = __USER32_CS; - regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = - regs->r13 = regs->r14 = regs->r15 = 0; - /* OK, This is the point of no return */ set_personality(PER_LINUX); set_thread_flag(TIF_IA32); - clear_thread_flag(TIF_ABI_PENDING); + set_bit(MMF_COMPAT, ¤t->mm->flags); + + setup_new_exec(bprm); + + regs->cs = __USER32_CS; + regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = + regs->r13 = regs->r14 = regs->r15 = 0; current->mm->end_code = ex.a_text + (current->mm->start_code = N_TXTADDR(ex)); diff -uprN linux-2.6.32/arch/x86/ia32/ia32entry.S linux-2.6.32.ovz/arch/x86/ia32/ia32entry.S --- linux-2.6.32/arch/x86/ia32/ia32entry.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/ia32/ia32entry.S 2015-06-23 04:16:16.000000000 -0700 @@ -14,6 +14,7 @@ #include #include #include +#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -50,7 +51,12 @@ /* * Reload arg registers from stack in case ptrace changed them. * We don't reload %eax because syscall_trace_enter() returned - * the value it wants us to use in the table lookup. + * the %rax value we should see. Instead, we just truncate that + * value to 32 bits again as we did on entry from user mode. + * If it's a new value set by user_regset during entry tracing, + * this matches the normal truncation of the user-mode value. + * If it's -1 to make us punt the syscall, then (u32)-1 is still + * an appropriately invalid value. */ .macro LOAD_ARGS32 offset, _r9=0 .if \_r9 @@ -60,6 +66,7 @@ movl \offset+48(%rsp),%edx movl \offset+56(%rsp),%esi movl \offset+64(%rsp),%edi + movl %eax,%eax /* zero extension */ .endm .macro CFI_STARTPROC32 simple @@ -153,7 +160,7 @@ ENTRY(ia32_sysenter_target) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz sysenter_tracesys - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys sysenter_do_call: IA32_ARG_FIXUP @@ -195,7 +202,7 @@ sysexit_from_sys_call: movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ call audit_syscall_entry movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys movl %ebx,%edi /* reload 1st syscall arg */ movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */ @@ -208,17 +215,18 @@ sysexit_from_sys_call: testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) jnz ia32_ret_from_sys_call TRACE_IRQS_ON - sti + ENABLE_INTERRUPTS(CLBR_NONE) movl %eax,%esi /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + jbe 1f + movslq %eax, %rsi /* if error sign extend to 64 bits */ +1: setbe %al /* 1 if error, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit GET_THREAD_INFO(%r10) - movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ + movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi - cli + DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl %edi,TI_flags(%r10) jz \exit @@ -248,7 +256,7 @@ sysenter_tracesys: call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ jmp sysenter_do_call CFI_ENDPROC @@ -314,7 +322,7 @@ ENTRY(ia32_cstar_target) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz cstar_tracesys - cmpl $IA32_NR_syscalls-1,%eax + cmpq $IA32_NR_syscalls-1,%rax ja ia32_badsys cstar_do_call: IA32_ARG_FIXUP 1 @@ -367,7 +375,7 @@ cstar_tracesys: LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ RESTORE_REST xchgl %ebp,%r9d - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ jmp cstar_do_call END(ia32_cstar_target) @@ -425,7 +433,7 @@ ENTRY(ia32_syscall) orl $TS_COMPAT,TI_status(%r10) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) jnz ia32_tracesys - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys ia32_do_call: IA32_ARG_FIXUP @@ -444,7 +452,7 @@ ia32_tracesys: call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST - cmpl $(IA32_NR_syscalls-1),%eax + cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ jmp ia32_do_call END(ia32_syscall) @@ -617,7 +625,7 @@ ia32_sys_call_table: .quad stub32_iopl /* 110 */ .quad sys_vhangup .quad quiet_ni_syscall /* old "idle" system call */ - .quad sys32_vm86_warning /* vm86old */ + .quad quiet_ni_syscall /* vm86old */ .quad compat_sys_wait4 .quad sys_swapoff /* 115 */ .quad compat_sys_sysinfo @@ -670,7 +678,7 @@ ia32_sys_call_table: .quad sys_mremap .quad sys_setresuid16 .quad sys_getresuid16 /* 165 */ - .quad sys32_vm86_warning /* vm86 */ + .quad quiet_ni_syscall /* vm86 */ .quad quiet_ni_syscall /* query_module */ .quad sys_poll .quad compat_sys_nfsservctl @@ -696,7 +704,7 @@ ia32_sys_call_table: .quad quiet_ni_syscall /* streams2 */ .quad stub32_vfork /* 190 */ .quad compat_sys_getrlimit - .quad sys32_mmap2 + .quad sys_mmap_pgoff .quad sys32_truncate64 .quad sys32_ftruncate64 .quad sys32_stat64 /* 195 */ @@ -841,4 +849,37 @@ ia32_sys_call_table: .quad compat_sys_pwritev .quad compat_sys_rt_tgsigqueueinfo /* 335 */ .quad sys_perf_event_open + .quad compat_sys_recvmmsg + .quad quiet_ni_syscall /* sys_fanotify_init */ + .quad quiet_ni_syscall /* sys32_fanotify_mark */ + .quad quiet_ni_syscall /* sys_prlimit64 340 */ + .quad sys_name_to_handle_at + .quad compat_sys_open_by_handle_at + .quad compat_sys_clock_adjtime + .quad sys_syncfs + .quad compat_sys_sendmmsg /* 345 */ + .quad sys_setns /* setns */ + .quad compat_sys_process_vm_readv + .quad compat_sys_process_vm_writev + .rept 500-(.-ia32_sys_call_table)/8 + .quad sys_ni_syscall + .endr + .quad sys_fairsched_mknod /* 500 */ + .quad sys_fairsched_rmnod + .quad sys_fairsched_chwt + .quad sys_fairsched_mvpr + .quad sys_fairsched_rate + .quad sys_fairsched_vcpus /* 505 */ + .quad sys_fairsched_cpumask + .quad sys_fairsched_nodemask + .quad sys_ni_syscall + .quad sys_ni_syscall + .quad sys_getluid /* 510 */ + .quad sys_setluid + .quad compat_sys_setublimit + .quad compat_sys_ubstat + .quad sys_ni_syscall + .quad sys_ni_syscall /* 515 */ + .quad sys_lchmod + .quad compat_sys_lutime ia32_syscall_end: diff -uprN linux-2.6.32/arch/x86/ia32/sys_ia32.c linux-2.6.32.ovz/arch/x86/ia32/sys_ia32.c --- linux-2.6.32/arch/x86/ia32/sys_ia32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/ia32/sys_ia32.c 2015-06-23 04:16:16.000000000 -0700 @@ -155,9 +155,6 @@ struct mmap_arg_struct { asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) { struct mmap_arg_struct a; - struct file *file = NULL; - unsigned long retval; - struct mm_struct *mm ; if (copy_from_user(&a, arg, sizeof(a))) return -EFAULT; @@ -165,22 +162,8 @@ asmlinkage long sys32_mmap(struct mmap_a if (a.offset & ~PAGE_MASK) return -EINVAL; - if (!(a.flags & MAP_ANONYMOUS)) { - file = fget(a.fd); - if (!file) - return -EBADF; - } - - mm = current->mm; - down_write(&mm->mmap_sem); - retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags, + return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset>>PAGE_SHIFT); - if (file) - fput(file); - - up_write(&mm->mmap_sem); - - return retval; } asmlinkage long sys32_mprotect(unsigned long start, size_t len, @@ -539,30 +522,6 @@ asmlinkage long sys32_sendfile(int out_f return ret; } -asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - struct mm_struct *mm = current->mm; - unsigned long error; - struct file *file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - return -EBADF; - } - - down_write(&mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&mm->mmap_sem); - - if (file) - fput(file); - return error; -} - asmlinkage long sys32_olduname(struct oldold_utsname __user *name) { char *arch = "x86_64"; @@ -619,13 +578,13 @@ asmlinkage long sys32_execve(char __user compat_uptr_t __user *envp, struct pt_regs *regs) { long error; - char *filename; + struct filename *filename; filename = getname(name); error = PTR_ERR(filename); if (IS_ERR(filename)) return error; - error = compat_do_execve(filename, argv, envp, regs); + error = compat_do_execve(filename->name, argv, envp, regs); putname(filename); return error; } @@ -664,20 +623,6 @@ long sys32_fadvise64_64(int fd, __u32 of advice); } -long sys32_vm86_warning(void) -{ - struct task_struct *me = current; - static char lastcomm[sizeof(me->comm)]; - - if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { - compat_printk(KERN_INFO - "%s: vm86 mode not supported on 64 bit kernel\n", - me->comm); - strncpy(lastcomm, me->comm, sizeof(lastcomm)); - } - return -ENOSYS; -} - long sys32_lookup_dcookie(u32 addr_low, u32 addr_high, char __user *buf, size_t len) { diff -uprN linux-2.6.32/arch/x86/include/asm/acpi.h linux-2.6.32.ovz/arch/x86/include/asm/acpi.h --- linux-2.6.32/arch/x86/include/asm/acpi.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/acpi.h 2015-03-10 13:24:07.000000000 -0700 @@ -89,6 +89,7 @@ extern int acpi_ht; extern int acpi_pci_disabled; extern int acpi_skip_timer_override; extern int acpi_use_timer_override; +extern int acpi_disable_cmcff; extern u8 acpi_sci_flags; extern int acpi_sci_override_gsi; @@ -146,6 +147,7 @@ static inline unsigned int acpi_processo #define acpi_lapic 0 #define acpi_ioapic 0 +#define acpi_disable_cmcff 0 static inline void acpi_noirq_set(void) { } static inline void acpi_disable_pci(void) { } static inline void disable_acpi(void) { } diff -uprN linux-2.6.32/arch/x86/include/asm/alternative-asm.h linux-2.6.32.ovz/arch/x86/include/asm/alternative-asm.h --- linux-2.6.32/arch/x86/include/asm/alternative-asm.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/alternative-asm.h 2015-03-10 13:22:35.000000000 -0700 @@ -19,4 +19,13 @@ .endm #endif +.macro altinstruction_entry orig alt feature orig_len alt_len + .align 8 + .quad \orig + .quad \alt + .word \feature + .byte \orig_len + .byte \alt_len +.endm + #endif /* __ASSEMBLY__ */ diff -uprN linux-2.6.32/arch/x86/include/asm/alternative.h linux-2.6.32.ovz/arch/x86/include/asm/alternative.h --- linux-2.6.32/arch/x86/include/asm/alternative.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/alternative.h 2015-03-10 13:22:45.000000000 -0700 @@ -45,17 +45,17 @@ struct alt_instr { u8 *instr; /* original instruction */ u8 *replacement; - u8 cpuid; /* cpuid bit set for replacement */ + u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ u8 replacementlen; /* length of new instruction, <= instrlen */ - u8 pad1; #ifdef CONFIG_X86_64 u32 pad2; #endif }; extern void alternative_instructions(void); -extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); +extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end, + int fixup); struct module; @@ -65,12 +65,17 @@ extern void alternatives_smp_module_add( void *text, void *text_end); extern void alternatives_smp_module_del(struct module *mod); extern void alternatives_smp_switch(int smp); +extern int alternatives_text_reserved(void *start, void *end); #else static inline void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, void *text, void *text_end) {} static inline void alternatives_smp_module_del(struct module *mod) {} static inline void alternatives_smp_switch(int smp) {} +static inline int alternatives_text_reserved(void *start, void *end) +{ + return 0; +} #endif /* CONFIG_SMP */ /* alternative assembly primitive: */ @@ -81,10 +86,13 @@ static inline void alternatives_smp_swit _ASM_ALIGN "\n" \ _ASM_PTR "661b\n" /* label */ \ _ASM_PTR "663f\n" /* new instruction */ \ - " .byte " __stringify(feature) "\n" /* feature bit */ \ + " .word " __stringify(feature) "\n" /* feature bit */ \ " .byte 662b-661b\n" /* sourcelen */ \ " .byte 664f-663f\n" /* replacementlen */ \ ".previous\n" \ + ".section .discard,\"aw\",@progbits\n" \ + " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \ + ".previous\n" \ ".section .altinstr_replacement, \"ax\"\n" \ "663:\n\t" newinstr "\n664:\n" /* replacement */ \ ".previous" @@ -124,11 +132,16 @@ static inline void alternatives_smp_swit asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ : output : "i" (0), ## input) +/* Like alternative_io, but for replacing a direct call with another one. */ +#define alternative_call(oldfunc, newfunc, feature, output, input...) \ + asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \ + : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input) + /* * use this macro(s) if you need more than one output parameter * in alternative_io */ -#define ASM_OUTPUT2(a, b) a, b +#define ASM_OUTPUT2(a...) a struct paravirt_patch_site; #ifdef CONFIG_PARAVIRT @@ -154,10 +167,12 @@ static inline void apply_paravirt(struct * invalid instruction possible) or if the instructions are changed from a * consistent state to another consistent state atomically. * More care must be taken when modifying code in the SMP case because of - * Intel's errata. + * Intel's errata. text_poke_smp() takes care that errata, but still + * doesn't support NMI/MCE handler code modifying. * On the local CPU you need to be protected again NMI or MCE handlers seeing an * inconsistent instruction while you patch. */ extern void *text_poke(void *addr, const void *opcode, size_t len); +extern void *text_poke_smp(void *addr, const void *opcode, size_t len); #endif /* _ASM_X86_ALTERNATIVE_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/amd_iommu.h linux-2.6.32.ovz/arch/x86/include/asm/amd_iommu.h --- linux-2.6.32/arch/x86/include/asm/amd_iommu.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/amd_iommu.h 2015-03-10 13:21:15.000000000 -0700 @@ -32,6 +32,8 @@ extern void amd_iommu_flush_all_domains( extern void amd_iommu_flush_all_devices(void); extern void amd_iommu_shutdown(void); extern void amd_iommu_apply_erratum_63(u16 devid); +extern void amd_iommu_init_api(void); +extern void amd_iommu_init_notifier(void); #else static inline int amd_iommu_init(void) { return -ENODEV; } static inline void amd_iommu_detect(void) { } diff -uprN linux-2.6.32/arch/x86/include/asm/amd_iommu_types.h linux-2.6.32.ovz/arch/x86/include/asm/amd_iommu_types.h --- linux-2.6.32/arch/x86/include/asm/amd_iommu_types.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/amd_iommu_types.h 2015-03-10 13:24:05.000000000 -0700 @@ -135,6 +135,7 @@ /* constants to configure the command buffer */ #define CMD_BUFFER_SIZE 8192 +#define CMD_BUFFER_UNINITIALIZED 1 #define CMD_BUFFER_ENTRIES 512 #define MMIO_CMD_SIZE_SHIFT 56 #define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT) @@ -167,6 +168,34 @@ (~((1ULL << (12 + ((lvl) * 9))) - 1))) #define PM_ALIGNED(lvl, addr) ((PM_MAP_MASK(lvl) & (addr)) == (addr)) +/* + * Returns the page table level to use for a given page size + * Pagesize is expected to be a power-of-two + */ +#define PAGE_SIZE_LEVEL(pagesize) \ + ((__ffs(pagesize) - 12) / 9) +/* + * Returns the number of ptes to use for a given page size + * Pagesize is expected to be a power-of-two + */ +#define PAGE_SIZE_PTE_COUNT(pagesize) \ + (1ULL << ((__ffs(pagesize) - 12) % 9)) + +/* + * Aligns a given io-virtual address to a given page size + * Pagesize is expected to be a power-of-two + */ +#define PAGE_SIZE_ALIGN(address, pagesize) \ + ((address) & ~((pagesize) - 1)) +/* + * Creates an IOMMU PTE for an address an a given pagesize + * The PTE has no permission bits set + * Pagesize is expected to be a power-of-two larger than 4096 + */ +#define PAGE_SIZE_PTE(address, pagesize) \ + (((address) | ((pagesize) - 1)) & \ + (~(pagesize >> 1)) & PM_ADDR_MASK) + #define IOMMU_PTE_P (1ULL << 0) #define IOMMU_PTE_TV (1ULL << 1) #define IOMMU_PTE_U (1ULL << 59) @@ -297,6 +326,9 @@ struct amd_iommu { /* Pointer to PCI device of this IOMMU */ struct pci_dev *dev; + /* Cache pdev to root device for resume quirks */ + struct pci_dev *root_pdev; + /* physical address of MMIO space */ u64 mmio_phys; /* virtual address of MMIO space */ @@ -305,6 +337,9 @@ struct amd_iommu { /* capabilities of that IOMMU read from ACPI */ u32 cap; + /* flags read from acpi table */ + u8 acpi_flags; + /* * Capability pointer. There could be more than one IOMMU per PCI * device function if there are more than one AMD IOMMU capability @@ -348,6 +383,24 @@ struct amd_iommu { /* default dma_ops domain for that IOMMU */ struct dma_ops_domain *default_dom; + + /* + * We can't rely on the BIOS to restore all values on reinit, so we + * need to stash them + */ + + /* The iommu BAR */ + u32 stored_addr_lo; + u32 stored_addr_hi; + + /* + * Each iommu has 6 l1s, each of which is documented as having 0x12 + * registers + */ + u32 stored_l1[6][0x12]; + + /* The l2 indirect registers */ + u32 stored_l2[0x83]; }; /* @@ -466,6 +519,12 @@ static inline void amd_iommu_stats_init( #endif /* CONFIG_AMD_IOMMU_STATS */ +static inline bool is_rd890_iommu(struct pci_dev *pdev) +{ + return (pdev->vendor == PCI_VENDOR_ID_ATI) && + (pdev->device == PCI_DEVICE_ID_RD890_IOMMU); +} + /* some function prototypes */ extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); diff -uprN linux-2.6.32/arch/x86/include/asm/amd_nb.h linux-2.6.32.ovz/arch/x86/include/asm/amd_nb.h --- linux-2.6.32/arch/x86/include/asm/amd_nb.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/amd_nb.h 2015-03-10 13:24:08.000000000 -0700 @@ -0,0 +1,66 @@ +#ifndef _ASM_X86_AMD_NB_H +#define _ASM_X86_AMD_NB_H + +#include +#include + +struct amd_nb_bus_dev_range { + u8 bus; + u8 dev_base; + u8 dev_limit; +}; + +extern const struct pci_device_id amd_nb_misc_ids[]; +extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; + +extern bool early_is_amd_nb(u32 value); +extern struct resource *amd_get_mmconfig_range(struct resource *res); +extern int amd_cache_northbridges(void); +extern void amd_flush_garts(void); +extern int amd_scan_nodes(unsigned long start, unsigned long end); +extern int amd_get_subcaches(int); +extern int amd_set_subcaches(int, int); + +struct amd_northbridge { + struct pci_dev *misc; + struct pci_dev *link; +}; + +struct amd_northbridge_info { + u16 num; + u64 flags; + struct amd_northbridge *nb; +}; +extern struct amd_northbridge_info amd_northbridges; + +#define AMD_NB_GART BIT(0) +#define AMD_NB_L3_INDEX_DISABLE BIT(1) +#define AMD_NB_L3_PARTITIONING BIT(2) + +#ifdef CONFIG_AMD_NB + +static inline u16 amd_nb_num(void) +{ + return amd_northbridges.num; +} + +static inline bool amd_nb_has_feature(unsigned feature) +{ + return ((amd_northbridges.flags & feature) == feature); +} + +static inline struct amd_northbridge *node_to_amd_nb(int node) +{ + return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL; +} + +#else + +#define amd_nb_num(x) 0 +#define amd_nb_has_feature(x) false +#define node_to_amd_nb(x) NULL + +#endif + + +#endif /* _ASM_X86_AMD_NB_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/apicdef.h linux-2.6.32.ovz/arch/x86/include/asm/apicdef.h --- linux-2.6.32/arch/x86/include/asm/apicdef.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/apicdef.h 2015-03-10 13:23:26.000000000 -0700 @@ -11,6 +11,12 @@ #define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000 #define APIC_DEFAULT_PHYS_BASE 0xfee00000 +/* + * This is the IO-APIC register space as specified + * by Intel docs: + */ +#define IO_APIC_SLOT_SIZE 1024 + #define APIC_ID 0x20 #define APIC_LVR 0x30 @@ -31,7 +37,7 @@ #define APIC_ARBPRI_MASK 0xFFu #define APIC_PROCPRI 0xA0 #define APIC_EOI 0xB0 -#define APIC_EIO_ACK 0x0 +#define APIC_EOI_ACK 0x0 /* Docs say 0 for future compat. */ #define APIC_RRR 0xC0 #define APIC_LDR 0xD0 #define APIC_LDR_MASK (0xFFu << 24) @@ -72,6 +78,7 @@ #define APIC_DEST_LOGICAL 0x00800 #define APIC_DEST_PHYSICAL 0x00000 #define APIC_DM_FIXED 0x00000 +#define APIC_DM_FIXED_MASK 0x00700 #define APIC_DM_LOWEST 0x00100 #define APIC_DM_SMI 0x00200 #define APIC_DM_REMRD 0x00300 @@ -93,7 +100,9 @@ #define APIC_TIMER_BASE_CLKIN 0x0 #define APIC_TIMER_BASE_TMBASE 0x1 #define APIC_TIMER_BASE_DIV 0x2 +#define APIC_LVT_TIMER_ONESHOT (0 << 17) #define APIC_LVT_TIMER_PERIODIC (1 << 17) +#define APIC_LVT_TIMER_TSCDEADLINE (2 << 17) #define APIC_LVT_MASKED (1 << 16) #define APIC_LVT_LEVEL_TRIGGER (1 << 15) #define APIC_LVT_REMOTE_IRR (1 << 14) @@ -125,6 +134,7 @@ #define APIC_EILVTn(n) (0x500 + 0x10 * n) #define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ #define APIC_EILVT_NR_AMD_10H 4 +#define APIC_EILVT_NR_MAX APIC_EILVT_NR_AMD_10H #define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) #define APIC_EILVT_MSG_FIX 0x0 #define APIC_EILVT_MSG_SMI 0x2 @@ -138,6 +148,7 @@ #ifdef CONFIG_X86_32 # define MAX_IO_APICS 64 +# define MAX_LOCAL_APIC 256 #else # define MAX_IO_APICS 128 # define MAX_LOCAL_APIC 32768 diff -uprN linux-2.6.32/arch/x86/include/asm/apic.h linux-2.6.32.ovz/arch/x86/include/asm/apic.h --- linux-2.6.32/arch/x86/include/asm/apic.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/apic.h 2015-03-10 13:24:12.000000000 -0700 @@ -14,6 +14,7 @@ #include #include #include +#include #define ARCH_APICTIMER_STOPS_ON_C3 1 @@ -50,6 +51,7 @@ extern unsigned int apic_verbosity; extern int local_apic_timer_c2_ok; extern int disable_apic; +extern unsigned int lapic_timer_frequency; #ifdef CONFIG_SMP extern void __inquire_remote_apic(int apicid); @@ -139,6 +141,11 @@ static inline void native_apic_msr_write wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0); } +static inline void native_apic_msr_eoi_write(u32 reg, u32 v) +{ + wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); +} + static inline u32 native_apic_msr_read(u32 reg) { u32 low, high; @@ -252,9 +259,7 @@ static inline int apic_is_clustered_box( } #endif -extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask); -extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask); - +extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); #else /* !CONFIG_X86_LOCAL_APIC */ static inline void lapic_shutdown(void) { } @@ -356,6 +361,16 @@ struct apic { void (*icr_write)(u32 low, u32 high); void (*wait_icr_idle)(void); u32 (*safe_wait_icr_idle)(void); +#ifndef __GENKSYMS__ + /* + * ->eoi_write() has the same signature as ->write(). + * + * Drivers can support both ->eoi_write() and ->write() by passing the same + * callback value. Kernel can override ->eoi_write() and fall back + * on write for EOI. + */ + void (*eoi_write)(u32 reg, u32 v); +#endif }; /* @@ -383,6 +398,11 @@ static inline void apic_write(u32 reg, u apic->write(reg, val); } +static inline void apic_eoi(void) +{ + apic->eoi_write(APIC_EOI, APIC_EOI_ACK); +} + static inline u64 apic_icr_read(void) { return apic->icr_read(); @@ -403,6 +423,12 @@ static inline u32 safe_apic_wait_icr_idl return apic->safe_wait_icr_idle(); } +#ifdef CONFIG_X86_LOCAL_APIC +extern struct apic *apic_probe[]; +extern void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)); +#else +static inline void apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) {} +#endif static inline void ack_APIC_irq(void) { @@ -411,9 +437,7 @@ static inline void ack_APIC_irq(void) * ack_APIC_irq() actually gets compiled as a single instruction * ... yummie. */ - - /* Docs say use 0 for future compatibility */ - apic_write(APIC_EOI, 0); + apic_eoi(); #endif } @@ -589,6 +613,32 @@ static inline physid_mask_t default_apic } #endif /* CONFIG_X86_LOCAL_APIC */ +extern void irq_enter(void); +extern void irq_exit(void); + +static inline void entering_irq(void) +{ + exit_idle(); + irq_enter(); +} + +static inline void entering_ack_irq(void) +{ + ack_APIC_irq(); + entering_irq(); +} + +static inline void exiting_irq(void) +{ + irq_exit(); +} + +static inline void exiting_ack_irq(void) +{ + irq_exit(); + /* Ack only at the end to avoid potential reentry */ + ack_APIC_irq(); +} #ifdef CONFIG_X86_32 extern u8 cpu_2_logical_apicid[NR_CPUS]; diff -uprN linux-2.6.32/arch/x86/include/asm/archrandom.h linux-2.6.32.ovz/arch/x86/include/asm/archrandom.h --- linux-2.6.32/arch/x86/include/asm/archrandom.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/archrandom.h 2015-03-10 13:23:10.000000000 -0700 @@ -0,0 +1,75 @@ +/* + * This file is part of the Linux kernel. + * + * Copyright (c) 2011, Intel Corporation + * Authors: Fenghua Yu , + * H. Peter Anvin + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#ifndef ASM_X86_ARCHRANDOM_H +#define ASM_X86_ARCHRANDOM_H + +#include +#include +#include +#include + +#define RDRAND_RETRY_LOOPS 10 + +#define RDRAND_INT ".byte 0x0f,0xc7,0xf0" +#ifdef CONFIG_X86_64 +# define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0" +#else +# define RDRAND_LONG RDRAND_INT +#endif + +#ifdef CONFIG_ARCH_RANDOM + +#define GET_RANDOM(name, type, rdrand, nop) \ +static inline int name(type *v) \ +{ \ + int ok; \ + alternative_io("movl $0, %0\n\t" \ + nop, \ + "\n1: " rdrand "\n\t" \ + "jc 2f\n\t" \ + "decl %0\n\t" \ + "jnz 1b\n\t" \ + "2:", \ + X86_FEATURE_RDRAND, \ + ASM_OUTPUT2("=r" (ok), "=a" (*v)), \ + "0" (RDRAND_RETRY_LOOPS)); \ + return ok; \ +} + +#ifdef CONFIG_X86_64 + +GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP5); +GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP4); + +#else + +GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP3); +GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3); + +#endif /* CONFIG_X86_64 */ + +#endif /* CONFIG_ARCH_RANDOM */ + +extern void x86_init_rdrand(struct cpuinfo_x86 *c); + +#endif /* ASM_X86_ARCHRANDOM_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/bitops.h linux-2.6.32.ovz/arch/x86/include/asm/bitops.h --- linux-2.6.32/arch/x86/include/asm/bitops.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/bitops.h 2015-03-10 13:23:23.000000000 -0700 @@ -15,6 +15,8 @@ #include #include +#define BIT_64(n) (U64_C(1) << (n)) + /* * These have to be done with inline assembly: that way the bit-setting * is guaranteed to be atomic. All bit operations return 0 if the bit @@ -262,6 +264,13 @@ static inline int test_and_clear_bit(int * This operation is non-atomic and can be reordered. * If two examples of this operation race, one can appear to succeed * but actually fail. You must protect multiple accesses with a lock. + * + * Note: the operation is performed atomically with respect to + * the local CPU, but not other CPUs. Portable code should not + * rely on this behaviour. + * KVM relies on this behaviour on x86 for modifying memory that is also + * accessed from a hypervisor on the same CPU if running in a VM: don't change + * this without also updating arch/x86/kernel/kvm.c */ static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) { @@ -306,6 +315,44 @@ static inline int test_and_change_bit(in return oldbit; } +#ifdef CONFIG_X86_64 +#define INT_MAX ((int)(~0U>>1)) +#define LONG_INT_MAX ((long)INT_MAX) +#define INT_MAX_PLUS_1 (LONG_INT_MAX + 1UL) + +static inline int test_and_set_bit_long(unsigned long nr, volatile unsigned long *addr) +{ + unsigned long oldbit; + + if (unlikely(nr >= INT_MAX_PLUS_1)) { + addr += (nr / INT_MAX_PLUS_1) * (INT_MAX_PLUS_1 / BITS_PER_LONG); + nr &= INT_MAX; + } + asm volatile(LOCK_PREFIX "bts %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); + + return oldbit; +} + +static inline int test_and_clear_bit_long(unsigned long nr, volatile unsigned long *addr) +{ + unsigned long oldbit; + + if (unlikely (nr > LONG_INT_MAX)) { + while (nr > LONG_INT_MAX) { + nr -= LONG_INT_MAX + 1; + addr += ((LONG_INT_MAX + 1)/BITS_PER_LONG); + } + } + asm volatile(LOCK_PREFIX "btr %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); + + return oldbit; +} +#endif + static __always_inline int constant_test_bit(unsigned int nr, const volatile unsigned long *addr) { return ((1UL << (nr % BITS_PER_LONG)) & diff -uprN linux-2.6.32/arch/x86/include/asm/cacheflush.h linux-2.6.32.ovz/arch/x86/include/asm/cacheflush.h --- linux-2.6.32/arch/x86/include/asm/cacheflush.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/cacheflush.h 2015-03-10 13:21:57.000000000 -0700 @@ -43,9 +43,6 @@ static inline void copy_from_user_page(s memcpy(dst, src, len); } -#define PG_WC PG_arch_1 -PAGEFLAG(WC, WC) - #ifdef CONFIG_X86_PAT /* * X86 PAT uses page flags WC and Uncached together to keep track of @@ -54,16 +51,24 @@ PAGEFLAG(WC, WC) * _PAGE_CACHE_UC_MINUS and fourth state where page's memory type has not * been changed from its default (value of -1 used to denote this). * Note we do not support _PAGE_CACHE_UC here. - * - * Caller must hold memtype_lock for atomicity. */ + +#define _PGMT_DEFAULT 0 +#define _PGMT_WC (1UL << PG_arch_1) +#define _PGMT_UC_MINUS (1UL << PG_uncached) +#define _PGMT_WB (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_MASK (1UL << PG_uncached | 1UL << PG_arch_1) +#define _PGMT_CLEAR_MASK (~_PGMT_MASK) + static inline unsigned long get_page_memtype(struct page *pg) { - if (!PageUncached(pg) && !PageWC(pg)) + unsigned long pg_flags = pg->flags & _PGMT_MASK; + + if (pg_flags == _PGMT_DEFAULT) return -1; - else if (!PageUncached(pg) && PageWC(pg)) + else if (pg_flags == _PGMT_WC) return _PAGE_CACHE_WC; - else if (PageUncached(pg) && !PageWC(pg)) + else if (pg_flags == _PGMT_UC_MINUS) return _PAGE_CACHE_UC_MINUS; else return _PAGE_CACHE_WB; @@ -71,25 +76,26 @@ static inline unsigned long get_page_mem static inline void set_page_memtype(struct page *pg, unsigned long memtype) { + unsigned long memtype_flags = _PGMT_DEFAULT; + unsigned long old_flags; + unsigned long new_flags; + switch (memtype) { case _PAGE_CACHE_WC: - ClearPageUncached(pg); - SetPageWC(pg); + memtype_flags = _PGMT_WC; break; case _PAGE_CACHE_UC_MINUS: - SetPageUncached(pg); - ClearPageWC(pg); + memtype_flags = _PGMT_UC_MINUS; break; case _PAGE_CACHE_WB: - SetPageUncached(pg); - SetPageWC(pg); - break; - default: - case -1: - ClearPageUncached(pg); - ClearPageWC(pg); + memtype_flags = _PGMT_WB; break; } + + do { + old_flags = pg->flags; + new_flags = (old_flags & _PGMT_CLEAR_MASK) | memtype_flags; + } while (cmpxchg(&pg->flags, old_flags, new_flags) != old_flags); } #else static inline unsigned long get_page_memtype(struct page *pg) { return -1; } @@ -138,9 +144,11 @@ int set_memory_np(unsigned long addr, in int set_memory_4k(unsigned long addr, int numpages); int set_memory_array_uc(unsigned long *addr, int addrinarray); +int set_memory_array_wc(unsigned long *addr, int addrinarray); int set_memory_array_wb(unsigned long *addr, int addrinarray); int set_pages_array_uc(struct page **pages, int addrinarray); +int set_pages_array_wc(struct page **pages, int addrinarray); int set_pages_array_wb(struct page **pages, int addrinarray); /* @@ -176,6 +184,7 @@ void clflush_cache_range(void *addr, uns #ifdef CONFIG_DEBUG_RODATA void mark_rodata_ro(void); extern const int rodata_test_data; +extern int kernel_set_to_readonly; void set_kernel_text_rw(void); void set_kernel_text_ro(void); #else diff -uprN linux-2.6.32/arch/x86/include/asm/compat.h linux-2.6.32.ovz/arch/x86/include/asm/compat.h --- linux-2.6.32/arch/x86/include/asm/compat.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/compat.h 2015-06-23 04:16:16.000000000 -0700 @@ -107,7 +107,8 @@ struct compat_statfs { compat_fsid_t f_fsid; int f_namelen; /* SunOS ignores this field. */ int f_frsize; - int f_spare[5]; + int f_flags; + int f_spare[4]; }; #define COMPAT_RLIM_OLD_INFINITY 0x7fffffff @@ -204,7 +205,7 @@ static inline compat_uptr_t ptr_to_compa return (u32)(unsigned long)uptr; } -static inline void __user *compat_alloc_user_space(long len) +static inline void __user *arch_compat_alloc_user_space(long len) { struct pt_regs *regs = task_pt_regs(current); return (void __user *)regs->sp - len; diff -uprN linux-2.6.32/arch/x86/include/asm/cpu_device_id.h linux-2.6.32.ovz/arch/x86/include/asm/cpu_device_id.h --- linux-2.6.32/arch/x86/include/asm/cpu_device_id.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/cpu_device_id.h 2015-03-10 13:24:00.000000000 -0700 @@ -0,0 +1,13 @@ +#ifndef _CPU_DEVICE_ID +#define _CPU_DEVICE_ID 1 + +/* + * Declare drivers belonging to specific x86 CPUs + * Similar in spirit to pci_device_id and related PCI functions + */ + +#include + +extern const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match); + +#endif diff -uprN linux-2.6.32/arch/x86/include/asm/cpufeature.h linux-2.6.32.ovz/arch/x86/include/asm/cpufeature.h --- linux-2.6.32/arch/x86/include/asm/cpufeature.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/cpufeature.h 2015-06-23 04:16:16.000000000 -0700 @@ -7,6 +7,12 @@ #include #define NCAPINTS 9 /* N 32-bit words worth of info */ +/* + * KABI prevents us from extending NCAPINTS. Instead use RH_EXT_NCAPINTS and + * extend the array in the non-whitelisted cpuinfo_x86_rh structure. + */ +#define RH_EXT_NCAPINTS 2 +#define RHNCAPINTS (NCAPINTS + RH_EXT_NCAPINTS) /* * Note: If the comment begins with a quoted string, that string is used @@ -97,6 +103,8 @@ #define X86_FEATURE_EXTD_APICID (3*32+26) /* has extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM (3*32+27) /* multi-node processor */ #define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */ +#define X86_FEATURE_UNFAIR_SPINLOCK (3*32+29) /* use unfair spinlocks */ +#define X86_FEATURE_CPUID_FAULTING (3*32+30) /* cpuid faulting */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */ @@ -114,16 +122,20 @@ #define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */ #define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */ #define X86_FEATURE_PDCM (4*32+15) /* Performance Capabilities */ +#define X86_FEATURE_PCID (4*32+17) /* Process Context Identifiers */ #define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */ #define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */ #define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ #define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */ #define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */ #define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER (4*32+24) /* Tsc deadline timer */ #define X86_FEATURE_AES (4*32+25) /* AES instructions */ #define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ #define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */ +#define X86_FEATURE_RDRAND (4*32+30) /* The RDRAND instruction */ #define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ @@ -150,33 +162,103 @@ #define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ #define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ #define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */ +#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */ #define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ #define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ +#define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */ +#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */ +#define X86_FEATURE_TCE (6*32+17) /* translation cache extension */ +#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ +#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */ +#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB (6*32+24) /* NB performance counter extensions */ +#define X86_FEATURE_PERFCTR_L2 (6*32+28) /* L2 performance counter extensions */ /* * Auxiliary flags: Linux defined - For features scattered in various - * CPUID levels like 0x6, 0xA etc + * CPUID levels like 0x6, 0xA etc, word 7 */ #define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */ #define X86_FEATURE_ARAT (7*32+ 1) /* Always Running APIC Timer */ +#define X86_FEATURE_CPB (7*32+ 2) /* AMD Core Performance Boost */ +#define X86_FEATURE_EPB (7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_PLN (7*32+ 5) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (7*32+ 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_DTHERM (7*32+ 7) /* Digital Thermal Sensor */ -/* Virtualization flags: Linux defined */ +/* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */ #define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ #define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ #define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ +#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */ +#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */ +#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */ +#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */ +#define X86_FEATURE_TSCRATEMSR (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */ +#define X86_FEATURE_VMCBCLEAN (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */ +#define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */ +#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */ +#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */ +#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */ + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ +#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ +#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */ +#define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */ +#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */ +#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */ +#define X86_FEATURE_BMI2 (9*32+ 8) /* 2nd group bit manipulation extensions */ +#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */ +#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */ +#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */ +#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */ +#define X86_FEATURE_AVX512PF (9*32+26) /* AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER (9*32+27) /* AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD (9*32+28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */ +#define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */ + +/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) +#include #include -extern const char * const x86_cap_flags[NCAPINTS*32]; +extern const char * const x86_cap_flags[RHNCAPINTS*32]; extern const char * const x86_power_flags[32]; -#define test_cpu_cap(c, bit) \ - test_bit(bit, (unsigned long *)((c)->x86_capability)) + /* start of RH extended capabilities word 9, bit 0 */ +#define RH_EXT_CAP_BIT0 (9*32+0) + +/* + * RHEL: If the bit is in the cpuinfo_x86 struct (ie, is word 8 or less), + * then just use the bit like it always has been. If it is in the new + * extended struct, cpuinfo_x86_rh, then check to see if the cpuinfo_x86 + * struct was the boot_cpu_data struct. If it was the boot_cpu_data + * struct then use boot_cpu_data_rh, o/w use the cpu_data_rh per_cpu data + */ +#define _cpu_cap_fn(func, c, bit) \ + ((bit < RH_EXT_CAP_BIT0) ? \ + func##_bit(bit, (unsigned long *)((c)->x86_capability)) : \ + ((c == &boot_cpu_data) ? \ + func##_bit(bit - RH_EXT_CAP_BIT0, \ + (unsigned long *)boot_cpu_data_rh.x86_capability): \ + func##_bit(bit - RH_EXT_CAP_BIT0, \ + (unsigned long *)cpu_data_rh((c)->cpu_index).x86_capability)\ + ) \ + ) + +#define test_cpu_cap(c, bit) _cpu_cap_fn(test, c, bit) +#define set_cpu_cap(c, bit) _cpu_cap_fn(set, c, bit) +#define clear_cpu_cap(c, bit) _cpu_cap_fn(clear, c, bit) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && \ @@ -187,14 +269,17 @@ extern const char * const x86_power_flag (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \ (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \ (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ - (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \ + (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \ + (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \ + (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \ ? 1 : \ test_cpu_cap(c, bit)) +#define this_cpu_has(bit) \ + cpu_has(¤t_cpu_data, bit) + #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) -#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) -#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) #define setup_clear_cpu_cap(bit) do { \ clear_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_cleared); \ @@ -219,7 +304,9 @@ extern const char * const x86_power_flag #define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) #define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2) #define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3) +#define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3) #define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) +#define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) #define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) #define cpu_has_mp boot_cpu_has(X86_FEATURE_MP) #define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) @@ -247,7 +334,16 @@ extern const char * const x86_power_flag #define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) +#define cpu_has_xsaveopt boot_cpu_has(X86_FEATURE_XSAVEOPT) +#define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) +#define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) +#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) +#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE) +#define cpu_has_perfctr_nb boot_cpu_has(X86_FEATURE_PERFCTR_NB) +#define cpu_has_topoext boot_cpu_has(X86_FEATURE_TOPOEXT) +#define cpu_has_perfctr_l2 boot_cpu_has(X86_FEATURE_PERFCTR_L2) +#define cpu_has_cpuid_faulting boot_cpu_has(X86_FEATURE_CPUID_FAULTING) #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) # define cpu_has_invlpg 1 @@ -277,6 +373,64 @@ extern const char * const x86_power_flag #endif /* CONFIG_X86_64 */ +/* + * Static testing of CPU features. Used the same as boot_cpu_has(). + * These are only valid after alternatives have run, but will statically + * patch the target code for additional performance. + * + */ +static __always_inline __pure bool __static_cpu_has(u16 bit) +{ +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5) + asm goto("1: jmp %l[t_no]\n" + "2:\n" + ".section .altinstructions,\"a\"\n" + _ASM_ALIGN "\n" + _ASM_PTR "1b\n" + _ASM_PTR "0\n" /* no replacement */ + " .word %P0\n" /* feature bit */ + " .byte 2b - 1b\n" /* source len */ + " .byte 0\n" /* replacement len */ + ".previous\n" + /* skipping size check since replacement size = 0 */ + : : "i" (bit) : : t_no); + return true; + t_no: + return false; +#else + u8 flag; + /* Open-coded due to __stringify() in ALTERNATIVE() */ + asm volatile("1: movb $0,%0\n" + "2:\n" + ".section .altinstructions,\"a\"\n" + _ASM_ALIGN "\n" + _ASM_PTR "1b\n" + _ASM_PTR "3f\n" + " .word %P1\n" /* feature bit */ + " .byte 2b - 1b\n" /* source len */ + " .byte 4f - 3f\n" /* replacement len */ + ".previous\n" + ".section .discard,\"aw\",@progbits\n" + " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ + ".previous\n" + ".section .altinstr_replacement,\"ax\"\n" + "3: movb $1,%0\n" + "4:\n" + ".previous\n" + : "=qm" (flag) : "i" (bit)); + return flag; +#endif +} + +#define static_cpu_has(bit) \ +( \ + __builtin_constant_p(boot_cpu_has(bit)) ? \ + boot_cpu_has(bit) : \ + __builtin_constant_p(bit) ? \ + __static_cpu_has(bit) : \ + boot_cpu_has(bit) \ +) + #endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */ #endif /* _ASM_X86_CPUFEATURE_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/cpu.h linux-2.6.32.ovz/arch/x86/include/asm/cpu.h --- linux-2.6.32/arch/x86/include/asm/cpu.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/cpu.h 2015-03-10 13:22:15.000000000 -0700 @@ -32,6 +32,8 @@ extern void arch_unregister_cpu(int); DECLARE_PER_CPU(int, cpu_state); +int mwait_usable(const struct cpuinfo_x86 *); + extern unsigned int boot_cpu_id; #endif /* _ASM_X86_CPU_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/cpumask.h linux-2.6.32.ovz/arch/x86/include/asm/cpumask.h --- linux-2.6.32/arch/x86/include/asm/cpumask.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/cpumask.h 2015-03-10 13:23:48.000000000 -0700 @@ -7,6 +7,7 @@ extern cpumask_var_t cpu_callin_mask; extern cpumask_var_t cpu_callout_mask; extern cpumask_var_t cpu_initialized_mask; extern cpumask_var_t cpu_sibling_setup_mask; +extern cpumask_var_t cpu_may_complete_boot_mask; extern void setup_cpu_local_masks(void); diff -uprN linux-2.6.32/arch/x86/include/asm/crash.h linux-2.6.32.ovz/arch/x86/include/asm/crash.h --- linux-2.6.32/arch/x86/include/asm/crash.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/crash.h 2015-03-10 13:23:21.000000000 -0700 @@ -0,0 +1,88 @@ +#ifndef _ASM_I386_CRASH_H +#define _ASM_I386_CRASH_H + +/* + * linux/include/asm-i386/crash.h + * + * Copyright (c) 2004 Red Hat, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifdef __KERNEL__ + +#include +#include +#include + +#ifdef __i386__ +#include +#include +#endif + +extern int page_is_ram(unsigned long); + +static inline void * +map_virtual(u64 offset, struct page **pp) +{ + struct page *page; + unsigned long pfn; + void *vaddr; + + pfn = (unsigned long)(offset >> PAGE_SHIFT); + + if (!page_is_ram(pfn)) { + printk(KERN_INFO + "crash memory driver: !page_is_ram(pfn: %lx)\n", pfn); + return NULL; + } + + if (!pfn_valid(pfn)) { + printk(KERN_INFO + "crash memory driver: invalid pfn: %lx )\n", pfn); + return NULL; + } + +#ifdef __i386__ + if (xen_pv_domain() && !phys_to_machine_mapping_valid(pfn)) { + printk(KERN_INFO "crash memory driver: " + "no machine mapping for pfn: %lx\n", pfn); + return NULL; + } +#endif + + page = pfn_to_page(pfn); + + vaddr = kmap(page); + if (!vaddr) { + printk(KERN_INFO + "crash memory driver: pfn: %lx kmap(page: %lx) failed\n", + pfn, (unsigned long)page); + return NULL; + } + + *pp = page; + return (vaddr + (offset & (PAGE_SIZE-1))); +} + +static inline void unmap_virtual(struct page *page) +{ + kunmap(page); +} + +#endif /* __KERNEL__ */ + +#endif /* _ASM_I386_CRASH_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/crypto/ablk_helper.h linux-2.6.32.ovz/arch/x86/include/asm/crypto/ablk_helper.h --- linux-2.6.32/arch/x86/include/asm/crypto/ablk_helper.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/crypto/ablk_helper.h 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,31 @@ +/* + * Shared async block cipher helpers + */ + +#ifndef _CRYPTO_ABLK_HELPER_H +#define _CRYPTO_ABLK_HELPER_H + +#include +#include +#include + +struct async_helper_ctx { + struct cryptd_ablkcipher *cryptd_tfm; +}; + +extern int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, + unsigned int key_len); + +extern int __ablk_encrypt(struct ablkcipher_request *req); + +extern int ablk_encrypt(struct ablkcipher_request *req); + +extern int ablk_decrypt(struct ablkcipher_request *req); + +extern void ablk_exit(struct crypto_tfm *tfm); + +extern int ablk_init_common(struct crypto_tfm *tfm, const char *drv_name); + +extern int ablk_init(struct crypto_tfm *tfm); + +#endif /* _CRYPTO_ABLK_HELPER_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/crypto/glue_helper.h linux-2.6.32.ovz/arch/x86/include/asm/crypto/glue_helper.h --- linux-2.6.32/arch/x86/include/asm/crypto/glue_helper.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/crypto/glue_helper.h 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,145 @@ +/* + * Shared glue code for 128bit block ciphers + */ + +#ifndef _CRYPTO_GLUE_HELPER_H +#define _CRYPTO_GLUE_HELPER_H + +#include +#include +#include +#include + +typedef void (*common_glue_func_t)(void *ctx, u8 *dst, const u8 *src); +typedef void (*common_glue_cbc_func_t)(void *ctx, u128 *dst, const u128 *src); +typedef void (*common_glue_ctr_func_t)(void *ctx, u128 *dst, const u128 *src, + le128 *iv); +typedef void (*common_glue_xts_func_t)(void *ctx, u128 *dst, const u128 *src, + le128 *iv); + +#define GLUE_FUNC_CAST(fn) ((common_glue_func_t)(fn)) +#define GLUE_CBC_FUNC_CAST(fn) ((common_glue_cbc_func_t)(fn)) +#define GLUE_CTR_FUNC_CAST(fn) ((common_glue_ctr_func_t)(fn)) +#define GLUE_XTS_FUNC_CAST(fn) ((common_glue_xts_func_t)(fn)) + +struct common_glue_func_entry { + unsigned int num_blocks; /* number of blocks that @fn will process */ + union { + common_glue_func_t ecb; + common_glue_cbc_func_t cbc; + common_glue_ctr_func_t ctr; + common_glue_xts_func_t xts; + } fn_u; +}; + +struct common_glue_ctx { + unsigned int num_funcs; + int fpu_blocks_limit; /* -1 means fpu not needed at all */ + + /* + * First funcs entry must have largest num_blocks and last funcs entry + * must have num_blocks == 1! + */ + struct common_glue_func_entry funcs[]; +}; + +static inline bool glue_fpu_begin(unsigned int bsize, int fpu_blocks_limit, + struct blkcipher_desc *desc, + bool fpu_enabled, unsigned int nbytes) +{ + if (likely(fpu_blocks_limit < 0)) + return false; + + if (fpu_enabled) + return true; + + /* + * Vector-registers are only used when chunk to be processed is large + * enough, so do not enable FPU until it is necessary. + */ + if (nbytes < bsize * (unsigned int)fpu_blocks_limit) + return false; + + if (desc) { + /* prevent sleeping if FPU is in use */ + desc->flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; + } + + kernel_fpu_begin(); + return true; +} + +static inline void glue_fpu_end(bool fpu_enabled) +{ + if (fpu_enabled) + kernel_fpu_end(); +} + +static inline void le128_to_be128(be128 *dst, const le128 *src) +{ + dst->a = cpu_to_be64(le64_to_cpu(src->a)); + dst->b = cpu_to_be64(le64_to_cpu(src->b)); +} + +static inline void be128_to_le128(le128 *dst, const be128 *src) +{ + dst->a = cpu_to_le64(be64_to_cpu(src->a)); + dst->b = cpu_to_le64(be64_to_cpu(src->b)); +} + +static inline void le128_inc(le128 *i) +{ + u64 a = le64_to_cpu(i->a); + u64 b = le64_to_cpu(i->b); + + b++; + if (!b) + a++; + + i->a = cpu_to_le64(a); + i->b = cpu_to_le64(b); +} + +static inline void le128_gf128mul_x_ble(le128 *dst, const le128 *src) +{ + u64 a = le64_to_cpu(src->a); + u64 b = le64_to_cpu(src->b); + u64 _tt = ((s64)a >> 63) & 0x87; + + dst->a = cpu_to_le64((a << 1) ^ (b >> 63)); + dst->b = cpu_to_le64((b << 1) ^ _tt); +} + +extern int glue_ecb_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes); + +extern int glue_cbc_encrypt_128bit(const common_glue_func_t fn, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, + unsigned int nbytes); + +extern int glue_cbc_decrypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, + unsigned int nbytes); + +extern int glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes); + +extern int glue_xts_crypt_128bit(const struct common_glue_ctx *gctx, + struct blkcipher_desc *desc, + struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes, + common_glue_func_t tweak_fn, void *tweak_ctx, + void *crypt_ctx); + +extern void glue_xts_crypt_128bit_one(void *ctx, u128 *dst, const u128 *src, + le128 *iv, common_glue_func_t fn); + +#endif /* _CRYPTO_GLUE_HELPER_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/desc.h linux-2.6.32.ovz/arch/x86/include/asm/desc.h --- linux-2.6.32/arch/x86/include/asm/desc.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/desc.h 2015-03-10 13:24:02.000000000 -0700 @@ -5,6 +5,7 @@ #include #include #include +#include static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info) @@ -93,6 +94,9 @@ static inline int desc_empty(const void #define load_TLS(t, cpu) native_load_tls(t, cpu) #define set_ldt native_set_ldt +#ifdef CONFIG_X86_32 +#define load_user_cs_desc native_load_user_cs_desc +#endif /*CONFIG_X86_32*/ #define write_ldt_entry(dt, entry, desc) \ native_write_ldt_entry(dt, entry, desc) @@ -309,6 +313,19 @@ static inline void set_desc_limit(struct desc->limit = (limit >> 16) & 0xf; } +#ifdef CONFIG_TRACING +extern struct desc_ptr trace_idt_descr; +extern gate_desc trace_idt_table[]; +static inline void write_trace_idt_entry(int entry, const gate_desc *gate) +{ + write_idt_entry(trace_idt_table, entry, gate); +} +#else +static inline void write_trace_idt_entry(int entry, const gate_desc *gate) +{ +} +#endif + static inline void _set_gate(int gate, unsigned type, void *addr, unsigned dpl, unsigned ist, unsigned seg) { @@ -319,6 +336,7 @@ static inline void _set_gate(int gate, u * setup time */ write_idt_entry(idt_table, gate, &s); + write_trace_idt_entry(gate, &s); } /* @@ -347,12 +365,39 @@ static inline void alloc_system_vector(i BUG(); } -static inline void alloc_intr_gate(unsigned int n, void *addr) +#ifdef CONFIG_TRACING +static inline void trace_set_intr_gate(unsigned int gate, void *addr) +{ + gate_desc s; + + pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); + write_idt_entry(trace_idt_table, gate, &s); +} + +static inline void __trace_alloc_intr_gate(unsigned int n, void *addr) +{ + trace_set_intr_gate(n, addr); +} +#else +static inline void trace_set_intr_gate(unsigned int gate, void *addr) +{ +} + +#define __trace_alloc_intr_gate(n, addr) +#endif + +static inline void __alloc_intr_gate(unsigned int n, void *addr) { - alloc_system_vector(n); set_intr_gate(n, addr); } +#define alloc_intr_gate(n, addr) \ + do { \ + alloc_system_vector(n); \ + __alloc_intr_gate(n, addr); \ + __trace_alloc_intr_gate(n, trace_##addr); \ + } while (0) + /* * This routine sets up an interrupt gate at directory privilege level 3. */ @@ -392,4 +437,65 @@ static inline void set_system_intr_gate_ _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS); } +#ifdef CONFIG_X86_32 +static inline void set_user_cs(struct desc_struct *desc, unsigned long limit) +{ + limit = (limit - 1) / PAGE_SIZE; + desc->a = limit & 0xffff; + desc->b = (limit & 0xf0000) | 0x00c0fb00; +} + +static inline void native_load_user_cs_desc(int cpu, struct mm_struct *mm) +{ + get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS] = (mm)->context.user_cs; +} + +#define arch_add_exec_range arch_add_exec_range +#define arch_remove_exec_range arch_remove_exec_range +#define arch_flush_exec_range arch_flush_exec_range +extern void arch_add_exec_range(struct mm_struct *mm, unsigned long limit); +extern void arch_remove_exec_range(struct mm_struct *mm, unsigned long limit); +extern void arch_flush_exec_range(struct mm_struct *mm); +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_TRACING +extern atomic_t trace_idt_ctr; +static inline bool is_trace_idt_enabled(void) +{ + if (atomic_read(&trace_idt_ctr)) + return true; + + return false; +} + +static inline void load_trace_idt(void) +{ + load_idt((const struct desc_ptr *)&trace_idt_descr); +} +#else +static inline bool is_trace_idt_enabled(void) +{ + return false; +} + +static inline void load_trace_idt(void) +{ +} +#endif + + /* + * The load_current_idt() must be called with interrupts disabled + * to avoid races. That way the IDT will always be set back to the expected + * descriptor. It's also called when a CPU is being initialized, and + * that doesn't need to disable interrupts, as nothing should be + * bothering the CPU then. + */ +static inline void load_current_idt(void) +{ + if (is_trace_idt_enabled()) + load_trace_idt(); + else + load_idt((const struct desc_ptr *)&idt_descr); +} + #endif /* _ASM_X86_DESC_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/ds.h linux-2.6.32.ovz/arch/x86/include/asm/ds.h --- linux-2.6.32/arch/x86/include/asm/ds.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/ds.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,302 +0,0 @@ -/* - * Debug Store (DS) support - * - * This provides a low-level interface to the hardware's Debug Store - * feature that is used for branch trace store (BTS) and - * precise-event based sampling (PEBS). - * - * It manages: - * - DS and BTS hardware configuration - * - buffer overflow handling (to be done) - * - buffer access - * - * It does not do: - * - security checking (is the caller allowed to trace the task) - * - buffer allocation (memory accounting) - * - * - * Copyright (C) 2007-2009 Intel Corporation. - * Markus Metzger , 2007-2009 - */ - -#ifndef _ASM_X86_DS_H -#define _ASM_X86_DS_H - - -#include -#include -#include - - -#ifdef CONFIG_X86_DS - -struct task_struct; -struct ds_context; -struct ds_tracer; -struct bts_tracer; -struct pebs_tracer; - -typedef void (*bts_ovfl_callback_t)(struct bts_tracer *); -typedef void (*pebs_ovfl_callback_t)(struct pebs_tracer *); - - -/* - * A list of features plus corresponding macros to talk about them in - * the ds_request function's flags parameter. - * - * We use the enum to index an array of corresponding control bits; - * we use the macro to index a flags bit-vector. - */ -enum ds_feature { - dsf_bts = 0, - dsf_bts_kernel, -#define BTS_KERNEL (1 << dsf_bts_kernel) - /* trace kernel-mode branches */ - - dsf_bts_user, -#define BTS_USER (1 << dsf_bts_user) - /* trace user-mode branches */ - - dsf_bts_overflow, - dsf_bts_max, - dsf_pebs = dsf_bts_max, - - dsf_pebs_max, - dsf_ctl_max = dsf_pebs_max, - dsf_bts_timestamps = dsf_ctl_max, -#define BTS_TIMESTAMPS (1 << dsf_bts_timestamps) - /* add timestamps into BTS trace */ - -#define BTS_USER_FLAGS (BTS_KERNEL | BTS_USER | BTS_TIMESTAMPS) -}; - - -/* - * Request BTS or PEBS - * - * Due to alignement constraints, the actual buffer may be slightly - * smaller than the requested or provided buffer. - * - * Returns a pointer to a tracer structure on success, or - * ERR_PTR(errcode) on failure. - * - * The interrupt threshold is independent from the overflow callback - * to allow users to use their own overflow interrupt handling mechanism. - * - * The function might sleep. - * - * task: the task to request recording for - * cpu: the cpu to request recording for - * base: the base pointer for the (non-pageable) buffer; - * size: the size of the provided buffer in bytes - * ovfl: pointer to a function to be called on buffer overflow; - * NULL if cyclic buffer requested - * th: the interrupt threshold in records from the end of the buffer; - * -1 if no interrupt threshold is requested. - * flags: a bit-mask of the above flags - */ -extern struct bts_tracer *ds_request_bts_task(struct task_struct *task, - void *base, size_t size, - bts_ovfl_callback_t ovfl, - size_t th, unsigned int flags); -extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size, - bts_ovfl_callback_t ovfl, - size_t th, unsigned int flags); -extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, - size_t th, unsigned int flags); -extern struct pebs_tracer *ds_request_pebs_cpu(int cpu, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, - size_t th, unsigned int flags); - -/* - * Release BTS or PEBS resources - * Suspend and resume BTS or PEBS tracing - * - * Must be called with irq's enabled. - * - * tracer: the tracer handle returned from ds_request_~() - */ -extern void ds_release_bts(struct bts_tracer *tracer); -extern void ds_suspend_bts(struct bts_tracer *tracer); -extern void ds_resume_bts(struct bts_tracer *tracer); -extern void ds_release_pebs(struct pebs_tracer *tracer); -extern void ds_suspend_pebs(struct pebs_tracer *tracer); -extern void ds_resume_pebs(struct pebs_tracer *tracer); - -/* - * Release BTS or PEBS resources - * Suspend and resume BTS or PEBS tracing - * - * Cpu tracers must call this on the traced cpu. - * Task tracers must call ds_release_~_noirq() for themselves. - * - * May be called with irq's disabled. - * - * Returns 0 if successful; - * -EPERM if the cpu tracer does not trace the current cpu. - * -EPERM if the task tracer does not trace itself. - * - * tracer: the tracer handle returned from ds_request_~() - */ -extern int ds_release_bts_noirq(struct bts_tracer *tracer); -extern int ds_suspend_bts_noirq(struct bts_tracer *tracer); -extern int ds_resume_bts_noirq(struct bts_tracer *tracer); -extern int ds_release_pebs_noirq(struct pebs_tracer *tracer); -extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer); -extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer); - - -/* - * The raw DS buffer state as it is used for BTS and PEBS recording. - * - * This is the low-level, arch-dependent interface for working - * directly on the raw trace data. - */ -struct ds_trace { - /* the number of bts/pebs records */ - size_t n; - /* the size of a bts/pebs record in bytes */ - size_t size; - /* pointers into the raw buffer: - - to the first entry */ - void *begin; - /* - one beyond the last entry */ - void *end; - /* - one beyond the newest entry */ - void *top; - /* - the interrupt threshold */ - void *ith; - /* flags given on ds_request() */ - unsigned int flags; -}; - -/* - * An arch-independent view on branch trace data. - */ -enum bts_qualifier { - bts_invalid, -#define BTS_INVALID bts_invalid - - bts_branch, -#define BTS_BRANCH bts_branch - - bts_task_arrives, -#define BTS_TASK_ARRIVES bts_task_arrives - - bts_task_departs, -#define BTS_TASK_DEPARTS bts_task_departs - - bts_qual_bit_size = 4, - bts_qual_max = (1 << bts_qual_bit_size), -}; - -struct bts_struct { - __u64 qualifier; - union { - /* BTS_BRANCH */ - struct { - __u64 from; - __u64 to; - } lbr; - /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */ - struct { - __u64 clock; - pid_t pid; - } event; - } variant; -}; - - -/* - * The BTS state. - * - * This gives access to the raw DS state and adds functions to provide - * an arch-independent view of the BTS data. - */ -struct bts_trace { - struct ds_trace ds; - - int (*read)(struct bts_tracer *tracer, const void *at, - struct bts_struct *out); - int (*write)(struct bts_tracer *tracer, const struct bts_struct *in); -}; - - -/* - * The PEBS state. - * - * This gives access to the raw DS state and the PEBS-specific counter - * reset value. - */ -struct pebs_trace { - struct ds_trace ds; - - /* the number of valid counters in the below array */ - unsigned int counters; - -#define MAX_PEBS_COUNTERS 4 - /* the counter reset value */ - unsigned long long counter_reset[MAX_PEBS_COUNTERS]; -}; - - -/* - * Read the BTS or PEBS trace. - * - * Returns a view on the trace collected for the parameter tracer. - * - * The view remains valid as long as the traced task is not running or - * the tracer is suspended. - * Writes into the trace buffer are not reflected. - * - * tracer: the tracer handle returned from ds_request_~() - */ -extern const struct bts_trace *ds_read_bts(struct bts_tracer *tracer); -extern const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer); - - -/* - * Reset the write pointer of the BTS/PEBS buffer. - * - * Returns 0 on success; -Eerrno on error - * - * tracer: the tracer handle returned from ds_request_~() - */ -extern int ds_reset_bts(struct bts_tracer *tracer); -extern int ds_reset_pebs(struct pebs_tracer *tracer); - -/* - * Set the PEBS counter reset value. - * - * Returns 0 on success; -Eerrno on error - * - * tracer: the tracer handle returned from ds_request_pebs() - * counter: the index of the counter - * value: the new counter reset value - */ -extern int ds_set_pebs_reset(struct pebs_tracer *tracer, - unsigned int counter, u64 value); - -/* - * Initialization - */ -struct cpuinfo_x86; -extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); - -/* - * Context switch work - */ -extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); - -#else /* CONFIG_X86_DS */ - -struct cpuinfo_x86; -static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} -static inline void ds_switch_to(struct task_struct *prev, - struct task_struct *next) {} - -#endif /* CONFIG_X86_DS */ -#endif /* _ASM_X86_DS_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/dwarf2.h linux-2.6.32.ovz/arch/x86/include/asm/dwarf2.h --- linux-2.6.32/arch/x86/include/asm/dwarf2.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/dwarf2.h 2015-03-10 13:23:17.000000000 -0700 @@ -34,6 +34,18 @@ #define CFI_SIGNAL_FRAME #endif +#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__) + /* + * Emit CFI data in .debug_frame sections, not .eh_frame sections. + * The latter we currently just discard since we don't do DWARF + * unwinding at runtime. So only the offline DWARF information is + * useful to anyone. Note we should not use this directive if this + * file is used in the vDSO assembly, or if vmlinux.lds.S gets + * changed so it doesn't discard .eh_frame. + */ + .cfi_sections .debug_frame +#endif + #else /* diff -uprN linux-2.6.32/arch/x86/include/asm/e820.h linux-2.6.32.ovz/arch/x86/include/asm/e820.h --- linux-2.6.32/arch/x86/include/asm/e820.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/e820.h 2015-03-10 13:22:50.000000000 -0700 @@ -45,7 +45,12 @@ #define E820_NVS 4 #define E820_UNUSABLE 5 -/* reserved RAM used by kernel itself */ +/* + * reserved RAM used by kernel itself + * if CONFIG_INTEL_TXT is enabled, memory of this type will be + * included in the S3 integrity calculation and so should not include + * any memory that BIOS might alter over the S3 transition + */ #define E820_RESERVED_KERN 128 #ifndef __ASSEMBLY__ @@ -61,6 +66,15 @@ struct e820map { struct e820entry map[E820_X_MAX]; }; +#define ISA_START_ADDRESS 0xa0000 +#define ISA_END_ADDRESS 0x100000 + +#define BIOS_BEGIN 0x000a0000 +#define BIOS_END 0x00100000 + +#define BIOS_ROM_BASE 0xffe00000 +#define BIOS_ROM_END 0xffffffff + #ifdef __KERNEL__ /* see comment in arch/x86/kernel/e820.c */ extern struct e820map e820; @@ -70,6 +84,7 @@ extern unsigned long pci_mem_start; extern int e820_any_mapped(u64 start, u64 end, unsigned type); extern int e820_all_mapped(u64 start, u64 end, unsigned type); extern void e820_add_region(u64 start, u64 size, int type); +extern void e820_saved_add_region(u64 start, u64 size, int type); extern void e820_print_map(char *who); extern int sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, u32 *pnr_map); @@ -126,15 +141,18 @@ extern void e820_reserve_resources(void) extern void e820_reserve_resources_late(void); extern void setup_memory_map(void); extern char *default_machine_specific_memory_setup(void); -#endif /* __KERNEL__ */ -#endif /* __ASSEMBLY__ */ -#define ISA_START_ADDRESS 0xa0000 -#define ISA_END_ADDRESS 0x100000 -#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS) +/* + * Returns true iff the specified range [s,e) is completely contained inside + * the ISA region. + */ +static inline bool is_ISA_range(u64 s, u64 e) +{ + return s >= ISA_START_ADDRESS && e <= ISA_END_ADDRESS; +} -#define BIOS_BEGIN 0x000a0000 -#define BIOS_END 0x00100000 +#endif /* __KERNEL__ */ +#endif /* __ASSEMBLY__ */ #ifdef __KERNEL__ #include diff -uprN linux-2.6.32/arch/x86/include/asm/efi.h linux-2.6.32.ovz/arch/x86/include/asm/efi.h --- linux-2.6.32/arch/x86/include/asm/efi.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/efi.h 2015-03-10 13:24:10.000000000 -0700 @@ -90,11 +90,17 @@ extern void __iomem *efi_ioremap(unsigne #endif /* CONFIG_X86_32 */ extern int add_efi_memmap; +extern void efi_set_executable(efi_memory_desc_t *md, bool executable); extern void efi_reserve_early(void); extern void efi_call_phys_prelog(void); extern void efi_call_phys_epilog(void); +extern void efi_call_phys_prelog_in_physmode(void); +extern void efi_call_phys_epilog_in_physmode(void); +extern void efi_pagetable_init(void); -#ifndef CONFIG_EFI +#ifdef CONFIG_EFI +extern struct console early_efi_console; +#else /* * IF EFI is not configured, have the EFI calls return -ENOSYS. */ diff -uprN linux-2.6.32/arch/x86/include/asm/elf.h linux-2.6.32.ovz/arch/x86/include/asm/elf.h --- linux-2.6.32/arch/x86/include/asm/elf.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/elf.h 2015-06-23 04:16:16.000000000 -0700 @@ -4,6 +4,7 @@ /* * ELF register definitions.. */ +#include #include #include @@ -197,14 +198,8 @@ do { \ set_fs(USER_DS); \ } while (0) -#define COMPAT_SET_PERSONALITY(ex) \ -do { \ - if (test_thread_flag(TIF_IA32)) \ - clear_thread_flag(TIF_ABI_PENDING); \ - else \ - set_thread_flag(TIF_ABI_PENDING); \ - current->personality |= force_personality32; \ -} while (0) +void set_personality_ia32(void); +#define COMPAT_SET_PERSONALITY(ex) set_personality_ia32() #define COMPAT_ELF_PLATFORM ("i686") @@ -338,12 +333,47 @@ struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp); + int uses_interp, + unsigned long map_address); +extern int arch_setup_additional_pages_rhel5(struct linux_binprm *bprm, + int uses_interp, + unsigned long map_address); -extern int syscall32_setup_pages(struct linux_binprm *, int exstack); +extern int syscall32_setup_pages(struct linux_binprm *, int exstack, + unsigned long map_address); #define compat_arch_setup_additional_pages syscall32_setup_pages extern unsigned long arch_randomize_brk(struct mm_struct *mm); #define arch_randomize_brk arch_randomize_brk +/* + * True on X86_32 or when emulating IA32 on X86_64 + */ +static inline int mmap_is_ia32(void) +{ +#ifdef CONFIG_X86_32 + return 1; +#endif +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) + return 1; +#endif + return 0; +} + +/* The first two values are special, do not change. See align_addr() */ +enum align_flags { + ALIGN_VA_32 = BIT(0), + ALIGN_VA_64 = BIT(1), + ALIGN_VDSO = BIT(2), + ALIGN_TOPDOWN = BIT(3), +}; + +struct va_alignment { + int flags; + unsigned long mask; +} ____cacheline_aligned; + +extern struct va_alignment va_align; +extern unsigned long align_addr(unsigned long, struct file *, enum align_flags); #endif /* _ASM_X86_ELF_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/entry_arch.h linux-2.6.32.ovz/arch/x86/include/asm/entry_arch.h --- linux-2.6.32/arch/x86/include/asm/entry_arch.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/entry_arch.h 2015-06-23 04:16:16.000000000 -0700 @@ -13,8 +13,9 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) -BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) -BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) +BUILD_INTERRUPT3(irq_move_cleanup_interrupt, IRQ_MOVE_CLEANUP_VECTOR, + smp_irq_move_cleanup_interrupt) +BUILD_INTERRUPT3(reboot_interrupt, REBOOT_VECTOR, smp_reboot_interrupt) BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0, smp_invalidate_interrupt) @@ -34,7 +35,9 @@ BUILD_INTERRUPT3(invalidate_interrupt7,I smp_invalidate_interrupt) #endif -BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR) +BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) +BUILD_INTERRUPT(monitor_ipi, MONITOR_IPI_VECTOR) +BUILD_INTERRUPT(monitor_posted_interrupt, MONITOR_POSTED_INTERRUPT_VECTOR) /* * every pentium local APIC has two 'local interrupts', with a @@ -49,8 +52,8 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOC BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) -#ifdef CONFIG_PERF_EVENTS -BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) +#ifdef CONFIG_IRQ_WORK +BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR) #endif #ifdef CONFIG_X86_THERMAL_VECTOR @@ -62,7 +65,7 @@ BUILD_INTERRUPT(threshold_interrupt,THRE #endif #ifdef CONFIG_X86_MCE -BUILD_INTERRUPT(mce_self_interrupt,MCE_SELF_VECTOR) +BUILD_INTERRUPT3(mce_self_interrupt, MCE_SELF_VECTOR, smp_mce_self_interrupt) #endif #endif diff -uprN linux-2.6.32/arch/x86/include/asm/fb.h linux-2.6.32.ovz/arch/x86/include/asm/fb.h --- linux-2.6.32/arch/x86/include/asm/fb.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/fb.h 2015-03-10 13:21:09.000000000 -0700 @@ -12,10 +12,6 @@ static inline void fb_pgprotect(struct f pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; } -#ifdef CONFIG_X86_32 extern int fb_is_primary_device(struct fb_info *info); -#else -static inline int fb_is_primary_device(struct fb_info *info) { return 0; } -#endif #endif /* _ASM_X86_FB_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/fixmap.h linux-2.6.32.ovz/arch/x86/include/asm/fixmap.h --- linux-2.6.32/arch/x86/include/asm/fixmap.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/fixmap.h 2015-03-10 13:21:23.000000000 -0700 @@ -82,6 +82,9 @@ enum fixed_addresses { #endif FIX_DBGP_BASE, FIX_EARLYCON_MEM_BASE, +#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT + FIX_OHCI1394_BASE, +#endif #ifdef CONFIG_X86_LOCAL_APIC FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ #endif @@ -126,9 +129,6 @@ enum fixed_addresses { FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - (__end_of_permanent_fixed_addresses & 255), FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, -#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT - FIX_OHCI1394_BASE, -#endif #ifdef CONFIG_X86_32 FIX_WP_TEST, #endif diff -uprN linux-2.6.32/arch/x86/include/asm/ftrace.h linux-2.6.32.ovz/arch/x86/include/asm/ftrace.h --- linux-2.6.32/arch/x86/include/asm/ftrace.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/ftrace.h 2015-03-10 13:23:55.000000000 -0700 @@ -55,4 +55,23 @@ struct dyn_arch_ftrace { #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ + +#if !defined(__ASSEMBLY__) && !defined(COMPILE_OFFSETS) + +#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_IA32_EMULATION) + +/* + * Because ia32 syscalls do not map to x86_64 syscall numbers + * this screws up the trace output when tracing a ia32 task. + * Instead of reporting bogus syscalls, just do not trace them. + * + * If the user realy wants these, then they should use the + * raw syscall tracepoints with filtering. + */ +#define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS 1 +extern bool arch_trace_is_compat_syscall(struct pt_regs *regs); + +#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_IA32_EMULATION */ +#endif /* !__ASSEMBLY__ && !COMPILE_OFFSETS */ + #endif /* _ASM_X86_FTRACE_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/gart.h linux-2.6.32.ovz/arch/x86/include/asm/gart.h --- linux-2.6.32/arch/x86/include/asm/gart.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/gart.h 2015-03-10 13:22:21.000000000 -0700 @@ -17,6 +17,7 @@ extern int fix_aperture; #define GARTEN (1<<0) #define DISGARTCPU (1<<4) #define DISGARTIO (1<<5) +#define DISTLBWALKPRB (1<<6) /* GART cache control register bits. */ #define INVGART (1<<0) @@ -64,21 +65,34 @@ static inline void gart_iommu_hole_init( extern int agp_amd64_init(void); +static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order) +{ + u32 ctl; + + /* + * Don't enable translation but enable GART IO and CPU accesses. + * Also, set DISTLBWALKPRB since GART tables memory is UC. + */ + ctl = order << 1; + + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); +} + static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) { u32 tmp, ctl; - /* address of the mappings table */ - addr >>= 12; - tmp = (u32) addr<<4; - tmp &= ~0xf; - pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp); - - /* Enable GART translation for this hammer. */ - pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); - ctl |= GARTEN; - ctl &= ~(DISGARTCPU | DISGARTIO); - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); + /* address of the mappings table */ + addr >>= 12; + tmp = (u32) addr<<4; + tmp &= ~0xf; + pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp); + + /* Enable GART translation for this hammer. */ + pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); + ctl |= GARTEN | DISTLBWALKPRB; + ctl &= ~(DISGARTCPU | DISGARTIO); + pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); } static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size) diff -uprN linux-2.6.32/arch/x86/include/asm/hardirq.h linux-2.6.32.ovz/arch/x86/include/asm/hardirq.h --- linux-2.6.32/arch/x86/include/asm/hardirq.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/hardirq.h 2015-03-10 13:23:07.000000000 -0700 @@ -12,9 +12,13 @@ typedef struct { unsigned int apic_timer_irqs; /* arch dependent */ unsigned int irq_spurious_count; #endif - unsigned int generic_irqs; /* arch dependent */ + unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; +#ifndef __GENKSYMS__ + unsigned int apic_irq_work_irqs; +#else unsigned int apic_pending_irqs; +#endif #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; @@ -35,7 +39,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpust #define __ARCH_IRQ_STAT -#define inc_irq_stat(member) percpu_add(irq_stat.member, 1) +#define inc_irq_stat(member) percpu_inc(irq_stat.member) #define local_softirq_pending() percpu_read(irq_stat.__softirq_pending) diff -uprN linux-2.6.32/arch/x86/include/asm/highmem.h linux-2.6.32.ovz/arch/x86/include/asm/highmem.h --- linux-2.6.32/arch/x86/include/asm/highmem.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/highmem.h 2015-03-10 13:21:16.000000000 -0700 @@ -66,10 +66,6 @@ void *kmap_atomic_pfn(unsigned long pfn, void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); struct page *kmap_atomic_to_page(void *ptr); -#ifndef CONFIG_PARAVIRT -#define kmap_atomic_pte(page, type) kmap_atomic(page, type) -#endif - #define flush_cache_kmaps() do { } while (0) extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, diff -uprN linux-2.6.32/arch/x86/include/asm/hpet.h linux-2.6.32.ovz/arch/x86/include/asm/hpet.h --- linux-2.6.32/arch/x86/include/asm/hpet.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/hpet.h 2015-03-10 13:22:50.000000000 -0700 @@ -65,7 +65,9 @@ /* hpet memory map physical address */ extern unsigned long hpet_address; extern unsigned long force_hpet_address; +extern u8 hpet_blockid; extern int hpet_force_user; +extern u8 hpet_msi_disable; extern int is_hpet_enabled(void); extern int hpet_enable(void); extern void hpet_disable(void); @@ -78,9 +80,9 @@ extern void hpet_msi_write(unsigned int extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg); #ifdef CONFIG_PCI_MSI -extern int arch_setup_hpet_msi(unsigned int irq); +extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id); #else -static inline int arch_setup_hpet_msi(unsigned int irq) +static inline int arch_setup_hpet_msi(unsigned int irq, unsigned int id) { return -EINVAL; } diff -uprN linux-2.6.32/arch/x86/include/asm/hugetlb.h linux-2.6.32.ovz/arch/x86/include/asm/hugetlb.h --- linux-2.6.32/arch/x86/include/asm/hugetlb.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/hugetlb.h 2015-03-10 13:24:08.000000000 -0700 @@ -2,6 +2,7 @@ #define _ASM_X86_HUGETLB_H #include +#include static inline int is_hugepage_only_range(struct mm_struct *mm, @@ -39,18 +40,21 @@ static inline void hugetlb_free_pgd_rang static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { + mm_track_pmd((pmd_t *)ptep); set_pte_at(mm, addr, ptep, pte); } static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { + mm_track_pmd((pmd_t *)ptep); return ptep_get_and_clear(mm, addr, ptep); } static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { + ptep_clear_flush(vma, addr, ptep); } static inline int huge_pte_none(pte_t pte) diff -uprN linux-2.6.32/arch/x86/include/asm/hw_irq.h linux-2.6.32.ovz/arch/x86/include/asm/hw_irq.h --- linux-2.6.32/arch/x86/include/asm/hw_irq.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/hw_irq.h 2015-06-23 04:16:16.000000000 -0700 @@ -27,9 +27,11 @@ /* Interrupt handlers registered during init_IRQ */ extern void apic_timer_interrupt(void); -extern void generic_interrupt(void); +extern void x86_platform_ipi(void); extern void error_interrupt(void); -extern void perf_pending_interrupt(void); +extern void irq_work_interrupt(void); +extern void monitor_ipi(void); +extern void monitor_posted_interrupt(void); extern void spurious_interrupt(void); extern void thermal_interrupt(void); @@ -53,12 +55,36 @@ extern void threshold_interrupt(void); extern void call_function_interrupt(void); extern void call_function_single_interrupt(void); -/* PIC specific functions */ -extern void disable_8259A_irq(unsigned int irq); -extern void enable_8259A_irq(unsigned int irq); -extern int i8259A_irq_pending(unsigned int irq); -extern void make_8259A_irq(unsigned int irq); -extern void init_8259A(int aeoi); +#ifdef CONFIG_TRACING +/* Interrupt handlers registered during init_IRQ */ +extern void trace_apic_timer_interrupt(void); +extern void trace_x86_platform_ipi(void); +extern void trace_error_interrupt(void); +extern void trace_irq_work_interrupt(void); +extern void trace_monitor_ipi(void); +extern void trace_monitor_posted_interrupt(void); + +extern void trace_spurious_interrupt(void); +extern void trace_thermal_interrupt(void); +extern void trace_reschedule_interrupt(void); +#define trace_mce_self_interrupt mce_self_interrupt + +#define trace_invalidate_interrupt0 invalidate_interrupt0 +#define trace_invalidate_interrupt1 invalidate_interrupt1 +#define trace_invalidate_interrupt2 invalidate_interrupt2 +#define trace_invalidate_interrupt3 invalidate_interrupt3 +#define trace_invalidate_interrupt4 invalidate_interrupt4 +#define trace_invalidate_interrupt5 invalidate_interrupt5 +#define trace_invalidate_interrupt6 invalidate_interrupt6 +#define trace_invalidate_interrupt7 invalidate_interrupt7 + +#define trace_irq_move_cleanup_interrupt irq_move_cleanup_interrupt +#define trace_reboot_interrupt reboot_interrupt +extern void trace_threshold_interrupt(void); + +extern void trace_call_function_interrupt(void); +extern void trace_call_function_single_interrupt(void); +#endif /* CONFIG_TRACING */ /* IOAPIC */ #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs)) @@ -66,7 +92,7 @@ extern unsigned long io_apic_irqs; extern void init_VISWS_APIC_irqs(void); extern void setup_IO_APIC(void); -extern void disable_IO_APIC(void); +extern void disable_IO_APIC(int); struct io_apic_irq_attr { int ioapic; @@ -79,14 +105,33 @@ static inline void set_io_apic_irq_attr( int ioapic, int ioapic_pin, int trigger, int polarity) { - irq_attr->ioapic = ioapic; - irq_attr->ioapic_pin = ioapic_pin; - irq_attr->trigger = trigger; - irq_attr->polarity = polarity; + irq_attr->ioapic = ioapic; + irq_attr->ioapic_pin = ioapic_pin; + irq_attr->trigger = trigger; + irq_attr->polarity = polarity; } -extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, - struct io_apic_irq_attr *irq_attr); +/* + * This is performance-critical, we want to do it O(1) + * + * Most irqs are mapped 1:1 with pins. + */ +struct irq_cfg { + struct irq_pin_list *irq_2_pin; + cpumask_var_t domain; + cpumask_var_t old_domain; + u8 vector; + u8 move_in_progress : 1; +}; + +extern struct irq_cfg *irq_cfg(unsigned int); +extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); +extern void send_cleanup_vector(struct irq_cfg *); + +struct irq_desc; +extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *, + unsigned int *dest_id); +extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr); extern void setup_ioapic_dest(void); extern void enable_IO_APIC(void); @@ -101,8 +146,10 @@ extern void eisa_set_level_irq(unsigned /* SMP */ extern void smp_apic_timer_interrupt(struct pt_regs *); extern void smp_spurious_interrupt(struct pt_regs *); -extern void smp_generic_interrupt(struct pt_regs *); +extern void smp_x86_platform_ipi(struct pt_regs *); extern void smp_error_interrupt(struct pt_regs *); +extern void smp_monitor_ipi(struct pt_regs *); +extern void smp_monitor_posted_interrupt(struct pt_regs *); #ifdef CONFIG_X86_IO_APIC extern asmlinkage void smp_irq_move_cleanup_interrupt(void); #endif diff -uprN linux-2.6.32/arch/x86/include/asm/hyperv.h linux-2.6.32.ovz/arch/x86/include/asm/hyperv.h --- linux-2.6.32/arch/x86/include/asm/hyperv.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/hyperv.h 2015-03-10 13:24:12.000000000 -0700 @@ -0,0 +1,213 @@ +#ifndef _ASM_X86_HYPERV_H +#define _ASM_X86_HYPERV_H + +#include + +/* + * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent + * is set by CPUID(HvCpuIdFunctionVersionAndFeatures). + */ +#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000 +#define HYPERV_CPUID_INTERFACE 0x40000001 +#define HYPERV_CPUID_VERSION 0x40000002 +#define HYPERV_CPUID_FEATURES 0x40000003 +#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004 +#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005 + +#define HYPERV_HYPERVISOR_PRESENT_BIT 0x80000000 +#define HYPERV_CPUID_MIN 0x40000005 +#define HYPERV_CPUID_MAX 0x4000ffff + +/* + * Feature identification. EAX indicates which features are available + * to the partition based upon the current partition privileges. + */ + +/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ +#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0) +/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ +#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1) + +/* + * There is a single feature flag that signifies the presence of the MSR + * that can be used to retrieve both the local APIC Timer frequency as + * well as the TSC frequency. + */ + +/* Local APIC timer frequency MSR (HV_X64_MSR_APIC_FREQUENCY) is available */ +#define HV_X64_MSR_APIC_FREQUENCY_AVAILABLE (1 << 11) + +/* TSC frequency MSR (HV_X64_MSR_TSC_FREQUENCY) is available */ +#define HV_X64_MSR_TSC_FREQUENCY_AVAILABLE (1 << 11) + +/* + * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM + * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available + */ +#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2) +/* + * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through + * HV_X64_MSR_STIMER3_COUNT) available + */ +#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3) +/* + * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) + * are available + */ +#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4) +/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ +#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5) +/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ +#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6) +/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ +#define HV_X64_MSR_RESET_AVAILABLE (1 << 7) + /* + * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, + * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, + * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available + */ +#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8) + +/* + * Feature identification: EBX indicates which flags were specified at + * partition creation. The format is the same as the partition creation + * flag structure defined in section Partition Creation Flags. + */ +#define HV_X64_CREATE_PARTITIONS (1 << 0) +#define HV_X64_ACCESS_PARTITION_ID (1 << 1) +#define HV_X64_ACCESS_MEMORY_POOL (1 << 2) +#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3) +#define HV_X64_POST_MESSAGES (1 << 4) +#define HV_X64_SIGNAL_EVENTS (1 << 5) +#define HV_X64_CREATE_PORT (1 << 6) +#define HV_X64_CONNECT_PORT (1 << 7) +#define HV_X64_ACCESS_STATS (1 << 8) +#define HV_X64_DEBUGGING (1 << 11) +#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12) +#define HV_X64_CONFIGURE_PROFILER (1 << 13) + +/* + * Feature identification. EDX indicates which miscellaneous features + * are available to the partition. + */ +/* The MWAIT instruction is available (per section MONITOR / MWAIT) */ +#define HV_X64_MWAIT_AVAILABLE (1 << 0) +/* Guest debugging support is available */ +#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1) +/* Performance Monitor support is available*/ +#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2) +/* Support for physical CPU dynamic partitioning events is available*/ +#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3) +/* + * Support for passing hypercall input parameter block via XMM + * registers is available + */ +#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4) +/* Support for a virtual guest idle state is available */ +#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5) + +/* + * Implementation recommendations. Indicates which behaviors the hypervisor + * recommends the OS implement for optimal performance. + */ + /* + * Recommend using hypercall for address space switches rather + * than MOV to CR3 instruction + */ +#define HV_X64_MWAIT_RECOMMENDED (1 << 0) +/* Recommend using hypercall for local TLB flushes rather + * than INVLPG or MOV to CR3 instructions */ +#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1) +/* + * Recommend using hypercall for remote TLB flushes rather + * than inter-processor interrupts + */ +#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2) +/* + * Recommend using MSRs for accessing APIC registers + * EOI, ICR and TPR rather than their memory-mapped counterparts + */ +#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3) +/* Recommend using the hypervisor-provided MSR to initiate a system RESET */ +#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4) +/* + * Recommend using relaxed timing for this partition. If used, + * the VM should disable any watchdog timeouts that rely on the + * timely delivery of external interrupts + */ +#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) + +/* MSR used to identify the guest OS. */ +#define HV_X64_MSR_GUEST_OS_ID 0x40000000 + +/* MSR used to setup pages used to communicate with the hypervisor. */ +#define HV_X64_MSR_HYPERCALL 0x40000001 + +/* MSR used to provide vcpu index */ +#define HV_X64_MSR_VP_INDEX 0x40000002 + +/* MSR used to read the per-partition time reference counter */ +#define HV_X64_MSR_TIME_REF_COUNT 0x40000020 + +/* MSR used to retrieve the TSC frequency */ +#define HV_X64_MSR_TSC_FREQUENCY 0x40000022 + +/* MSR used to retrieve the local APIC timer frequency */ +#define HV_X64_MSR_APIC_FREQUENCY 0x40000023 + +/* Define the virtual APIC registers */ +#define HV_X64_MSR_EOI 0x40000070 +#define HV_X64_MSR_ICR 0x40000071 +#define HV_X64_MSR_TPR 0x40000072 +#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073 + +/* Define synthetic interrupt controller model specific registers. */ +#define HV_X64_MSR_SCONTROL 0x40000080 +#define HV_X64_MSR_SVERSION 0x40000081 +#define HV_X64_MSR_SIEFP 0x40000082 +#define HV_X64_MSR_SIMP 0x40000083 +#define HV_X64_MSR_EOM 0x40000084 +#define HV_X64_MSR_SINT0 0x40000090 +#define HV_X64_MSR_SINT1 0x40000091 +#define HV_X64_MSR_SINT2 0x40000092 +#define HV_X64_MSR_SINT3 0x40000093 +#define HV_X64_MSR_SINT4 0x40000094 +#define HV_X64_MSR_SINT5 0x40000095 +#define HV_X64_MSR_SINT6 0x40000096 +#define HV_X64_MSR_SINT7 0x40000097 +#define HV_X64_MSR_SINT8 0x40000098 +#define HV_X64_MSR_SINT9 0x40000099 +#define HV_X64_MSR_SINT10 0x4000009A +#define HV_X64_MSR_SINT11 0x4000009B +#define HV_X64_MSR_SINT12 0x4000009C +#define HV_X64_MSR_SINT13 0x4000009D +#define HV_X64_MSR_SINT14 0x4000009E +#define HV_X64_MSR_SINT15 0x4000009F + + +#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) + +/* Declare the various hypercall operations. */ +#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008 + +#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12 +#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \ + (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1)) + +#define HV_PROCESSOR_POWER_STATE_C0 0 +#define HV_PROCESSOR_POWER_STATE_C1 1 +#define HV_PROCESSOR_POWER_STATE_C2 2 +#define HV_PROCESSOR_POWER_STATE_C3 3 + +/* hypercall status code */ +#define HV_STATUS_SUCCESS 0 +#define HV_STATUS_INVALID_HYPERCALL_CODE 2 +#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 +#define HV_STATUS_INVALID_ALIGNMENT 4 +#define HV_STATUS_INSUFFICIENT_BUFFERS 19 + +#endif diff -uprN linux-2.6.32/arch/x86/include/asm/hypervisor.h linux-2.6.32.ovz/arch/x86/include/asm/hypervisor.h --- linux-2.6.32/arch/x86/include/asm/hypervisor.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/hypervisor.h 2015-03-10 13:23:54.000000000 -0700 @@ -17,10 +17,40 @@ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * */ -#ifndef ASM_X86__HYPERVISOR_H -#define ASM_X86__HYPERVISOR_H +#ifndef _ASM_X86_HYPERVISOR_H +#define _ASM_X86_HYPERVISOR_H + +#include extern void init_hypervisor(struct cpuinfo_x86 *c); extern void init_hypervisor_platform(void); +extern bool hypervisor_x2apic_available(void); + +/* + * x86 hypervisor information + */ +struct hypervisor_x86 { + /* Hypervisor name */ + const char *name; + + /* Detection routine */ + bool (*detect)(void); + + /* Adjust CPU feature bits (run once per CPU) */ + void (*set_cpu_features)(struct cpuinfo_x86 *); + + /* Platform setup (run once per boot) */ + void (*init_platform)(void); + + /* X2APIC detection (run once per boot) */ + bool (*x2apic_available)(void); +}; + +extern const struct hypervisor_x86 *x86_hyper; + +/* Recognized hypervisors */ +extern const struct hypervisor_x86 x86_hyper_vmware; +extern const struct hypervisor_x86 x86_hyper_ms_hyperv; +extern const struct hypervisor_x86 x86_hyper_kvm; #endif diff -uprN linux-2.6.32/arch/x86/include/asm/i387.h linux-2.6.32.ovz/arch/x86/include/asm/i387.h --- linux-2.6.32/arch/x86/include/asm/i387.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/i387.h 2015-03-10 13:22:38.000000000 -0700 @@ -10,11 +10,15 @@ #ifndef _ASM_X86_I387_H #define _ASM_X86_I387_H +#ifndef __ASSEMBLY__ + #include #include #include #include +#include #include +#include #include #include #include @@ -31,8 +35,16 @@ extern void init_thread_xstate(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); extern user_regset_active_fn fpregs_active, xfpregs_active; -extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get; -extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set; +extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, + xstateregs_get; +extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set, + xstateregs_set; + +/* + * xstateregs_active == fpregs_active. Please refer to the comment + * at the definition of fpregs_active. + */ +#define xstateregs_active fpregs_active extern struct _fpx_sw_bytes fx_sw_reserved; #ifdef CONFIG_IA32_EMULATION @@ -46,6 +58,25 @@ extern int restore_i387_xstate_ia32(void #define X87_FSW_ES (1 << 7) /* Exception Summary */ +static __always_inline __pure bool use_xsaveopt(void) +{ + return static_cpu_has(X86_FEATURE_XSAVEOPT); +} + +static __always_inline __pure bool use_xsave(void) +{ + return static_cpu_has(X86_FEATURE_XSAVE); +} + +extern void __sanitize_i387_state(struct task_struct *); + +static inline void sanitize_i387_state(struct task_struct *tsk) +{ + if (!use_xsaveopt()) + return; + __sanitize_i387_state(tsk); +} + #ifdef CONFIG_X86_64 /* Ignore delayed exceptions from user space */ @@ -81,15 +112,15 @@ static inline int fxrstor_checking(struc values. The kernel data segment can be sometimes 0 and sometimes new user value. Both should be ok. Use the PDA as safe address because it should be already in L1. */ -static inline void clear_fpu_state(struct task_struct *tsk) +static inline void fpu_clear(struct fpu *fpu) { - struct xsave_struct *xstate = &tsk->thread.xstate->xsave; - struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; + struct xsave_struct *xstate = &fpu->state->xsave; + struct i387_fxsave_struct *fx = &fpu->state->fxsave; /* * xsave header may indicate the init state of the FP. */ - if ((task_thread_info(tsk)->status & TS_XSAVE) && + if (use_xsave() && !(xstate->xsave_hdr.xstate_bv & XSTATE_FP)) return; @@ -101,6 +132,11 @@ static inline void clear_fpu_state(struc X86_FEATURE_FXSAVE_LEAK); } +static inline void clear_fpu_state(struct task_struct *tsk) +{ + fpu_clear((struct fpu *)&tsk->thread.xstate); +} + static inline int fxsave_user(struct i387_fxsave_struct __user *fx) { int err; @@ -125,7 +161,7 @@ static inline int fxsave_user(struct i38 return err; } -static inline void fxsave(struct task_struct *tsk) +static inline void fpu_fxsave(struct fpu *fpu) { /* Using "rex64; fxsave %0" is broken because, if the memory operand uses any extended registers for addressing, a second REX prefix @@ -135,42 +171,45 @@ static inline void fxsave(struct task_st /* Using "fxsaveq %0" would be the ideal choice, but is only supported starting with gas 2.16. */ __asm__ __volatile__("fxsaveq %0" - : "=m" (tsk->thread.xstate->fxsave)); + : "=m" (fpu->state->fxsave)); #elif 0 /* Using, as a workaround, the properly prefixed form below isn't accepted by any binutils version so far released, complaining that the same type of prefix is used twice if an extended register is needed for addressing (fix submitted to mainline 2005-11-21). */ __asm__ __volatile__("rex64/fxsave %0" - : "=m" (tsk->thread.xstate->fxsave)); + : "=m" (fpu->state->fxsave)); #else /* This, however, we can work around by forcing the compiler to select an addressing mode that doesn't require extended registers. */ __asm__ __volatile__("rex64/fxsave (%1)" - : "=m" (tsk->thread.xstate->fxsave) - : "cdaSDb" (&tsk->thread.xstate->fxsave)); + : "=m" (fpu->state->fxsave) + : "cdaSDb" (&fpu->state->fxsave)); #endif } -static inline void __save_init_fpu(struct task_struct *tsk) +static inline void fpu_save_init(struct fpu *fpu) { - if (task_thread_info(tsk)->status & TS_XSAVE) - xsave(tsk); + if (use_xsave()) + fpu_xsave(fpu); else - fxsave(tsk); + fpu_fxsave(fpu); + + fpu_clear(fpu); +} - clear_fpu_state(tsk); +static inline void __save_init_fpu(struct task_struct *tsk) +{ + fpu_save_init((struct fpu *)&tsk->thread.xstate); task_thread_info(tsk)->status &= ~TS_USEDFPU; } #else /* CONFIG_X86_32 */ #ifdef CONFIG_MATH_EMULATION -extern void finit_task(struct task_struct *tsk); +extern void finit_soft_fpu(struct i387_soft_struct *soft); #else -static inline void finit_task(struct task_struct *tsk) -{ -} +static inline void finit_soft_fpu(struct i387_soft_struct *soft) {} #endif static inline void tolerant_fwait(void) @@ -206,13 +245,13 @@ static inline int fxrstor_checking(struc /* * These must be called with preempt disabled */ -static inline void __save_init_fpu(struct task_struct *tsk) +static inline void fpu_save_init(struct fpu *fpu) { - if (task_thread_info(tsk)->status & TS_XSAVE) { - struct xsave_struct *xstate = &tsk->thread.xstate->xsave; - struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; + if (use_xsave()) { + struct xsave_struct *xstate = &fpu->state->xsave; + struct i387_fxsave_struct *fx = &fpu->state->fxsave; - xsave(tsk); + fpu_xsave(fpu); /* * xsave header may indicate the init state of the FP. @@ -236,8 +275,8 @@ static inline void __save_init_fpu(struc "fxsave %[fx]\n" "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:", X86_FEATURE_FXSR, - [fx] "m" (tsk->thread.xstate->fxsave), - [fsw] "m" (tsk->thread.xstate->fxsave.swd) : "memory"); + [fx] "m" (fpu->state->fxsave), + [fsw] "m" (fpu->state->fxsave.swd) : "memory"); clear_state: /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. Clear the x87 state here by setting it to fixed @@ -249,17 +288,34 @@ clear_state: X86_FEATURE_FXSAVE_LEAK, [addr] "m" (safe_address)); end: + ; +} + +static inline void __save_init_fpu(struct task_struct *tsk) +{ + fpu_save_init((struct fpu *)&tsk->thread.xstate); task_thread_info(tsk)->status &= ~TS_USEDFPU; } + #endif /* CONFIG_X86_64 */ -static inline int restore_fpu_checking(struct task_struct *tsk) +static inline int fpu_fxrstor_checking(struct fpu *fpu) { - if (task_thread_info(tsk)->status & TS_XSAVE) - return xrstor_checking(&tsk->thread.xstate->xsave); + return fxrstor_checking(&fpu->state->fxsave); +} + +static inline int fpu_restore_checking(struct fpu *fpu) +{ + if (use_xsave()) + return fpu_xrstor_checking(fpu); else - return fxrstor_checking(&tsk->thread.xstate->fxsave); + return fpu_fxrstor_checking(fpu); +} + +static inline int restore_fpu_checking(struct task_struct *tsk) +{ + return fpu_restore_checking((struct fpu *)&tsk->thread.xstate); } /* @@ -411,4 +467,40 @@ static inline unsigned short get_fpu_mxc } } +static bool fpu_allocated(struct fpu *fpu) +{ + return fpu->state != NULL; +} + +static inline int fpu_alloc(struct fpu *fpu) +{ + if (fpu_allocated(fpu)) + return 0; + fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL); + if (!fpu->state) + return -ENOMEM; + WARN_ON((unsigned long)fpu->state & 15); + return 0; +} + +static inline void fpu_free(struct fpu *fpu) +{ + if (fpu->state) { + kmem_cache_free(task_xstate_cachep, fpu->state); + fpu->state = NULL; + } +} + +static inline void fpu_copy(struct fpu *dst, struct fpu *src) +{ + memcpy(dst->state, src->state, xstate_size); +} + +extern void fpu_finit(struct fpu *fpu); + +#endif /* __ASSEMBLY__ */ + +#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5 +#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5 + #endif /* _ASM_X86_I387_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/i8259.h linux-2.6.32.ovz/arch/x86/include/asm/i8259.h --- linux-2.6.32/arch/x86/include/asm/i8259.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/i8259.h 2015-03-10 13:24:12.000000000 -0700 @@ -26,11 +26,6 @@ extern unsigned int cached_irq_mask; extern spinlock_t i8259A_lock; -extern void init_8259A(int auto_eoi); -extern void enable_8259A_irq(unsigned int irq); -extern void disable_8259A_irq(unsigned int irq); -extern unsigned int startup_8259A_irq(unsigned int irq); - /* the PIC may need a careful delay on some platforms, hence specific calls */ static inline unsigned char inb_pic(unsigned int port) { @@ -57,7 +52,18 @@ static inline void outb_pic(unsigned cha extern struct irq_chip i8259A_chip; -extern void mask_8259A(void); -extern void unmask_8259A(void); +struct legacy_pic { + int nr_legacy_irqs; + struct irq_chip *chip; + void (*mask_all)(void); + void (*restore_mask)(void); + void (*init)(int auto_eoi); + int (*irq_pending)(unsigned int irq); + void (*make_irq)(unsigned int irq); +}; + +extern struct legacy_pic *legacy_pic; +extern struct legacy_pic null_legacy_pic; +extern int using_null_legacy_pic; #endif /* _ASM_X86_I8259_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/inat.h linux-2.6.32.ovz/arch/x86/include/asm/inat.h --- linux-2.6.32/arch/x86/include/asm/inat.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/inat.h 2015-03-10 13:21:31.000000000 -0700 @@ -0,0 +1,220 @@ +#ifndef _ASM_X86_INAT_H +#define _ASM_X86_INAT_H +/* + * x86 instruction attributes + * + * Written by Masami Hiramatsu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ +#include + +/* + * Internal bits. Don't use bitmasks directly, because these bits are + * unstable. You should use checking functions. + */ + +#define INAT_OPCODE_TABLE_SIZE 256 +#define INAT_GROUP_TABLE_SIZE 8 + +/* Legacy last prefixes */ +#define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */ +#define INAT_PFX_REPE 2 /* 0xF3 */ /* LPFX2 */ +#define INAT_PFX_REPNE 3 /* 0xF2 */ /* LPFX3 */ +/* Other Legacy prefixes */ +#define INAT_PFX_LOCK 4 /* 0xF0 */ +#define INAT_PFX_CS 5 /* 0x2E */ +#define INAT_PFX_DS 6 /* 0x3E */ +#define INAT_PFX_ES 7 /* 0x26 */ +#define INAT_PFX_FS 8 /* 0x64 */ +#define INAT_PFX_GS 9 /* 0x65 */ +#define INAT_PFX_SS 10 /* 0x36 */ +#define INAT_PFX_ADDRSZ 11 /* 0x67 */ +/* x86-64 REX prefix */ +#define INAT_PFX_REX 12 /* 0x4X */ +/* AVX VEX prefixes */ +#define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */ +#define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */ + +#define INAT_LSTPFX_MAX 3 +#define INAT_LGCPFX_MAX 11 + +/* Immediate size */ +#define INAT_IMM_BYTE 1 +#define INAT_IMM_WORD 2 +#define INAT_IMM_DWORD 3 +#define INAT_IMM_QWORD 4 +#define INAT_IMM_PTR 5 +#define INAT_IMM_VWORD32 6 +#define INAT_IMM_VWORD 7 + +/* Legacy prefix */ +#define INAT_PFX_OFFS 0 +#define INAT_PFX_BITS 4 +#define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1) +#define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS) +/* Escape opcodes */ +#define INAT_ESC_OFFS (INAT_PFX_OFFS + INAT_PFX_BITS) +#define INAT_ESC_BITS 2 +#define INAT_ESC_MAX ((1 << INAT_ESC_BITS) - 1) +#define INAT_ESC_MASK (INAT_ESC_MAX << INAT_ESC_OFFS) +/* Group opcodes (1-16) */ +#define INAT_GRP_OFFS (INAT_ESC_OFFS + INAT_ESC_BITS) +#define INAT_GRP_BITS 5 +#define INAT_GRP_MAX ((1 << INAT_GRP_BITS) - 1) +#define INAT_GRP_MASK (INAT_GRP_MAX << INAT_GRP_OFFS) +/* Immediates */ +#define INAT_IMM_OFFS (INAT_GRP_OFFS + INAT_GRP_BITS) +#define INAT_IMM_BITS 3 +#define INAT_IMM_MASK (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS) +/* Flags */ +#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS) +#define INAT_MODRM (1 << (INAT_FLAG_OFFS)) +#define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 1)) +#define INAT_SCNDIMM (1 << (INAT_FLAG_OFFS + 2)) +#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3)) +#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4)) +#define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5)) +#define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6)) +/* Attribute making macros for attribute tables */ +#define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS) +#define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS) +#define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM) +#define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS) + +/* Attribute search APIs */ +extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode); +extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, + insn_byte_t last_pfx, + insn_attr_t esc_attr); +extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm, + insn_byte_t last_pfx, + insn_attr_t esc_attr); +extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, + insn_byte_t vex_m, + insn_byte_t vex_pp); + +/* Attribute checking functions */ +static inline int inat_is_legacy_prefix(insn_attr_t attr) +{ + attr &= INAT_PFX_MASK; + return attr && attr <= INAT_LGCPFX_MAX; +} + +static inline int inat_is_address_size_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ; +} + +static inline int inat_is_operand_size_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ; +} + +static inline int inat_is_rex_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_REX; +} + +static inline int inat_last_prefix_id(insn_attr_t attr) +{ + if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX) + return 0; + else + return attr & INAT_PFX_MASK; +} + +static inline int inat_is_vex_prefix(insn_attr_t attr) +{ + attr &= INAT_PFX_MASK; + return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3; +} + +static inline int inat_is_vex3_prefix(insn_attr_t attr) +{ + return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3; +} + +static inline int inat_is_escape(insn_attr_t attr) +{ + return attr & INAT_ESC_MASK; +} + +static inline int inat_escape_id(insn_attr_t attr) +{ + return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS; +} + +static inline int inat_is_group(insn_attr_t attr) +{ + return attr & INAT_GRP_MASK; +} + +static inline int inat_group_id(insn_attr_t attr) +{ + return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS; +} + +static inline int inat_group_common_attribute(insn_attr_t attr) +{ + return attr & ~INAT_GRP_MASK; +} + +static inline int inat_has_immediate(insn_attr_t attr) +{ + return attr & INAT_IMM_MASK; +} + +static inline int inat_immediate_size(insn_attr_t attr) +{ + return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS; +} + +static inline int inat_has_modrm(insn_attr_t attr) +{ + return attr & INAT_MODRM; +} + +static inline int inat_is_force64(insn_attr_t attr) +{ + return attr & INAT_FORCE64; +} + +static inline int inat_has_second_immediate(insn_attr_t attr) +{ + return attr & INAT_SCNDIMM; +} + +static inline int inat_has_moffset(insn_attr_t attr) +{ + return attr & INAT_MOFFSET; +} + +static inline int inat_has_variant(insn_attr_t attr) +{ + return attr & INAT_VARIANT; +} + +static inline int inat_accept_vex(insn_attr_t attr) +{ + return attr & INAT_VEXOK; +} + +static inline int inat_must_vex(insn_attr_t attr) +{ + return attr & INAT_VEXONLY; +} +#endif diff -uprN linux-2.6.32/arch/x86/include/asm/inat_types.h linux-2.6.32.ovz/arch/x86/include/asm/inat_types.h --- linux-2.6.32/arch/x86/include/asm/inat_types.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/inat_types.h 2015-03-10 13:21:31.000000000 -0700 @@ -0,0 +1,29 @@ +#ifndef _ASM_X86_INAT_TYPES_H +#define _ASM_X86_INAT_TYPES_H +/* + * x86 instruction attributes + * + * Written by Masami Hiramatsu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ + +/* Instruction attributes */ +typedef unsigned int insn_attr_t; +typedef unsigned char insn_byte_t; +typedef signed int insn_value_t; + +#endif diff -uprN linux-2.6.32/arch/x86/include/asm/insn.h linux-2.6.32.ovz/arch/x86/include/asm/insn.h --- linux-2.6.32/arch/x86/include/asm/insn.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/insn.h 2015-03-10 13:21:31.000000000 -0700 @@ -0,0 +1,186 @@ +#ifndef _ASM_X86_INSN_H +#define _ASM_X86_INSN_H +/* + * x86 instruction analysis + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2009 + */ + +/* insn_attr_t is defined in inat.h */ +#include + +struct insn_field { + union { + insn_value_t value; + insn_byte_t bytes[4]; + }; + /* !0 if we've run insn_get_xxx() for this field */ + unsigned char got; + unsigned char nbytes; +}; + +struct insn { + struct insn_field prefixes; /* + * Prefixes + * prefixes.bytes[3]: last prefix + */ + struct insn_field rex_prefix; /* REX prefix */ + struct insn_field vex_prefix; /* VEX prefix */ + struct insn_field opcode; /* + * opcode.bytes[0]: opcode1 + * opcode.bytes[1]: opcode2 + * opcode.bytes[2]: opcode3 + */ + struct insn_field modrm; + struct insn_field sib; + struct insn_field displacement; + union { + struct insn_field immediate; + struct insn_field moffset1; /* for 64bit MOV */ + struct insn_field immediate1; /* for 64bit imm or off16/32 */ + }; + union { + struct insn_field moffset2; /* for 64bit MOV */ + struct insn_field immediate2; /* for 64bit imm or seg16 */ + }; + + insn_attr_t attr; + unsigned char opnd_bytes; + unsigned char addr_bytes; + unsigned char length; + unsigned char x86_64; + + const insn_byte_t *kaddr; /* kernel address of insn to analyze */ + const insn_byte_t *next_byte; +}; + +#define MAX_INSN_SIZE 16 + +#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) +#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) +#define X86_MODRM_RM(modrm) ((modrm) & 0x07) + +#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6) +#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3) +#define X86_SIB_BASE(sib) ((sib) & 0x07) + +#define X86_REX_W(rex) ((rex) & 8) +#define X86_REX_R(rex) ((rex) & 4) +#define X86_REX_X(rex) ((rex) & 2) +#define X86_REX_B(rex) ((rex) & 1) + +/* VEX bit flags */ +#define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */ +#define X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */ +#define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */ +#define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */ +#define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */ +/* VEX bit fields */ +#define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */ +#define X86_VEX2_M 1 /* VEX2.M always 1 */ +#define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ +#define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ +#define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ + +/* The last prefix is needed for two-byte and three-byte opcodes */ +static inline insn_byte_t insn_last_prefix(struct insn *insn) +{ + return insn->prefixes.bytes[3]; +} + +extern void insn_init(struct insn *insn, const void *kaddr, int x86_64); +extern void insn_get_prefixes(struct insn *insn); +extern void insn_get_opcode(struct insn *insn); +extern void insn_get_modrm(struct insn *insn); +extern void insn_get_sib(struct insn *insn); +extern void insn_get_displacement(struct insn *insn); +extern void insn_get_immediate(struct insn *insn); +extern void insn_get_length(struct insn *insn); + +/* Attribute will be determined after getting ModRM (for opcode groups) */ +static inline void insn_get_attribute(struct insn *insn) +{ + insn_get_modrm(insn); +} + +/* Instruction uses RIP-relative addressing */ +extern int insn_rip_relative(struct insn *insn); + +/* Init insn for kernel text */ +static inline void kernel_insn_init(struct insn *insn, const void *kaddr) +{ +#ifdef CONFIG_X86_64 + insn_init(insn, kaddr, 1); +#else /* CONFIG_X86_32 */ + insn_init(insn, kaddr, 0); +#endif +} + +static inline int insn_is_avx(struct insn *insn) +{ + if (!insn->prefixes.got) + insn_get_prefixes(insn); + return (insn->vex_prefix.value != 0); +} + +static inline insn_byte_t insn_vex_m_bits(struct insn *insn) +{ + if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ + return X86_VEX2_M; + else + return X86_VEX3_M(insn->vex_prefix.bytes[1]); +} + +static inline insn_byte_t insn_vex_p_bits(struct insn *insn) +{ + if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ + return X86_VEX_P(insn->vex_prefix.bytes[1]); + else + return X86_VEX_P(insn->vex_prefix.bytes[2]); +} + +/* Offset of each field from kaddr */ +static inline int insn_offset_rex_prefix(struct insn *insn) +{ + return insn->prefixes.nbytes; +} +static inline int insn_offset_vex_prefix(struct insn *insn) +{ + return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes; +} +static inline int insn_offset_opcode(struct insn *insn) +{ + return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes; +} +static inline int insn_offset_modrm(struct insn *insn) +{ + return insn_offset_opcode(insn) + insn->opcode.nbytes; +} +static inline int insn_offset_sib(struct insn *insn) +{ + return insn_offset_modrm(insn) + insn->modrm.nbytes; +} +static inline int insn_offset_displacement(struct insn *insn) +{ + return insn_offset_sib(insn) + insn->sib.nbytes; +} +static inline int insn_offset_immediate(struct insn *insn) +{ + return insn_offset_displacement(insn) + insn->displacement.nbytes; +} + +#endif /* _ASM_X86_INSN_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/inst.h linux-2.6.32.ovz/arch/x86/include/asm/inst.h --- linux-2.6.32/arch/x86/include/asm/inst.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/inst.h 2015-03-10 13:23:55.000000000 -0700 @@ -0,0 +1,240 @@ +/* + * Generate .byte code for some instructions not supported by old + * binutils. + */ +#ifndef X86_ASM_INST_H +#define X86_ASM_INST_H + +#ifdef __ASSEMBLY__ + +#define REG_NUM_INVALID 100 + +#define REG_TYPE_R64 0 +#define REG_TYPE_XMM 1 +#define REG_TYPE_INVALID 100 + + .macro R64_NUM opd r64 + \opd = REG_NUM_INVALID + .ifc \r64,%rax + \opd = 0 + .endif + .ifc \r64,%rcx + \opd = 1 + .endif + .ifc \r64,%rdx + \opd = 2 + .endif + .ifc \r64,%rbx + \opd = 3 + .endif + .ifc \r64,%rsp + \opd = 4 + .endif + .ifc \r64,%rbp + \opd = 5 + .endif + .ifc \r64,%rsi + \opd = 6 + .endif + .ifc \r64,%rdi + \opd = 7 + .endif + .ifc \r64,%r8 + \opd = 8 + .endif + .ifc \r64,%r9 + \opd = 9 + .endif + .ifc \r64,%r10 + \opd = 10 + .endif + .ifc \r64,%r11 + \opd = 11 + .endif + .ifc \r64,%r12 + \opd = 12 + .endif + .ifc \r64,%r13 + \opd = 13 + .endif + .ifc \r64,%r14 + \opd = 14 + .endif + .ifc \r64,%r15 + \opd = 15 + .endif + .endm + + .macro XMM_NUM opd xmm + \opd = REG_NUM_INVALID + .ifc \xmm,%xmm0 + \opd = 0 + .endif + .ifc \xmm,%xmm1 + \opd = 1 + .endif + .ifc \xmm,%xmm2 + \opd = 2 + .endif + .ifc \xmm,%xmm3 + \opd = 3 + .endif + .ifc \xmm,%xmm4 + \opd = 4 + .endif + .ifc \xmm,%xmm5 + \opd = 5 + .endif + .ifc \xmm,%xmm6 + \opd = 6 + .endif + .ifc \xmm,%xmm7 + \opd = 7 + .endif + .ifc \xmm,%xmm8 + \opd = 8 + .endif + .ifc \xmm,%xmm9 + \opd = 9 + .endif + .ifc \xmm,%xmm10 + \opd = 10 + .endif + .ifc \xmm,%xmm11 + \opd = 11 + .endif + .ifc \xmm,%xmm12 + \opd = 12 + .endif + .ifc \xmm,%xmm13 + \opd = 13 + .endif + .ifc \xmm,%xmm14 + \opd = 14 + .endif + .ifc \xmm,%xmm15 + \opd = 15 + .endif + .endm + + .macro REG_TYPE type reg + R64_NUM reg_type_r64 \reg + XMM_NUM reg_type_xmm \reg + .if reg_type_r64 <> REG_NUM_INVALID + \type = REG_TYPE_R64 + .elseif reg_type_xmm <> REG_NUM_INVALID + \type = REG_TYPE_XMM + .else + \type = REG_TYPE_INVALID + .endif + .endm + + .macro PFX_OPD_SIZE + .byte 0x66 + .endm + + .macro PFX_REX opd1 opd2 W=0 + .if ((\opd1 | \opd2) & 8) || \W + .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1) | (\W << 3) + .endif + .endm + + .macro MODRM mod opd1 opd2 + .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3) + .endm + + .macro PSHUFB_XMM xmm1 xmm2 + XMM_NUM pshufb_opd1 \xmm1 + XMM_NUM pshufb_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX pshufb_opd1 pshufb_opd2 + .byte 0x0f, 0x38, 0x00 + MODRM 0xc0 pshufb_opd1 pshufb_opd2 + .endm + + .macro PCLMULQDQ imm8 xmm1 xmm2 + XMM_NUM clmul_opd1 \xmm1 + XMM_NUM clmul_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX clmul_opd1 clmul_opd2 + .byte 0x0f, 0x3a, 0x44 + MODRM 0xc0 clmul_opd1 clmul_opd2 + .byte \imm8 + .endm + + .macro AESKEYGENASSIST rcon xmm1 xmm2 + XMM_NUM aeskeygen_opd1 \xmm1 + XMM_NUM aeskeygen_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aeskeygen_opd1 aeskeygen_opd2 + .byte 0x0f, 0x3a, 0xdf + MODRM 0xc0 aeskeygen_opd1 aeskeygen_opd2 + .byte \rcon + .endm + + .macro AESIMC xmm1 xmm2 + XMM_NUM aesimc_opd1 \xmm1 + XMM_NUM aesimc_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesimc_opd1 aesimc_opd2 + .byte 0x0f, 0x38, 0xdb + MODRM 0xc0 aesimc_opd1 aesimc_opd2 + .endm + + .macro AESENC xmm1 xmm2 + XMM_NUM aesenc_opd1 \xmm1 + XMM_NUM aesenc_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesenc_opd1 aesenc_opd2 + .byte 0x0f, 0x38, 0xdc + MODRM 0xc0 aesenc_opd1 aesenc_opd2 + .endm + + .macro AESENCLAST xmm1 xmm2 + XMM_NUM aesenclast_opd1 \xmm1 + XMM_NUM aesenclast_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesenclast_opd1 aesenclast_opd2 + .byte 0x0f, 0x38, 0xdd + MODRM 0xc0 aesenclast_opd1 aesenclast_opd2 + .endm + + .macro AESDEC xmm1 xmm2 + XMM_NUM aesdec_opd1 \xmm1 + XMM_NUM aesdec_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesdec_opd1 aesdec_opd2 + .byte 0x0f, 0x38, 0xde + MODRM 0xc0 aesdec_opd1 aesdec_opd2 + .endm + + .macro AESDECLAST xmm1 xmm2 + XMM_NUM aesdeclast_opd1 \xmm1 + XMM_NUM aesdeclast_opd2 \xmm2 + PFX_OPD_SIZE + PFX_REX aesdeclast_opd1 aesdeclast_opd2 + .byte 0x0f, 0x38, 0xdf + MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2 + .endm + + .macro MOVQ_R64_XMM opd1 opd2 + REG_TYPE movq_r64_xmm_opd1_type \opd1 + .if movq_r64_xmm_opd1_type == REG_TYPE_XMM + XMM_NUM movq_r64_xmm_opd1 \opd1 + R64_NUM movq_r64_xmm_opd2 \opd2 + .else + R64_NUM movq_r64_xmm_opd1 \opd1 + XMM_NUM movq_r64_xmm_opd2 \opd2 + .endif + PFX_OPD_SIZE + PFX_REX movq_r64_xmm_opd1 movq_r64_xmm_opd2 1 + .if movq_r64_xmm_opd1_type == REG_TYPE_XMM + .byte 0x0f, 0x7e + .else + .byte 0x0f, 0x6e + .endif + MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2 + .endm +#endif + +#endif diff -uprN linux-2.6.32/arch/x86/include/asm/io_apic.h linux-2.6.32.ovz/arch/x86/include/asm/io_apic.h --- linux-2.6.32/arch/x86/include/asm/io_apic.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/io_apic.h 2015-03-10 13:24:03.000000000 -0700 @@ -106,6 +106,10 @@ struct IR_IO_APIC_route_entry { index : 15; } __attribute__ ((packed)); +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + #ifdef CONFIG_X86_IO_APIC /* @@ -160,6 +164,7 @@ extern int io_apic_get_redir_entries(int struct io_apic_irq_attr; extern int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr); +void setup_IO_APIC_irq_extra(u32 gsi); extern int (*ioapic_renumber_irq)(int ioapic, int irq); extern void ioapic_init_mappings(void); extern void ioapic_insert_resources(void); @@ -171,6 +176,7 @@ extern void mask_IO_APIC_setup(struct IO extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); extern void probe_nr_irqs_gsi(void); +extern int get_nr_irqs_gsi(void); extern int setup_ioapic_entry(int apic, int irq, struct IO_APIC_route_entry *entry, @@ -185,6 +191,7 @@ struct mp_ioapic_gsi{ int gsi_end; }; extern struct mp_ioapic_gsi mp_gsi_routing[]; +extern u32 gsi_top; int mp_find_ioapic(int gsi); int mp_find_ioapic_pin(int ioapic, int gsi); void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); diff -uprN linux-2.6.32/arch/x86/include/asm/io.h linux-2.6.32.ovz/arch/x86/include/asm/io.h --- linux-2.6.32/arch/x86/include/asm/io.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/io.h 2015-03-10 13:24:10.000000000 -0700 @@ -172,6 +172,7 @@ static inline void __iomem *ioremap(reso extern void iounmap(volatile void __iomem *addr); +extern void set_iounmap_nonlazy(void); #ifdef CONFIG_X86_32 # include "io_32.h" @@ -201,4 +202,11 @@ extern void early_iounmap(void __iomem * #define IO_SPACE_LIMIT 0xffff +#ifdef CONFIG_MTRR +extern int __must_check arch_phys_wc_add(unsigned long base, + unsigned long size); +extern void arch_phys_wc_del(int handle); +#define arch_phys_wc_add arch_phys_wc_add +#endif + #endif /* _ASM_X86_IO_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/irq.h linux-2.6.32.ovz/arch/x86/include/asm/irq.h --- linux-2.6.32/arch/x86/include/asm/irq.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/irq.h 2015-03-10 13:24:10.000000000 -0700 @@ -15,10 +15,6 @@ static inline int irq_canonicalize(int i return ((irq == 2) ? 9 : irq); } -#ifdef CONFIG_X86_LOCAL_APIC -# define ARCH_HAS_NMI_WATCHDOG -#endif - #ifdef CONFIG_4KSTACKS extern void irq_ctx_init(int cpu); extern void irq_ctx_exit(int cpu); @@ -33,10 +29,12 @@ static inline int irq_canonicalize(int i #ifdef CONFIG_HOTPLUG_CPU #include +extern int check_irq_vectors_for_cpu_disable(void); extern void fixup_irqs(void); +extern void irq_force_complete_move(int); #endif -extern void (*generic_interrupt_extension)(void); +extern void (*x86_platform_ipi_callback)(void); extern void native_init_IRQ(void); extern bool handle_irq(unsigned irq, struct pt_regs *regs); diff -uprN linux-2.6.32/arch/x86/include/asm/irq_vectors.h linux-2.6.32.ovz/arch/x86/include/asm/irq_vectors.h --- linux-2.6.32/arch/x86/include/asm/irq_vectors.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/irq_vectors.h 2015-06-23 04:16:16.000000000 -0700 @@ -1,6 +1,7 @@ #ifndef _ASM_X86_IRQ_VECTORS_H #define _ASM_X86_IRQ_VECTORS_H +#include /* * Linux IRQ vector layout. * @@ -16,8 +17,8 @@ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 32 ... 127 : device interrupts * Vector 128 : legacy int80 syscall interface - * Vectors 129 ... 237 : device interrupts - * Vectors 238 ... 255 : special interrupts + * Vectors 129 ... 229 : device interrupts + * Vectors 230 ... 255 : special interrupts * * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. * @@ -91,34 +92,42 @@ #define THRESHOLD_APIC_VECTOR 0xf9 #define REBOOT_VECTOR 0xf8 -/* f0-f7 used for spreading out TLB flushes: */ -#define INVALIDATE_TLB_VECTOR_END 0xf7 -#define INVALIDATE_TLB_VECTOR_START 0xf0 -#define NUM_INVALIDATE_TLB_VECTORS 8 - /* - * Local APIC timer IRQ vector is on a different priority level, - * to work around the 'lost local interrupt if more than 2 IRQ - * sources per level' errata. + * Generic system vector for platform specific use */ -#define LOCAL_TIMER_VECTOR 0xef +#define X86_PLATFORM_IPI_VECTOR 0xf7 /* - * Generic system vector for platform specific use + * IRQ work vector: */ -#define GENERIC_INTERRUPT_VECTOR 0xed +#define IRQ_WORK_VECTOR 0xf6 + +#define UV_BAU_MESSAGE 0xf5 /* - * Performance monitoring pending work vector: + * Self IPI vector for machine checks */ -#define LOCAL_PENDING_VECTOR 0xec +#define MCE_SELF_VECTOR 0xf4 -#define UV_BAU_MESSAGE 0xec + +/* Vector on which hypervisor callbacks will be delivered */ +#define HYPERVISOR_CALLBACK_VECTOR 0xf3 + +#define MONITOR_IPI_VECTOR 0xf2 +#define MONITOR_POSTED_INTERRUPT_VECTOR 0xf1 /* - * Self IPI vector for machine checks + * Local APIC timer IRQ vector is on a different priority level, + * to work around the 'lost local interrupt if more than 2 IRQ + * sources per level' errata. */ -#define MCE_SELF_VECTOR 0xeb +#define LOCAL_TIMER_VECTOR 0xef + +/* f0-f7 used for spreading out TLB flushes: */ +#define NUM_INVALIDATE_TLB_VECTORS 8 +#define INVALIDATE_TLB_VECTOR_END 0xee +#define INVALIDATE_TLB_VECTOR_START \ + (INVALIDATE_TLB_VECTOR_END - NUM_INVALIDATE_TLB_VECTORS + 1) /* * First APIC vector available to drivers: (vectors 0x30-0xee) we diff -uprN linux-2.6.32/arch/x86/include/asm/k8.h linux-2.6.32.ovz/arch/x86/include/asm/k8.h --- linux-2.6.32/arch/x86/include/asm/k8.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/k8.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,28 +0,0 @@ -#ifndef _ASM_X86_K8_H -#define _ASM_X86_K8_H - -#include - -extern struct pci_device_id k8_nb_ids[]; - -extern int early_is_k8_nb(u32 value); -extern struct pci_dev **k8_northbridges; -extern int num_k8_northbridges; -extern int cache_k8_northbridges(void); -extern void k8_flush_garts(void); -extern int k8_scan_nodes(unsigned long start, unsigned long end); - -#ifdef CONFIG_K8_NB -static inline struct pci_dev *node_to_k8_nb_misc(int node) -{ - return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL; -} -#else -static inline struct pci_dev *node_to_k8_nb_misc(int node) -{ - return NULL; -} -#endif - - -#endif /* _ASM_X86_K8_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/Kbuild linux-2.6.32.ovz/arch/x86/include/asm/Kbuild --- linux-2.6.32/arch/x86/include/asm/Kbuild 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/Kbuild 2015-03-10 13:23:49.000000000 -0700 @@ -3,6 +3,7 @@ include include/asm-generic/Kbuild.asm header-y += boot.h header-y += bootparam.h header-y += debugreg.h +header-y += hyperv.h header-y += ldt.h header-y += msr-index.h header-y += prctl.h diff -uprN linux-2.6.32/arch/x86/include/asm/kdebug.h linux-2.6.32.ovz/arch/x86/include/asm/kdebug.h --- linux-2.6.32/arch/x86/include/asm/kdebug.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/kdebug.h 2015-03-10 13:23:38.000000000 -0700 @@ -21,6 +21,8 @@ enum die_val { DIE_NMI_IPI, DIE_PAGE_FAULT, DIE_NMIUNKNOWN, + DIE_NMIMEMPARITY, + DIE_NMIIOCHECK, }; extern void printk_address(unsigned long address, int reliable); @@ -28,10 +30,11 @@ extern void die(const char *, struct pt_ extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_registers(struct pt_regs *regs); extern void show_trace(struct task_struct *t, struct pt_regs *regs, - unsigned long *sp, unsigned long bp); + unsigned long *sp); extern void __show_regs(struct pt_regs *regs, int all); extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); +extern int in_crash_kexec; #endif /* _ASM_X86_KDEBUG_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/kexec.h linux-2.6.32.ovz/arch/x86/include/asm/kexec.h --- linux-2.6.32/arch/x86/include/asm/kexec.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/kexec.h 2015-03-10 13:23:52.000000000 -0700 @@ -23,6 +23,7 @@ #include #include +#include /* * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. @@ -163,6 +164,9 @@ struct kimage_arch { }; #endif +typedef void crash_vmclear_fn(void); +extern crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss; + #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_KEXEC_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/kprobes.h linux-2.6.32.ovz/arch/x86/include/asm/kprobes.h --- linux-2.6.32/arch/x86/include/asm/kprobes.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/kprobes.h 2015-03-10 13:22:02.000000000 -0700 @@ -24,6 +24,7 @@ #include #include #include +#include #define __ARCH_WANT_KPROBES_INSN_SLOT @@ -32,8 +33,10 @@ struct kprobe; typedef u8 kprobe_opcode_t; #define BREAKPOINT_INSTRUCTION 0xcc -#define RELATIVEJUMP_INSTRUCTION 0xe9 -#define MAX_INSN_SIZE 16 +#define RELATIVEJUMP_OPCODE 0xe9 +#define RELATIVEJUMP_SIZE 5 +#define RELATIVECALL_OPCODE 0xe8 +#define RELATIVE_ADDR_SIZE 4 #define MAX_STACK_SIZE 64 #define MIN_STACK_SIZE(ADDR) \ (((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \ @@ -44,6 +47,17 @@ typedef u8 kprobe_opcode_t; #define flush_insn_slot(p) do { } while (0) +/* optinsn template addresses */ +extern kprobe_opcode_t optprobe_template_entry; +extern kprobe_opcode_t optprobe_template_val; +extern kprobe_opcode_t optprobe_template_call; +extern kprobe_opcode_t optprobe_template_end; +#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE) +#define MAX_OPTINSN_SIZE \ + (((unsigned long)&optprobe_template_end - \ + (unsigned long)&optprobe_template_entry) + \ + MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE) + extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); @@ -64,6 +78,21 @@ struct arch_specific_insn { int boostable; }; +struct arch_optimized_insn { + /* copy of the original instructions */ + kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE]; + /* detour code buffer */ + kprobe_opcode_t *insn; + /* the size of instructions copied to detour code buffer */ + size_t size; +}; + +/* Return true (!0) if optinsn is prepared for optimization. */ +static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn) +{ + return optinsn->size; +} + struct prev_kprobe { struct kprobe *kp; unsigned long status; diff -uprN linux-2.6.32/arch/x86/include/asm/kvm_emulate.h linux-2.6.32.ovz/arch/x86/include/asm/kvm_emulate.h --- linux-2.6.32/arch/x86/include/asm/kvm_emulate.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/kvm_emulate.h 2015-03-10 13:22:52.000000000 -0700 @@ -54,13 +54,23 @@ struct x86_emulate_ctxt; struct x86_emulate_ops { /* * read_std: Read bytes of standard (non-emulated/special) memory. - * Used for instruction fetch, stack operations, and others. + * Used for descriptor reading. * @addr: [IN ] Linear address from which to read. * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu); + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); + + /* + * fetch: Read bytes of standard (non-emulated/special) memory. + * Used for instruction fetch. + * @addr: [IN ] Linear address from which to read. + * @val: [OUT] Value read from memory, zero-extended to 'u_long'. + * @bytes: [IN ] Number of bytes to read from memory. + */ + int (*fetch)(unsigned long addr, void *val, + unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); /* * read_emulated: Read bytes from emulated/special memory area. @@ -98,7 +108,8 @@ struct x86_emulate_ops { const void *new, unsigned int bytes, struct kvm_vcpu *vcpu); - + bool (*get_cpuid)(struct kvm_vcpu *vcpu, + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); }; /* Type, address-of, and value of an instruction's operand. */ @@ -129,7 +140,7 @@ struct decode_cache { u8 seg_override; unsigned int d; unsigned long regs[NR_VCPU_REGS]; - unsigned long eip; + unsigned long eip, eip_orig; /* modrm */ u8 modrm; u8 modrm_mod; @@ -168,6 +179,7 @@ struct x86_emulate_ctxt { /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ +#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */ #define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ #define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ @@ -179,8 +191,22 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 #endif +/* CPUID vendors */ +#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx 0x68747541 +#define X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx 0x444d4163 +#define X86EMUL_CPUID_VENDOR_AuthenticAMD_edx 0x69746e65 + +#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx 0x69444d41 +#define X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx 0x21726574 +#define X86EMUL_CPUID_VENDOR_AMDisbetterI_edx 0x74656273 + +#define X86EMUL_CPUID_VENDOR_GenuineIntel_ebx 0x756e6547 +#define X86EMUL_CPUID_VENDOR_GenuineIntel_ecx 0x6c65746e +#define X86EMUL_CPUID_VENDOR_GenuineIntel_edx 0x49656e69 + int x86_decode_insn(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops); + struct x86_emulate_ops *ops, + void *insn, int insn_len); int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops); diff -uprN linux-2.6.32/arch/x86/include/asm/kvm.h linux-2.6.32.ovz/arch/x86/include/asm/kvm.h --- linux-2.6.32/arch/x86/include/asm/kvm.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/kvm.h 2015-03-10 13:23:56.000000000 -0700 @@ -9,6 +9,22 @@ #include #include +#define DE_VECTOR 0 +#define DB_VECTOR 1 +#define BP_VECTOR 3 +#define OF_VECTOR 4 +#define BR_VECTOR 5 +#define UD_VECTOR 6 +#define NM_VECTOR 7 +#define DF_VECTOR 8 +#define TS_VECTOR 10 +#define NP_VECTOR 11 +#define SS_VECTOR 12 +#define GP_VECTOR 13 +#define PF_VECTOR 14 +#define MF_VECTOR 16 +#define MC_VECTOR 18 + /* Select x86 specific features in */ #define __KVM_HAVE_PIT #define __KVM_HAVE_IOAPIC @@ -19,6 +35,9 @@ #define __KVM_HAVE_MSIX #define __KVM_HAVE_MCE #define __KVM_HAVE_PIT_STATE2 +#define __KVM_HAVE_VCPU_EVENTS +#define __KVM_HAVE_XSAVE +#define __KVM_HAVE_XCRS /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 @@ -79,6 +98,7 @@ struct kvm_ioapic_state { #define KVM_IRQCHIP_PIC_MASTER 0 #define KVM_IRQCHIP_PIC_SLAVE 1 #define KVM_IRQCHIP_IOAPIC 2 +#define KVM_NR_IRQCHIPS 3 /* for KVM_GET_REGS and KVM_SET_REGS */ struct kvm_regs { @@ -250,4 +270,55 @@ struct kvm_reinject_control { __u8 pit_reinject; __u8 reserved[31]; }; + +/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */ +#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001 +#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002 + +/* for KVM_GET/SET_VCPU_EVENTS */ +struct kvm_vcpu_events { + struct { + __u8 injected; + __u8 nr; + __u8 has_error_code; + __u8 pad; + __u32 error_code; + } exception; + struct { + __u8 injected; + __u8 nr; + __u8 soft; + __u8 pad; + } interrupt; + struct { + __u8 injected; + __u8 pending; + __u8 masked; + __u8 pad; + } nmi; + __u32 sipi_vector; + __u32 flags; + __u32 reserved[10]; +}; + +/* for KVM_CAP_XSAVE */ +struct kvm_xsave { + __u32 region[1024]; +}; + +#define KVM_MAX_XCRS 16 + +struct kvm_xcr { + __u32 xcr; + __u32 reserved; + __u64 value; +}; + +struct kvm_xcrs { + __u32 nr_xcrs; + __u32 flags; + struct kvm_xcr xcrs[KVM_MAX_XCRS]; + __u64 padding[16]; +}; + #endif /* _ASM_X86_KVM_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/kvm_host.h linux-2.6.32.ovz/arch/x86/include/asm/kvm_host.h --- linux-2.6.32/arch/x86/include/asm/kvm_host.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/kvm_host.h 2015-03-10 13:24:14.000000000 -0700 @@ -15,17 +15,31 @@ #include #include #include +#include #include #include #include +#include #include #include #include #include -#define KVM_MAX_VCPUS 16 +/* KVM_MAX_VCPU_COUNT is RHEL6-only. Upstream uses KVM_MAX_VCPUS. + * The macro was renamed because upstream code has the implicit assumption that + * vcpu_id < KVM_MAX_VCPUS for all VCPUs, while on RHEL-6 we allow + * vcpu_id >= KVM_MAX_VCPUS on x86. This way we avoid inadvertently introducing + * security bugs while backporting upstream code. + */ +#define KVM_MAX_VCPU_COUNT 160 + +/* Limit for vcpu_id for all VCPUs. RHEL-specific, too (see above). + * KVM_CREATE_VCPU code ensures that all VCPUs have vcpu_id < KVM_MAX_VCPU_ID. + */ +#define KVM_MAX_VCPU_ID 255 + #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 @@ -35,6 +49,7 @@ #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) +#define CR3_PCID_ENABLED_RESERVED_BITS 0xFFFFFF0000000000ULL #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFFFFFF0000000000ULL) @@ -43,11 +58,12 @@ #define KVM_GUEST_CR0_MASK \ (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \ - (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP) + (X86_CR0_WP | X86_CR0_NE | X86_CR0_MP) #define KVM_VM_CR0_ALWAYS_ON \ (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_GUEST_CR4_MASK \ - (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE) + (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE \ + | X86_CR4_OSXSAVE | X86_CR4_PCIDE) #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) #define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE) @@ -61,22 +77,6 @@ #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) -#define DE_VECTOR 0 -#define DB_VECTOR 1 -#define BP_VECTOR 3 -#define OF_VECTOR 4 -#define BR_VECTOR 5 -#define UD_VECTOR 6 -#define NM_VECTOR 7 -#define DF_VECTOR 8 -#define TS_VECTOR 10 -#define NP_VECTOR 11 -#define SS_VECTOR 12 -#define GP_VECTOR 13 -#define PF_VECTOR 14 -#define MF_VECTOR 16 -#define MC_VECTOR 18 - #define SELECTOR_TI_MASK (1 << 2) #define SELECTOR_RPL_MASK 0x03 @@ -90,7 +90,7 @@ #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 -#define KVM_MAX_CPUID_ENTRIES 40 +#define KVM_MAX_CPUID_ENTRIES 80 #define KVM_NR_FIXED_MTRR_REGION 88 #define KVM_NR_VAR_MTRR 8 @@ -155,6 +155,16 @@ enum { #define DR7_FIXED_1 0x00000400 #define DR7_VOLATILE 0xffff23ff +/* apic attention bits */ +#define KVM_APIC_CHECK_VAPIC 0 +/* + * The following bit is set with PV-EOI, unset on EOI. + * We detect PV-EOI changes by guest by comparing + * this bit with PV-EOI in guest memory. + * See the implementation in apic_update_pv_eoi. + */ +#define KVM_APIC_PV_EOI_PENDING 1 + /* * We don't want allocation failures within the mmu code, so we preallocate * enough memory for a single page fault in a cache. @@ -193,6 +203,8 @@ union kvm_mmu_page_role { unsigned invalid:1; unsigned cr4_pge:1; unsigned nxe:1; + unsigned cr0_wp:1; + unsigned smep_andnot_wp:1; }; }; @@ -256,7 +268,8 @@ struct kvm_mmu { void (*new_cr3)(struct kvm_vcpu *vcpu); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); void (*free)(struct kvm_vcpu *vcpu); - gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); + gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, + u32 *error); void (*prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page); int (*sync_page)(struct kvm_vcpu *vcpu, @@ -271,8 +284,38 @@ struct kvm_mmu { u64 rsvd_bits_mask[2][4]; }; +enum pmc_type { + KVM_PMC_GP = 0, + KVM_PMC_FIXED, +}; + +struct kvm_pmc { + enum pmc_type type; + u8 idx; + u64 counter; + u64 eventsel; + struct perf_event *perf_event; + struct kvm_vcpu *vcpu; +}; + +struct kvm_pmu { + unsigned nr_arch_gp_counters; + unsigned nr_arch_fixed_counters; + unsigned available_event_types; + u64 fixed_ctr_ctrl; + u64 global_ctrl; + u64 global_status; + u64 global_ovf_ctrl; + u64 counter_bitmask[2]; + u64 global_ctrl_mask; + u8 version; + struct kvm_pmc gp_counters[INTEL_PMC_MAX_GENERIC]; + struct kvm_pmc fixed_counters[INTEL_PMC_MAX_FIXED]; + struct irq_work irq_work; + u64 reprogram_pmi; +}; + struct kvm_vcpu_arch { - u64 host_tsc; /* * rip and regs accesses must go through * kvm_{register,rip}_{read,write} functions. @@ -282,15 +325,18 @@ struct kvm_vcpu_arch { u32 regs_dirty; unsigned long cr0; + unsigned long cr0_guest_owned_bits; unsigned long cr2; unsigned long cr3; unsigned long cr4; + unsigned long cr4_guest_owned_bits; unsigned long cr8; u32 hflags; u64 pdptrs[4]; /* pae */ u64 shadow_efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ + unsigned long apic_attention; int32_t apic_arb_prio; int mp_state; int sipi_vector; @@ -318,8 +364,8 @@ struct kvm_vcpu_arch { unsigned long mmu_seq; } update_pte; - struct i387_fxsave_struct host_fx_image; - struct i387_fxsave_struct guest_fx_image; + struct fpu guest_fpu; + u64 xcr0; gva_t mmio_fault_cr2; struct kvm_pio_request pio; @@ -350,9 +396,30 @@ struct kvm_vcpu_arch { gpa_t time; struct pvclock_vcpu_time_info hv_clock; - unsigned int hv_clock_tsc_khz; - unsigned int time_offset; - struct page *time_page; + unsigned int hw_tsc_khz; + struct gfn_to_hva_cache pv_time; + bool pv_time_enabled; + /* set guest stopped flag in pvclock flags field */ + bool pvclock_set_guest_stopped_request; + + struct { + u64 msr_val; + u64 last_steal; + u64 accum_steal; + gpa_t stime; + struct kvm_steal_time steal; + } st; + + u64 last_host_tsc; + u64 last_guest_tsc; + u64 last_kernel_ns; + u64 last_tsc_nsec; + u64 last_tsc_write; + u64 tsc_offset_adjustment; + u32 virtual_tsc_khz; + bool tsc_catchup; + u32 tsc_catchup_mult; + s8 tsc_catchup_shift; bool singlestep; /* guest is single stepped by KVM */ bool nmi_pending; @@ -371,21 +438,36 @@ struct kvm_vcpu_arch { u64 mcg_status; u64 mcg_ctl; u64 *mce_banks; + + struct kvm_pmu pmu; + + struct { + u64 msr_val; + struct gfn_to_hva_cache data; + } pv_eoi; }; struct kvm_mem_alias { gfn_t base_gfn; unsigned long npages; gfn_t target_gfn; +#define KVM_ALIAS_INVALID 1UL + unsigned long flags; }; -struct kvm_arch{ - int naliases; +#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION + +struct kvm_mem_aliases { struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; + int naliases; +}; + +struct kvm_arch { + struct kvm_mem_aliases *aliases; - unsigned int n_free_mmu_pages; + unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; - unsigned int n_alloc_mmu_pages; + unsigned int n_max_mmu_pages; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* * Hash table of struct kvm_mmu_page. @@ -397,7 +479,6 @@ struct kvm_arch{ struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; - struct hlist_head irq_ack_notifier_list; int vapics_in_nmi_mode; unsigned int tss_addr; @@ -410,8 +491,17 @@ struct kvm_arch{ gpa_t ept_identity_map_addr; unsigned long irq_sources_bitmap; - unsigned long irq_states[KVM_IOAPIC_NUM_PINS]; - u64 vm_init_tsc; + s64 kvmclock_offset; + spinlock_t tsc_write_lock; + u64 last_tsc_nsec; + u64 last_tsc_offset; + u64 last_tsc_write; + struct delayed_work kvmclock_update_work; + struct delayed_work kvmclock_sync_work; + + /* fields used by HYPER-V emulation */ + u64 hv_guest_os_id; + u64 hv_hypercall; }; struct kvm_vm_stat { @@ -458,15 +548,22 @@ struct descriptor_table { unsigned long base; } __attribute__((packed)); +struct msr_data { + bool host_initiated; + u32 index; + u64 data; +}; + struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ - void (*hardware_enable)(void *dummy); /* __init */ + int (*hardware_enable)(void *dummy); void (*hardware_disable)(void *dummy); void (*check_processor_compatibility)(void *rtn); int (*hardware_setup)(void); /* __init */ void (*hardware_unsetup)(void); /* __exit */ bool (*cpu_has_accelerated_tpr)(void); + void (*cpuid_update)(struct kvm_vcpu *vcpu); /* Create, but do not attach this VCPU */ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); @@ -480,7 +577,7 @@ struct kvm_x86_ops { int (*set_guest_debug)(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg); int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); - int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); + int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); void (*get_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -488,6 +585,7 @@ struct kvm_x86_ops { void (*set_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); + void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -503,11 +601,12 @@ struct kvm_x86_ops { void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); + void (*fpu_deactivate)(struct kvm_vcpu *vcpu); void (*tlb_flush)(struct kvm_vcpu *vcpu); - void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); - int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); + void (*run)(struct kvm_vcpu *vcpu); + int (*handle_exit)(struct kvm_vcpu *vcpu); void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); @@ -519,19 +618,38 @@ struct kvm_x86_ops { bool has_error_code, u32 error_code); int (*interrupt_allowed)(struct kvm_vcpu *vcpu); int (*nmi_allowed)(struct kvm_vcpu *vcpu); + bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); + void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); - bool (*gb_page_enable)(void); + int (*get_lpage_level)(void); + bool (*invpcid_supported)(void); - const struct trace_print_flags *exit_reasons_str; + void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz); + void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); + u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); + + void (*sched_in)(struct kvm_vcpu *kvm, int cpu); }; extern struct kvm_x86_ops *kvm_x86_ops; +static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, + s64 adjustment) +{ + kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false); +} + +static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) +{ + kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true); +} + int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); @@ -544,7 +662,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask u64 dirty_mask, u64 nx_mask, u64 x_mask); int kvm_mmu_reset_context(struct kvm_vcpu *vcpu); -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot); +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot, bool destroy_large_pages); void kvm_mmu_zap_all(struct kvm *kvm); unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); @@ -559,6 +677,15 @@ u8 kvm_get_guest_memory_type(struct kvm_ extern bool tdp_enabled; +u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); + +/* control of guest tsc rate supported? */ +extern bool kvm_has_tsc_control; +/* minimum supported tsc_khz for guests */ +extern u32 kvm_min_guest_tsc_khz; +/* maximum supported tsc_khz for guests */ +extern u32 kvm_max_guest_tsc_khz; + enum emulation_result { EMULATE_DONE, /* no further processing */ EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ @@ -568,8 +695,16 @@ enum emulation_result { #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) -int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, - unsigned long cr2, u16 error_code, int emulation_type); +int x86_emulate_instruction(struct kvm_vcpu *vcpu, + unsigned long cr2, int emulation_type, + void *insn, int insn_len); + +static inline int emulate_instruction(struct kvm_vcpu *vcpu, + int emulation_type) +{ + return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); +} + void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); @@ -577,17 +712,17 @@ void realmode_lmsw(struct kvm_vcpu *vcpu unsigned long *rflags); unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr); -void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value, +int realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value, unsigned long *rflags); void kvm_enable_efer_bits(u64); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); +int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); struct x86_emulate_ctxt; -int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, +int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port); -int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, +int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, int size, unsigned long count, int down, gva_t address, int rep, unsigned port); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); @@ -600,21 +735,24 @@ int emulator_set_dr(struct x86_emulate_c unsigned long value); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg); +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, + int reason, bool has_error_code, u32 error_code); -void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); -void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); -void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); +int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); +int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); +int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l); +int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr); int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); +int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); + +bool kvm_rdpmc(struct kvm_vcpu *vcpu); void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); @@ -622,7 +760,20 @@ void kvm_inject_page_fault(struct kvm_vc u32 error_code); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); -int kvm_pic_set_irq(void *opaque, int irq, int level); +static inline int __kvm_irq_line_state(unsigned long *irq_state, + int irq_source_id, int level) +{ + /* Logical OR for level trig interrupt */ + if (level) + __set_bit(irq_source_id, irq_state); + else + __clear_bit(irq_source_id, irq_state); + + return !!(*irq_state); +} + +int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); +void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); @@ -644,12 +795,17 @@ void __kvm_mmu_free_some_pages(struct kv int kvm_mmu_load(struct kvm_vcpu *vcpu); void kvm_mmu_unload(struct kvm_vcpu *vcpu); void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_fix_hypercall(struct kvm_vcpu *vcpu); -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code); +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, + void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_enable_tdp(void); @@ -657,6 +813,7 @@ void kvm_disable_tdp(void); int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int complete_pio(struct kvm_vcpu *vcpu); +bool kvm_check_iopl(struct kvm_vcpu *vcpu); struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); @@ -667,20 +824,6 @@ static inline struct kvm_mmu_page *page_ return (struct kvm_mmu_page *)page_private(page); } -static inline u16 kvm_read_fs(void) -{ - u16 seg; - asm("mov %%fs, %0" : "=g"(seg)); - return seg; -} - -static inline u16 kvm_read_gs(void) -{ - u16 seg; - asm("mov %%gs, %0" : "=g"(seg)); - return seg; -} - static inline u16 kvm_read_ldt(void) { u16 ldt; @@ -688,16 +831,6 @@ static inline u16 kvm_read_ldt(void) return ldt; } -static inline void kvm_load_fs(u16 sel) -{ - asm("mov %0, %%fs" : : "rm"(sel)); -} - -static inline void kvm_load_gs(u16 sel) -{ - asm("mov %0, %%gs" : : "rm"(sel)); -} - static inline void kvm_load_ldt(u16 sel) { asm("lldt %0" : : "rm"(sel)); @@ -730,21 +863,6 @@ static inline unsigned long read_msr(uns } #endif -static inline void kvm_fx_save(struct i387_fxsave_struct *image) -{ - asm("fxsave (%0)":: "r" (image)); -} - -static inline void kvm_fx_restore(struct i387_fxsave_struct *image) -{ - asm("fxrstor (%0)":: "r" (image)); -} - -static inline void kvm_fx_finit(void) -{ - asm("finit"); -} - static inline u32 get_rdx_init_val(void) { return 0x600; /* P6 family */ @@ -780,14 +898,18 @@ enum { * reboot turns off virtualization while processes are running. * Trap the fault and ignore the instruction if that happens. */ -asmlinkage void kvm_handle_fault_on_reboot(void); +asmlinkage void kvm_spurious_fault(void); +extern bool kvm_rebooting; #define __kvm_handle_fault_on_reboot(insn) \ "666: " insn "\n\t" \ + "668: \n\t" \ ".pushsection .fixup, \"ax\" \n" \ "667: \n\t" \ + "cmpb $0, kvm_rebooting \n\t" \ + "jne 668b \n\t" \ __ASM_SIZE(push) " $666b \n\t" \ - "jmp kvm_handle_fault_on_reboot \n\t" \ + "call kvm_spurious_fault \n\t" \ ".popsection \n\t" \ ".pushsection __ex_table, \"a\" \n\t" \ _ASM_PTR " 666b, 667b \n\t" \ @@ -802,4 +924,20 @@ int kvm_cpu_has_interrupt(struct kvm_vcp int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); +void kvm_define_shared_msr(unsigned index, u32 msr); +void kvm_set_shared_msr(unsigned index, u64 val, u64 mask); + +int kvm_is_in_guest(void); + +void kvm_pmu_init(struct kvm_vcpu *vcpu); +void kvm_pmu_destroy(struct kvm_vcpu *vcpu); +void kvm_pmu_reset(struct kvm_vcpu *vcpu); +void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu); +bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr); +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); +void kvm_handle_pmu_event(struct kvm_vcpu *vcpu); +void kvm_deliver_pmi(struct kvm_vcpu *vcpu); + #endif /* _ASM_X86_KVM_HOST_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/kvm_para.h linux-2.6.32.ovz/arch/x86/include/asm/kvm_para.h --- linux-2.6.32/arch/x86/include/asm/kvm_para.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/kvm_para.h 2015-03-10 13:24:03.000000000 -0700 @@ -2,7 +2,7 @@ #define _ASM_X86_KVM_PARA_H #include - +#include /* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It * should be used to determine that a VM is running under KVM. */ @@ -15,10 +15,39 @@ #define KVM_FEATURE_CLOCKSOURCE 0 #define KVM_FEATURE_NOP_IO_DELAY 1 #define KVM_FEATURE_MMU_OP 2 +/* This indicates that the new set of kvmclock msrs + * are available. The use of 0x11 and 0x12 is deprecated + */ +#define KVM_FEATURE_CLOCKSOURCE2 3 +#define KVM_FEATURE_STEAL_TIME 5 +#define KVM_FEATURE_PV_EOI 6 + +/* The last 8 bits are used to indicate how to interpret the flags field + * in pvclock structure. If no bits are set, all flags are ignored. + */ +#define KVM_FEATURE_CLOCKSOURCE_STABLE_BIT 24 #define MSR_KVM_WALL_CLOCK 0x11 #define MSR_KVM_SYSTEM_TIME 0x12 +#define KVM_MSR_ENABLED 1 +/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ +#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 +#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 +#define MSR_KVM_STEAL_TIME 0x4b564d03 +#define MSR_KVM_PV_EOI_EN 0x4b564d04 + +struct kvm_steal_time { + __u64 steal; + __u32 version; + __u32 flags; + __u32 pad[12]; +}; + +#define KVM_STEAL_ALIGNMENT_BITS 5 +#define KVM_STEAL_VALID_BITS ((-1ULL << (KVM_STEAL_ALIGNMENT_BITS + 1))) +#define KVM_STEAL_RESERVED_MASK (((1 << KVM_STEAL_ALIGNMENT_BITS) - 1 ) << 1) + #define KVM_MAX_MMU_OP_BATCH 32 /* Operations for KVM_HC_MMU_OP */ @@ -47,11 +76,26 @@ struct kvm_mmu_op_release_pt { __u64 pt_phys; }; +#define KVM_PV_EOI_BIT 0 +#define KVM_PV_EOI_MASK (0x1 << KVM_PV_EOI_BIT) +#define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK +#define KVM_PV_EOI_DISABLED 0x0 + #ifdef __KERNEL__ #include extern void kvmclock_init(void); +extern void kvm_disable_steal_time(void); +extern int kvm_register_clock(char *txt); +#ifdef CONFIG_KVM_CLOCK +bool kvm_check_and_clear_guest_paused(void); +#else +static inline bool kvm_check_and_clear_guest_paused(void) +{ + return false; +} +#endif /* CONFIG_KVMCLOCK */ /* This instruction is vmcall. On non-VT architectures, it will generate a * trap that we will then rewrite to the appropriate instruction. @@ -122,21 +166,17 @@ static inline long kvm_hypercall4(unsign return ret; } -static inline int kvm_para_available(void) +static inline uint32_t kvm_cpuid_base(void) { - unsigned int eax, ebx, ecx, edx; - char signature[13]; + if (boot_cpu_data.cpuid_level < 0) + return 0; /* So we don't blow up on old processors */ - cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx); - memcpy(signature + 0, &ebx, 4); - memcpy(signature + 4, &ecx, 4); - memcpy(signature + 8, &edx, 4); - signature[12] = 0; - - if (strcmp(signature, "KVMKVMKVM") == 0) - return 1; + return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); +} - return 0; +static inline bool kvm_para_available(void) +{ + return kvm_cpuid_base() != 0; } static inline unsigned int kvm_arch_para_features(void) diff -uprN linux-2.6.32/arch/x86/include/asm/linkage.h linux-2.6.32.ovz/arch/x86/include/asm/linkage.h --- linux-2.6.32/arch/x86/include/asm/linkage.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/linkage.h 2015-06-23 04:16:16.000000000 -0700 @@ -8,11 +8,6 @@ #ifdef CONFIG_X86_32 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) -/* - * For 32-bit UML - mark functions implemented in assembly that use - * regparm input parameters: - */ -#define asmregparm __attribute__((regparm(3))) /* * Make sure the compiler doesn't do anything stupid with the diff -uprN linux-2.6.32/arch/x86/include/asm/local64.h linux-2.6.32.ovz/arch/x86/include/asm/local64.h --- linux-2.6.32/arch/x86/include/asm/local64.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/local64.h 2015-03-10 13:22:06.000000000 -0700 @@ -0,0 +1 @@ +#include diff -uprN linux-2.6.32/arch/x86/include/asm/mce.h linux-2.6.32.ovz/arch/x86/include/asm/mce.h --- linux-2.6.32/arch/x86/include/asm/mce.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/mce.h 2015-03-10 13:24:14.000000000 -0700 @@ -8,6 +8,7 @@ * Machine Check support for x86 */ +/* MCG_CAP register defines */ #define MCG_BANKCNT_MASK 0xff /* Number of Banks */ #define MCG_CTL_P (1ULL<<8) /* MCG_CTL register available */ #define MCG_EXT_P (1ULL<<9) /* Extended registers available */ @@ -16,11 +17,14 @@ #define MCG_EXT_CNT_SHIFT 16 #define MCG_EXT_CNT(c) (((c) & MCG_EXT_CNT_MASK) >> MCG_EXT_CNT_SHIFT) #define MCG_SER_P (1ULL<<24) /* MCA recovery/new status bits */ +#define MCG_ELOG_P (1ULL<<26) /* Extended error log supported */ +/* MCG_STATUS register defines */ #define MCG_STATUS_RIPV (1ULL<<0) /* restart ip valid */ #define MCG_STATUS_EIPV (1ULL<<1) /* ip points to correct instruction */ #define MCG_STATUS_MCIP (1ULL<<2) /* machine check in progress */ +/* MCi_STATUS register defines */ #define MCI_STATUS_VAL (1ULL<<63) /* valid error */ #define MCI_STATUS_OVER (1ULL<<62) /* previous errors lost */ #define MCI_STATUS_UC (1ULL<<61) /* uncorrected error */ @@ -31,12 +35,35 @@ #define MCI_STATUS_S (1ULL<<56) /* Signaled machine check */ #define MCI_STATUS_AR (1ULL<<55) /* Action required */ -/* MISC register defines */ -#define MCM_ADDR_SEGOFF 0 /* segment offset */ -#define MCM_ADDR_LINEAR 1 /* linear address */ -#define MCM_ADDR_PHYS 2 /* physical address */ -#define MCM_ADDR_MEM 3 /* memory address */ -#define MCM_ADDR_GENERIC 7 /* generic */ +/* + * Note that the full MCACOD field of IA32_MCi_STATUS MSR is + * bits 15:0. But bit 12 is the 'F' bit, defined for corrected + * errors to indicate that errors are being filtered by hardware. + * We should mask out bit 12 when looking for specific signatures + * of uncorrected errors - so the F bit is deliberately skipped + * in this #define. + */ +#define MCACOD 0xefff /* MCA Error Code */ + +/* Architecturally defined codes from SDM Vol. 3B Chapter 15 */ +#define MCACOD_SCRUB 0x00C0 /* 0xC0-0xCF Memory Scrubbing */ +#define MCACOD_SCRUBMSK 0xeff0 /* Skip bit 12 ('F' bit) */ +#define MCACOD_L3WB 0x017A /* L3 Explicit Writeback */ +#define MCACOD_DATA 0x0134 /* Data Load */ +#define MCACOD_INSTR 0x0150 /* Instruction Fetch */ + +/* MCi_MISC register defines */ +#define MCI_MISC_ADDR_LSB(m) ((m) & 0x3f) +#define MCI_MISC_ADDR_MODE(m) (((m) >> 6) & 7) +#define MCI_MISC_ADDR_SEGOFF 0 /* segment offset */ +#define MCI_MISC_ADDR_LINEAR 1 /* linear address */ +#define MCI_MISC_ADDR_PHYS 2 /* physical address */ +#define MCI_MISC_ADDR_MEM 3 /* memory address */ +#define MCI_MISC_ADDR_GENERIC 7 /* generic */ + +/* CTL2 register defines */ +#define MCI_CTL2_CMCI_EN (1ULL << 30) +#define MCI_CTL2_CMCI_THRESHOLD_MASK 0x7fffULL #define MCJ_CTX_MASK 3 #define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) @@ -108,8 +135,18 @@ struct mce_log { #define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) #define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) + #ifdef __KERNEL__ +struct mca_config { + bool dont_log_ce; + u8 banks; + int tolerant; +}; + +extern void mce_register_decode_chain(struct notifier_block *nb); +extern void mce_unregister_decode_chain(struct notifier_block *nb); + #include #include #include @@ -118,9 +155,11 @@ extern int mce_disabled; extern int mce_p5_enabled; #ifdef CONFIG_X86_MCE -void mcheck_init(struct cpuinfo_x86 *c); +int mcheck_init(void); +void mcheck_cpu_init(struct cpuinfo_x86 *c); #else -static inline void mcheck_init(struct cpuinfo_x86 *c) {} +static inline int mcheck_init(void) { return 0; } +static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {} #endif #ifdef CONFIG_X86_ANCIENT_MCE @@ -133,8 +172,6 @@ static inline void winchip_mcheck_init(s static inline void enable_p5_mce(void) {} #endif -extern void (*x86_mce_decode_callback)(struct mce *m); - void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, mce_dev); @@ -149,6 +186,7 @@ DECLARE_PER_CPU(struct sys_device, mce_d #ifdef CONFIG_X86_MCE_INTEL extern int mce_cmci_disabled; extern int mce_ignore_ce; +extern int mce_bios_cmci_threshold; void mce_intel_feature_init(struct cpuinfo_x86 *c); void cmci_clear(void); void cmci_reenable(void); @@ -191,6 +229,9 @@ void mce_notify_process(void); DECLARE_PER_CPU(struct mce, injectm); extern struct file_operations mce_chrdev_ops; +/* Disable CMCI/polling for MCA bank claimed by firmware */ +extern void mce_disable_bank(int bank); + /* * Exception handler */ @@ -214,5 +255,19 @@ void intel_init_thermal(struct cpuinfo_x void mce_log_therm_throt_event(__u64 status); +#ifdef CONFIG_X86_THERMAL_VECTOR +extern void mcheck_intel_therm_init(void); +#else +static inline void mcheck_intel_therm_init(void) { } +#endif + +/* + * Used by APEI to report memory error via /dev/mcelog + */ + +struct cper_sec_mem_err; +extern void apei_mce_report_mem_error(int corrected, + struct cper_sec_mem_err *mem_err); + #endif /* __KERNEL__ */ #endif /* _ASM_X86_MCE_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/microcode.h linux-2.6.32.ovz/arch/x86/include/asm/microcode.h --- linux-2.6.32/arch/x86/include/asm/microcode.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/microcode.h 2015-03-10 13:21:02.000000000 -0700 @@ -12,6 +12,8 @@ struct device; enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; struct microcode_ops { + void (*init)(struct device *device); + void (*fini)(void); enum ucode_state (*request_microcode_user) (int cpu, const void __user *buf, size_t size); diff -uprN linux-2.6.32/arch/x86/include/asm/mm_track.h linux-2.6.32.ovz/arch/x86/include/asm/mm_track.h --- linux-2.6.32/arch/x86/include/asm/mm_track.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/mm_track.h 2015-03-10 13:21:12.000000000 -0700 @@ -0,0 +1,98 @@ +/* + * Routines and structures for building a bitmap of + * dirty pages in a live system. For use in memory mirroring + * or migration applications. + * + * Copyright (C) 2006, 2010 Stratus Technologies Bermuda Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef __X86_64_MMTRACK_H__ +#define __X86_64_MMTRACK_H__ + +#ifndef CONFIG_TRACK_DIRTY_PAGES + +static inline void mm_track_pte(pte_t *ptep) {} +static inline void mm_track_pmd(pmd_t *pmdp) {} +static inline void mm_track_pud(pud_t *pudp) {} +static inline void mm_track_pgd(pgd_t *pgdp) {} +static inline void mm_track_phys(void *physp) {} + +#else + +#include +#include + /* + * For memory-tracking purposes, if active is true (non-zero), the other + * elements of the structure are available for use. Each time mm_track_pte + * is called, it increments count and sets a bit in the bitvector table. + * Each bit in the bitvector represents a physical page in memory. + * + * This is declared in arch/x86_64/mm/track.c. + * + * The in_use element is used in the code which drives the memory tracking + * environment. When tracking is complete, the vector may be freed, but + * only after the active flag is set to zero and the in_use count goes to + * zero. + * + * The count element indicates how many pages have been stored in the + * bitvector. This is an optimization to avoid counting the bits in the + * vector between harvest operations. + */ +struct mm_tracker { + int active; /* non-zero if this structure in use */ + atomic_t count; /* number of pages tracked by mm_track() */ + unsigned long *vector; /* bit vector of modified pages */ + unsigned long bitcnt; /* number of bits in vector */ +}; +extern struct mm_tracker mm_tracking_struct; + +extern void do_mm_track_pte(void *); +extern void do_mm_track_pmd(void *); +extern void do_mm_track_pud(void *); +extern void do_mm_track_pgd(void *); +extern void do_mm_track_phys(void *); + +/* + * The mm_track routine is needed by macros in pgtable.h + */ +static inline void mm_track_pte(pte_t *ptep) +{ + if (unlikely(mm_tracking_struct.active)) + do_mm_track_pte(ptep); +} +static inline void mm_track_pmd(pmd_t *pmdp) +{ + if (unlikely(mm_tracking_struct.active)) + do_mm_track_pmd(pmdp); +} +static inline void mm_track_pud(pud_t *pudp) +{ + if (unlikely(mm_tracking_struct.active)) + do_mm_track_pud(pudp); +} +static inline void mm_track_pgd(pgd_t *pgdp) +{ + if (unlikely(mm_tracking_struct.active)) + do_mm_track_pgd(pgdp); +} +static inline void mm_track_phys(void *physp) +{ + if (unlikely(mm_tracking_struct.active)) + do_mm_track_phys(physp); +} +#endif /* CONFIG_TRACK_DIRTY_PAGES */ + +#endif /* __X86_64_MMTRACK_H__ */ diff -uprN linux-2.6.32/arch/x86/include/asm/mmu_context.h linux-2.6.32.ovz/arch/x86/include/asm/mmu_context.h --- linux-2.6.32/arch/x86/include/asm/mmu_context.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/mmu_context.h 2015-03-10 13:24:06.000000000 -0700 @@ -36,8 +36,6 @@ static inline void switch_mm(struct mm_s unsigned cpu = smp_processor_id(); if (likely(prev != next)) { - /* stop flush ipis for the previous mm */ - cpumask_clear_cpu(cpu, mm_cpumask(prev)); #ifdef CONFIG_SMP percpu_write(cpu_tlbstate.state, TLBSTATE_OK); percpu_write(cpu_tlbstate.active_mm, next); @@ -47,19 +45,28 @@ static inline void switch_mm(struct mm_s /* Re-load page tables */ load_cr3(next->pgd); - /* - * load the LDT, if the LDT is different: - */ + /* Stop flush ipis for the previous mm */ + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + + /* Load the LDT, if the LDT is different: */ if (unlikely(prev->context.ldt != next->context.ldt)) load_LDT_nolock(&next->context); } #ifdef CONFIG_SMP - else { + else { percpu_write(cpu_tlbstate.state, TLBSTATE_OK); BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); - if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { - /* We were in lazy tlb mode and leave_mm disabled + if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * On established mms, the mm_cpumask is only changed + * from irq context, from ptep_clear_flush() while in + * lazy tlb mode, and here. Irqs are blocked during + * schedule, protecting us from simultaneous changes. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + /* + * We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. */ diff -uprN linux-2.6.32/arch/x86/include/asm/mmu.h linux-2.6.32.ovz/arch/x86/include/asm/mmu.h --- linux-2.6.32/arch/x86/include/asm/mmu.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/mmu.h 2015-03-10 13:20:57.000000000 -0700 @@ -7,12 +7,19 @@ /* * The x86 doesn't have a mmu context, but * we put the segment information here. + * + * exec_limit is used to track the range PROT_EXEC + * mappings span. */ typedef struct { void *ldt; int size; struct mutex lock; void *vdso; +#ifdef CONFIG_X86_32 + struct desc_struct user_cs; + unsigned long exec_limit; +#endif } mm_context_t; #ifdef CONFIG_SMP diff -uprN linux-2.6.32/arch/x86/include/asm/mmzone_32.h linux-2.6.32.ovz/arch/x86/include/asm/mmzone_32.h --- linux-2.6.32/arch/x86/include/asm/mmzone_32.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/mmzone_32.h 2015-03-10 13:22:35.000000000 -0700 @@ -68,17 +68,6 @@ static inline int pfn_to_nid(unsigned lo #endif } -/* - * Following are macros that each numa implmentation must define. - */ - -#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) -#define node_end_pfn(nid) \ -({ \ - pg_data_t *__pgdat = NODE_DATA(nid); \ - __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \ -}) - static inline int pfn_valid(int pfn) { int nid = pfn_to_nid(pfn); diff -uprN linux-2.6.32/arch/x86/include/asm/mmzone_64.h linux-2.6.32.ovz/arch/x86/include/asm/mmzone_64.h --- linux-2.6.32/arch/x86/include/asm/mmzone_64.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/mmzone_64.h 2015-03-10 13:22:35.000000000 -0700 @@ -36,10 +36,6 @@ static inline __attribute__((pure)) int #define NODE_DATA(nid) (node_data[nid]) -#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) -#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ - NODE_DATA(nid)->node_spanned_pages) - #ifdef CONFIG_NUMA_EMU #define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024) #define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL)) diff -uprN linux-2.6.32/arch/x86/include/asm/mpspec.h linux-2.6.32.ovz/arch/x86/include/asm/mpspec.h --- linux-2.6.32/arch/x86/include/asm/mpspec.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/mpspec.h 2015-03-10 13:22:27.000000000 -0700 @@ -6,7 +6,7 @@ #include #include -extern int apic_version[MAX_APICS]; +extern int apic_version[]; extern int pic_mode; #ifdef CONFIG_X86_32 diff -uprN linux-2.6.32/arch/x86/include/asm/mshyperv.h linux-2.6.32.ovz/arch/x86/include/asm/mshyperv.h --- linux-2.6.32/arch/x86/include/asm/mshyperv.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/mshyperv.h 2015-03-10 13:24:02.000000000 -0700 @@ -0,0 +1,21 @@ +#ifndef _ASM_X86_MSHYPER_H +#define _ASM_X86_MSHYPER_H + +#include +#include + +struct ms_hyperv_info { + u32 features; + u32 hints; +}; + +extern struct ms_hyperv_info ms_hyperv; + +void hyperv_callback_vector(void); +#ifdef CONFIG_TRACING +#define trace_hyperv_callback_vector hyperv_callback_vector +#endif +void hyperv_vector_handler(struct pt_regs *regs); +void hv_register_vmbus_handler(int irq, irq_handler_t handler); + +#endif diff -uprN linux-2.6.32/arch/x86/include/asm/msr.h linux-2.6.32.ovz/arch/x86/include/asm/msr.h --- linux-2.6.32/arch/x86/include/asm/msr.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/msr.h 2015-03-10 13:24:14.000000000 -0700 @@ -27,6 +27,18 @@ struct msr { }; }; +struct msr_info { + u32 msr_no; + struct msr reg; + struct msr *msrs; + int err; +}; + +struct msr_regs_info { + u32 *regs; + int err; +}; + static inline unsigned long long native_read_tscp(unsigned int *aux) { unsigned long low, high; @@ -225,6 +237,8 @@ do { \ (high) = (u32)(_l >> 32); \ } while (0) +#define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) + #define rdtscp(low, high, aux) \ do { \ unsigned long long _val = native_read_tscp(&(aux)); \ @@ -244,13 +258,20 @@ do { #define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0) +struct msr *msrs_alloc(void); +void msrs_free(struct msr *msrs); + #ifdef CONFIG_SMP int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); -void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); -void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); +int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); +int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q); +void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); +void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); +int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); +int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q); int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); #else /* CONFIG_SMP */ @@ -264,6 +285,16 @@ static inline int wrmsr_on_cpu(unsigned wrmsr(msr_no, l, h); return 0; } +static inline int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +{ + rdmsrl(msr_no, *q); + return 0; +} +static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +{ + wrmsrl(msr_no, q); + return 0; +} static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no, struct msr *msrs) { @@ -283,6 +314,14 @@ static inline int wrmsr_safe_on_cpu(unsi { return wrmsr_safe(msr_no, l, h); } +static inline int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +{ + return rdmsrl_safe(msr_no, q); +} +static inline int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +{ + return wrmsrl_safe(msr_no, q); +} static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { return rdmsr_safe_regs(regs); diff -uprN linux-2.6.32/arch/x86/include/asm/msr-index.h linux-2.6.32.ovz/arch/x86/include/asm/msr-index.h --- linux-2.6.32/arch/x86/include/asm/msr-index.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/msr-index.h 2015-06-23 04:16:16.000000000 -0700 @@ -12,6 +12,7 @@ #define MSR_FS_BASE 0xc0000100 /* 64bit FS base */ #define MSR_GS_BASE 0xc0000101 /* 64bit GS base */ #define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */ +#define MSR_TSC_AUX 0xc0000103 /* Auxiliary TSC */ /* EFER bits: */ #define _EFER_SCE 0 /* SYSCALL/SYSRET */ @@ -32,9 +33,21 @@ #define MSR_IA32_PERFCTR0 0x000000c1 #define MSR_IA32_PERFCTR1 0x000000c2 #define MSR_FSB_FREQ 0x000000cd +#define MSR_NHM_PLATFORM_INFO 0x000000ce +#define SNB_C1_AUTO_UNDEMOTE (1UL << 27) +#define SNB_C3_AUTO_UNDEMOTE (1UL << 28) + +#define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2 +#define NHM_C3_AUTO_DEMOTE (1UL << 25) +#define NHM_C1_AUTO_DEMOTE (1UL << 26) +#define ATM_LNC_C6_AUTO_DEMOTE (1UL << 25) + +#define MSR_PLATFORM_INFO 0x000000ce #define MSR_MTRRcap 0x000000fe #define MSR_IA32_BBL_CR_CTL 0x00000119 +#define MSR_IA32_BBL_CR_CTL3 0x0000011e +#define MSR_MISC_FEATURES_ENABLES 0x00000140 #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 @@ -44,9 +57,22 @@ #define MSR_IA32_MCG_STATUS 0x0000017a #define MSR_IA32_MCG_CTL 0x0000017b +#define MSR_OFFCORE_RSP_0 0x000001a6 +#define MSR_OFFCORE_RSP_1 0x000001a7 +#define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad +#define MSR_IVT_TURBO_RATIO_LIMIT 0x000001ae + +#define MSR_LBR_SELECT 0x000001c8 +#define MSR_LBR_TOS 0x000001c9 +#define MSR_LBR_NHM_FROM 0x00000680 +#define MSR_LBR_NHM_TO 0x000006c0 +#define MSR_LBR_CORE_FROM 0x00000040 +#define MSR_LBR_CORE_TO 0x00000060 + #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 +#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 #define MSR_MTRRfix64K_00000 0x00000250 #define MSR_MTRRfix16K_80000 0x00000258 @@ -70,40 +96,88 @@ #define MSR_IA32_LASTINTTOIP 0x000001de /* DEBUGCTLMSR bits (others vary by model): */ -#define _DEBUGCTLMSR_LBR 0 /* last branch recording */ -#define _DEBUGCTLMSR_BTF 1 /* single-step on branches */ - -#define DEBUGCTLMSR_LBR (1UL << _DEBUGCTLMSR_LBR) -#define DEBUGCTLMSR_BTF (1UL << _DEBUGCTLMSR_BTF) +#define DEBUGCTLMSR_LBR (1UL << 0) /* last branch recording */ +#define DEBUGCTLMSR_BTF (1UL << 1) /* single-step on branches */ +#define DEBUGCTLMSR_TR (1UL << 6) +#define DEBUGCTLMSR_BTS (1UL << 7) +#define DEBUGCTLMSR_BTINT (1UL << 8) +#define DEBUGCTLMSR_BTS_OFF_OS (1UL << 9) +#define DEBUGCTLMSR_BTS_OFF_USR (1UL << 10) +#define DEBUGCTLMSR_FREEZE_LBRS_ON_PMI (1UL << 11) #define MSR_IA32_MC0_CTL 0x00000400 #define MSR_IA32_MC0_STATUS 0x00000401 #define MSR_IA32_MC0_ADDR 0x00000402 #define MSR_IA32_MC0_MISC 0x00000403 +/* C-state Residency Counters */ +#define MSR_PKG_C3_RESIDENCY 0x000003f8 +#define MSR_PKG_C6_RESIDENCY 0x000003f9 +#define MSR_PKG_C7_RESIDENCY 0x000003fa +#define MSR_CORE_C3_RESIDENCY 0x000003fc +#define MSR_CORE_C6_RESIDENCY 0x000003fd +#define MSR_CORE_C7_RESIDENCY 0x000003fe +#define MSR_PKG_C2_RESIDENCY 0x0000060d +#define MSR_PKG_C8_RESIDENCY 0x00000630 +#define MSR_PKG_C9_RESIDENCY 0x00000631 +#define MSR_PKG_C10_RESIDENCY 0x00000632 + +/* Run Time Average Power Limiting (RAPL) Interface */ + +#define MSR_RAPL_POWER_UNIT 0x00000606 + +#define MSR_PKG_POWER_LIMIT 0x00000610 +#define MSR_PKG_ENERGY_STATUS 0x00000611 +#define MSR_PKG_PERF_STATUS 0x00000613 +#define MSR_PKG_POWER_INFO 0x00000614 + +#define MSR_DRAM_POWER_LIMIT 0x00000618 +#define MSR_DRAM_ENERGY_STATUS 0x00000619 +#define MSR_DRAM_PERF_STATUS 0x0000061b +#define MSR_DRAM_POWER_INFO 0x0000061c + +#define MSR_PP0_POWER_LIMIT 0x00000638 +#define MSR_PP0_ENERGY_STATUS 0x00000639 +#define MSR_PP0_POLICY 0x0000063a +#define MSR_PP0_PERF_STATUS 0x0000063b + +#define MSR_PP1_POWER_LIMIT 0x00000640 +#define MSR_PP1_ENERGY_STATUS 0x00000641 +#define MSR_PP1_POLICY 0x00000642 + +#define MSR_CORE_C1_RES 0x00000660 + +#define MSR_AMD64_MC0_MASK 0xc0010044 + #define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) #define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) #define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) #define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) +#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x)) + /* These are consecutive and not in the normal 4er MCE bank block */ #define MSR_IA32_MC0_CTL2 0x00000280 #define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) -#define CMCI_EN (1ULL << 30) -#define CMCI_THRESHOLD_MASK 0xffffULL - #define MSR_P6_PERFCTR0 0x000000c1 #define MSR_P6_PERFCTR1 0x000000c2 #define MSR_P6_EVNTSEL0 0x00000186 #define MSR_P6_EVNTSEL1 0x00000187 +/* Alternative perfctr range with full access. */ +#define MSR_IA32_PMC0 0x000004c1 + /* AMD64 MSRs. Not complete. See the architecture manual for a more complete list. */ #define MSR_AMD64_PATCH_LEVEL 0x0000008b +#define MSR_AMD64_TSC_RATIO 0xc0000104 #define MSR_AMD64_NB_CFG 0xc001001f #define MSR_AMD64_PATCH_LOADER 0xc0010020 +#define MSR_AMD64_DC_CFG 0xc0011022 +#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 +#define MSR_AMD64_OSVW_STATUS 0xc0010141 #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 #define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 @@ -115,6 +189,17 @@ #define MSR_AMD64_IBSDCLINAD 0xc0011038 #define MSR_AMD64_IBSDCPHYSAD 0xc0011039 #define MSR_AMD64_IBSCTL 0xc001103a +#define MSR_AMD64_IBSBRTARGET 0xc001103b + +/* Fam 16h MSRs */ +#define MSR_F16H_L2I_PERF_CTL 0xc0010230 +#define MSR_F16H_L2I_PERF_CTR 0xc0010231 + +/* Fam 15h MSRs */ +#define MSR_F15H_PERF_CTL 0xc0010200 +#define MSR_F15H_PERF_CTR 0xc0010201 +#define MSR_F15H_NB_PERF_CTL 0xc0010240 +#define MSR_F15H_NB_PERF_CTR 0xc0010241 /* Fam 10h MSRs */ #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 @@ -123,6 +208,7 @@ #define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2 #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff #define FAM10H_MMIO_CONF_BASE_SHIFT 20 +#define MSR_FAM10H_NODE_ID 0xc001100c /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a @@ -193,16 +279,23 @@ #define MSR_IA32_TSC 0x00000010 #define MSR_IA32_PLATFORM_ID 0x00000017 #define MSR_IA32_EBL_CR_POWERON 0x0000002a +#define MSR_EBC_FREQUENCY_ID 0x0000002c +#define MSR_SMI_COUNT 0x00000034 #define MSR_IA32_FEATURE_CONTROL 0x0000003a -#define FEATURE_CONTROL_LOCKED (1<<0) -#define FEATURE_CONTROL_VMXON_ENABLED (1<<2) +#define MSR_IA32_XSS 0x00000da0 + +#define FEATURE_CONTROL_LOCKED (1<<0) +#define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) +#define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) #define MSR_IA32_APICBASE 0x0000001b #define MSR_IA32_APICBASE_BSP (1<<8) #define MSR_IA32_APICBASE_ENABLE (1<<11) #define MSR_IA32_APICBASE_BASE (0xfffff<<12) +#define MSR_IA32_TSCDEADLINE 0x000006e0 + #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b @@ -215,12 +308,14 @@ #define MSR_IA32_THERM_CONTROL 0x0000019a #define MSR_IA32_THERM_INTERRUPT 0x0000019b -#define THERM_INT_LOW_ENABLE (1 << 0) -#define THERM_INT_HIGH_ENABLE (1 << 1) +#define THERM_INT_HIGH_ENABLE (1 << 0) +#define THERM_INT_LOW_ENABLE (1 << 1) +#define THERM_INT_PLN_ENABLE (1 << 24) #define MSR_IA32_THERM_STATUS 0x0000019c #define THERM_STATUS_PROCHOT (1 << 0) +#define THERM_STATUS_POWER_LIMIT (1 << 10) #define MSR_THERM2_CTL 0x0000019d @@ -228,6 +323,24 @@ #define MSR_IA32_MISC_ENABLE 0x000001a0 +#define MSR_IA32_TEMPERATURE_TARGET 0x000001a2 + +#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0 +#define ENERGY_PERF_BIAS_PERFORMANCE 0 +#define ENERGY_PERF_BIAS_NORMAL 6 +#define ENERGY_PERF_BIAS_POWERSAVE 15 + +#define MSR_IA32_PACKAGE_THERM_STATUS 0x000001b1 + +#define PACKAGE_THERM_STATUS_PROCHOT (1 << 0) +#define PACKAGE_THERM_STATUS_POWER_LIMIT (1 << 10) + +#define MSR_IA32_PACKAGE_THERM_INTERRUPT 0x000001b2 + +#define PACKAGE_THERM_INT_HIGH_ENABLE (1 << 0) +#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1) +#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24) + /* MISC_ENABLE bits: architectural */ #define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) #define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) @@ -355,6 +468,8 @@ #define MSR_P4_U2L_ESCR0 0x000003b0 #define MSR_P4_U2L_ESCR1 0x000003b1 +#define MSR_P4_PEBS_MATRIX_VERT 0x000003f2 + /* Intel Core-based CPU performance counters */ #define MSR_CORE_PERF_FIXED_CTR0 0x00000309 #define MSR_CORE_PERF_FIXED_CTR1 0x0000030a diff -uprN linux-2.6.32/arch/x86/include/asm/mtrr.h linux-2.6.32.ovz/arch/x86/include/asm/mtrr.h --- linux-2.6.32/arch/x86/include/asm/mtrr.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/mtrr.h 2015-03-10 13:24:10.000000000 -0700 @@ -107,7 +107,10 @@ struct mtrr_state_type { #ifdef __KERNEL__ -/* The following functions are for use by other drivers */ +/* + * The following functions are for use by other drivers that cannot use + * arch_phys_wc_add and arch_phys_wc_del. + */ # ifdef CONFIG_MTRR extern u8 mtrr_type_lookup(u64 addr, u64 end); extern void mtrr_save_fixed_ranges(void *); @@ -126,6 +129,7 @@ extern void mtrr_aps_init(void); extern void mtrr_bp_restore(void); extern int mtrr_trim_uncached_memory(unsigned long end_pfn); extern int amd_special_default_mtrr(void); +extern int phys_wc_to_mtrr_index(int handle); # else static inline u8 mtrr_type_lookup(u64 addr, u64 end) { @@ -161,6 +165,10 @@ static inline int mtrr_trim_uncached_mem static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) { } +static inline int phys_wc_to_mtrr_index(int handle) +{ + return -1; +} #define mtrr_ap_init() do {} while (0) #define mtrr_bp_init() do {} while (0) diff -uprN linux-2.6.32/arch/x86/include/asm/mutex_32.h linux-2.6.32.ovz/arch/x86/include/asm/mutex_32.h --- linux-2.6.32/arch/x86/include/asm/mutex_32.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/mutex_32.h 2015-03-10 13:24:10.000000000 -0700 @@ -42,17 +42,14 @@ do { \ * __mutex_fastpath_lock_retval - try to take the lock by moving the count * from 1 to a 0 value * @count: pointer of type atomic_t - * @fail_fn: function to call if the original value was not 1 * - * Change the count from 1 to a value lower than 1, and call if it - * wasn't 1 originally. This function returns 0 if the fastpath succeeds, - * or anything the slow path function returns + * Change the count from 1 to a value lower than 1. This function returns 0 + * if the fastpath succeeds, or -1 otherwise. */ -static inline int __mutex_fastpath_lock_retval(atomic_t *count, - int (*fail_fn)(atomic_t *)) +static inline int __mutex_fastpath_lock_retval(atomic_t *count) { if (unlikely(atomic_dec_return(count) < 0)) - return fail_fn(count); + return -1; else return 0; } diff -uprN linux-2.6.32/arch/x86/include/asm/mutex_64.h linux-2.6.32.ovz/arch/x86/include/asm/mutex_64.h --- linux-2.6.32/arch/x86/include/asm/mutex_64.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/mutex_64.h 2015-03-10 13:24:10.000000000 -0700 @@ -37,17 +37,14 @@ do { \ * __mutex_fastpath_lock_retval - try to take the lock by moving the count * from 1 to a 0 value * @count: pointer of type atomic_t - * @fail_fn: function to call if the original value was not 1 * - * Change the count from 1 to a value lower than 1, and call if - * it wasn't 1 originally. This function returns 0 if the fastpath succeeds, - * or anything the slow path function returns + * Change the count from 1 to a value lower than 1. This function returns 0 + * if the fastpath succeeds, or -1 otherwise. */ -static inline int __mutex_fastpath_lock_retval(atomic_t *count, - int (*fail_fn)(atomic_t *)) +static inline int __mutex_fastpath_lock_retval(atomic_t *count) { if (unlikely(atomic_dec_return(count) < 0)) - return fail_fn(count); + return -1; else return 0; } diff -uprN linux-2.6.32/arch/x86/include/asm/mwait.h linux-2.6.32.ovz/arch/x86/include/asm/mwait.h --- linux-2.6.32/arch/x86/include/asm/mwait.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/mwait.h 2015-03-10 13:22:11.000000000 -0700 @@ -0,0 +1,15 @@ +#ifndef _ASM_X86_MWAIT_H +#define _ASM_X86_MWAIT_H + +#define MWAIT_SUBSTATE_MASK 0xf +#define MWAIT_CSTATE_MASK 0xf +#define MWAIT_SUBSTATE_SIZE 4 +#define MWAIT_MAX_NUM_CSTATES 8 + +#define CPUID_MWAIT_LEAF 5 +#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1 +#define CPUID5_ECX_INTERRUPT_BREAK 0x2 + +#define MWAIT_ECX_INTERRUPT_BREAK 0x1 + +#endif /* _ASM_X86_MWAIT_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/nmi.h linux-2.6.32.ovz/arch/x86/include/asm/nmi.h --- linux-2.6.32/arch/x86/include/asm/nmi.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/nmi.h 2015-03-10 13:22:31.000000000 -0700 @@ -5,7 +5,7 @@ #include #include -#ifdef ARCH_HAS_NMI_WATCHDOG +#ifdef CONFIG_X86_LOCAL_APIC /** * do_nmi_callback @@ -17,7 +17,9 @@ int do_nmi_callback(struct pt_regs *regs extern void die_nmi(char *str, struct pt_regs *regs, int do_panic); extern int check_nmi_watchdog(void); +#if !defined(CONFIG_LOCKUP_DETECTOR) extern int nmi_watchdog_enabled; +#endif extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); extern int avail_to_resrv_perfctr_nmi(unsigned int); extern int reserve_perfctr_nmi(unsigned int); @@ -34,6 +36,7 @@ extern void cpu_nmi_set_wd_enabled(void) extern atomic_t nmi_active; extern unsigned int nmi_watchdog; +#define NMI_DEFAULT -1 #define NMI_NONE 0 #define NMI_IO_APIC 1 #define NMI_LOCAL_APIC 2 @@ -43,10 +46,31 @@ struct ctl_table; extern int proc_nmi_enabled(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int unknown_nmi_panic; +extern void nmi_watchdog_default(void); void arch_trigger_all_cpu_backtrace(void); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace +/* + * Define some priorities for the nmi notifier call chain. + * + * Create a local nmi bit that has a higher priority than + * external nmis, because the local ones are more frequent. + * + * Also setup some default high/normal/low settings for + * subsystems to registers with. Using 4 bits to seperate + * the priorities. This can go alot higher if needed be. + */ + +#define NMI_LOCAL_SHIFT 16 /* randomly picked */ +#define NMI_LOCAL_BIT (1ULL << NMI_LOCAL_SHIFT) +#define NMI_HIGH_PRIOR (1ULL << 8) +#define NMI_NORMAL_PRIOR (1ULL << 4) +#define NMI_LOW_PRIOR (1ULL << 0) +#define NMI_LOCAL_HIGH_PRIOR (NMI_LOCAL_BIT | NMI_HIGH_PRIOR) +#define NMI_LOCAL_NORMAL_PRIOR (NMI_LOCAL_BIT | NMI_NORMAL_PRIOR) +#define NMI_LOCAL_LOW_PRIOR (NMI_LOCAL_BIT | NMI_LOW_PRIOR) + static inline void localise_nmi_watchdog(void) { if (nmi_watchdog == NMI_IO_APIC) diff -uprN linux-2.6.32/arch/x86/include/asm/numa_64.h linux-2.6.32.ovz/arch/x86/include/asm/numa_64.h --- linux-2.6.32/arch/x86/include/asm/numa_64.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/numa_64.h 2015-03-10 13:24:13.000000000 -0700 @@ -12,8 +12,6 @@ struct bootnode { extern int compute_hash_shift(struct bootnode *nodes, int numblks, int *nodeids); -#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) - extern void numa_init_array(void); extern int numa_off; diff -uprN linux-2.6.32/arch/x86/include/asm/page_32_types.h linux-2.6.32.ovz/arch/x86/include/asm/page_32_types.h --- linux-2.6.32/arch/x86/include/asm/page_32_types.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/page_32_types.h 2015-03-10 13:24:15.000000000 -0700 @@ -22,7 +22,6 @@ #endif #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) -#define STACKFAULT_STACK 0 #define DOUBLEFAULT_STACK 1 #define NMI_STACK 0 #define DEBUG_STACK 0 diff -uprN linux-2.6.32/arch/x86/include/asm/page_64_types.h linux-2.6.32.ovz/arch/x86/include/asm/page_64_types.h --- linux-2.6.32/arch/x86/include/asm/page_64_types.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/page_64_types.h 2015-06-23 04:16:16.000000000 -0700 @@ -1,7 +1,12 @@ #ifndef _ASM_X86_PAGE_64_DEFS_H #define _ASM_X86_PAGE_64_DEFS_H -#define THREAD_ORDER 1 +#ifdef CONFIG_16KSTACKS +# define THREAD_ORDER 2 +#else +# define THREAD_ORDER 1 +#endif + #define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) #define CURRENT_MASK (~(THREAD_SIZE - 1)) @@ -14,12 +19,11 @@ #define IRQ_STACK_ORDER 2 #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) -#define STACKFAULT_STACK 1 -#define DOUBLEFAULT_STACK 2 -#define NMI_STACK 3 -#define DEBUG_STACK 4 -#define MCE_STACK 5 -#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ +#define DOUBLEFAULT_STACK 1 +#define NMI_STACK 2 +#define DEBUG_STACK 3 +#define MCE_STACK 4 +#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ #define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT) #define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1)) diff -uprN linux-2.6.32/arch/x86/include/asm/page_types.h linux-2.6.32.ovz/arch/x86/include/asm/page_types.h --- linux-2.6.32/arch/x86/include/asm/page_types.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/page_types.h 2015-03-10 13:21:18.000000000 -0700 @@ -40,7 +40,6 @@ #ifndef __ASSEMBLY__ -extern int page_is_ram(unsigned long pagenr); extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; diff -uprN linux-2.6.32/arch/x86/include/asm/paravirt.h linux-2.6.32.ovz/arch/x86/include/asm/paravirt.h --- linux-2.6.32/arch/x86/include/asm/paravirt.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/paravirt.h 2015-03-10 13:24:14.000000000 -0700 @@ -112,7 +112,7 @@ static inline void raw_safe_halt(void) static inline void halt(void) { - PVOP_VCALL0(pv_irq_ops.safe_halt); + PVOP_VCALL0(pv_irq_ops.halt); } static inline void wbinvd(void) @@ -230,6 +230,13 @@ static inline unsigned long long paravir return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); } +extern bool paravirt_steal_enabled; + +static inline u64 paravirt_steal_clock(int cpu) +{ + return PVOP_CALL1(u64, pv_time_ops.steal_clock, cpu); +} + static inline unsigned long long paravirt_read_pmc(int counter) { return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter); @@ -242,6 +249,8 @@ do { \ high = _l >> 32; \ } while (0) +#define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) + static inline unsigned long long paravirt_rdtscp(unsigned int *aux) { return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux); @@ -289,6 +298,12 @@ static inline void set_ldt(const void *a { PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries); } +#ifdef CONFIG_X86_32 +static inline void load_user_cs_desc(unsigned int cpu, struct mm_struct *mm) +{ + PVOP_VCALL2(pv_cpu_ops.load_user_cs_desc, cpu, mm); +} +#endif /*CONFIG_X86_32*/ static inline void store_gdt(struct desc_ptr *dtr) { PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr); @@ -435,20 +450,16 @@ static inline void paravirt_release_pud( PVOP_VCALL1(pv_mmu_ops.release_pud, pfn); } -#ifdef CONFIG_HIGHPTE -static inline void *kmap_atomic_pte(struct page *page, enum km_type type) -{ - unsigned long ret; - ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type); - return (void *)ret; -} -#endif - static inline void pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); } +static inline void pmd_update(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp); +} static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -456,6 +467,12 @@ static inline void pte_update_defer(stru PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); } +static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp); +} + static inline pte_t __pte(pteval_t val) { pteval_t ret; @@ -557,6 +574,18 @@ static inline void set_pte_at(struct mm_ PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ + if (sizeof(pmdval_t) > sizeof(long)) + /* 5 arg words */ + pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd); + else + PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, pmd.pmd); +} +#endif + static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { pmdval_t val = native_pmd_val(pmd); diff -uprN linux-2.6.32/arch/x86/include/asm/paravirt_types.h linux-2.6.32.ovz/arch/x86/include/asm/paravirt_types.h --- linux-2.6.32/arch/x86/include/asm/paravirt_types.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/paravirt_types.h 2015-03-10 13:23:14.000000000 -0700 @@ -89,7 +89,7 @@ struct pv_lazy_ops { struct pv_time_ops { unsigned long long (*sched_clock)(void); - unsigned long (*get_tsc_khz)(void); + unsigned long long (*steal_clock)(int cpu); }; struct pv_cpu_ops { @@ -118,6 +118,9 @@ struct pv_cpu_ops { void (*store_gdt)(struct desc_ptr *); void (*store_idt)(struct desc_ptr *); void (*set_ldt)(const void *desc, unsigned entries); +#ifdef CONFIG_X86_32 + void (*load_user_cs_desc)(int cpu, struct mm_struct *mm); +#endif unsigned long (*store_tr)(void); void (*load_tls)(struct thread_struct *t, unsigned int cpu); #ifdef CONFIG_X86_64 @@ -266,10 +269,16 @@ struct pv_mmu_ops { void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); + void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmdval); void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void (*pte_update_defer)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); + void (*pmd_update)(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp); + void (*pmd_update_defer)(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp); pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); @@ -304,10 +313,6 @@ struct pv_mmu_ops { #endif /* PAGETABLE_LEVELS == 4 */ #endif /* PAGETABLE_LEVELS >= 3 */ -#ifdef CONFIG_HIGHPTE - void *(*kmap_atomic_pte)(struct page *page, enum km_type type); -#endif - struct pv_lazy_ops lazy_mode; /* dom0 ops */ diff -uprN linux-2.6.32/arch/x86/include/asm/pci.h linux-2.6.32.ovz/arch/x86/include/asm/pci.h --- linux-2.6.32/arch/x86/include/asm/pci.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/pci.h 2015-03-10 13:24:08.000000000 -0700 @@ -22,11 +22,6 @@ extern int pci_routeirq; extern int noioapicquirk; extern int noioapicreroute; -/* scan a bus after allocating a pci_sysdata for it */ -extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops, - int node); -extern struct pci_bus *pci_scan_bus_with_sysdata(int busno); - static inline int pci_domain_nr(struct pci_bus *bus) { struct pci_sysdata *sd = bus->sysdata; @@ -56,7 +51,7 @@ extern unsigned long pci_mem_start; #define PCIBIOS_MIN_CARDBUS_IO 0x4000 void pcibios_config_init(void); -struct pci_bus *pcibios_scan_root(int bus); +void pcibios_scan_root(int bus); void pcibios_set_master(struct pci_dev *dev); void pcibios_penalize_isa_irq(int irq, int active); @@ -90,34 +85,6 @@ extern void pci_iommu_alloc(void); #define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) -#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG) - -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \ - dma_addr_t ADDR_NAME; -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \ - __u32 LEN_NAME; -#define pci_unmap_addr(PTR, ADDR_NAME) \ - ((PTR)->ADDR_NAME) -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ - (((PTR)->ADDR_NAME) = (VAL)) -#define pci_unmap_len(PTR, LEN_NAME) \ - ((PTR)->LEN_NAME) -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ - (((PTR)->LEN_NAME) = (VAL)) - -#else - -#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0]; -#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0]; -#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME) -#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \ - do { break; } while (pci_unmap_addr(PTR, ADDR_NAME)) -#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME) -#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \ - do { break; } while (pci_unmap_len(PTR, LEN_NAME)) - -#endif - #endif /* __KERNEL__ */ #ifdef CONFIG_X86_64 @@ -127,6 +94,8 @@ extern void pci_iommu_alloc(void); /* implement the pci_ DMA API in terms of the generic device dma_ one */ #include +#include + /* generic pci stuff */ #include #define PCIBIOS_MAX_MEM_32 0xffffffff diff -uprN linux-2.6.32/arch/x86/include/asm/pci_x86.h linux-2.6.32.ovz/arch/x86/include/asm/pci_x86.h --- linux-2.6.32/arch/x86/include/asm/pci_x86.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/pci_x86.h 2015-03-10 13:24:08.000000000 -0700 @@ -29,6 +29,8 @@ #define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000 #define PCI_HAS_IO_ECS 0x40000 #define PCI_NOASSIGN_ROMS 0x80000 +#define PCI_ROOT_NO_CRS 0x100000 +#define PCI_NOASSIGN_BARS 0x200000 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; @@ -49,9 +51,10 @@ void pcibios_resource_survey(void); /* pci-pc.c */ extern int pcibios_last_bus; -extern struct pci_bus *pci_root_bus; extern struct pci_ops pci_root_ops; +void pcibios_scan_specific_bus(int busn); + /* pci-irq.c */ struct irq_info { diff -uprN linux-2.6.32/arch/x86/include/asm/percpu.h linux-2.6.32.ovz/arch/x86/include/asm/percpu.h --- linux-2.6.32/arch/x86/include/asm/percpu.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/percpu.h 2015-03-10 13:22:05.000000000 -0700 @@ -133,6 +133,29 @@ do { \ ret__; \ }) +#define percpu_unary_op(op, var) \ +({ \ + switch (sizeof(var)) { \ + case 1: \ + asm(op "b "__percpu_arg(0) \ + : "+m" (var)); \ + break; \ + case 2: \ + asm(op "w "__percpu_arg(0) \ + : "+m" (var)); \ + break; \ + case 4: \ + asm(op "l "__percpu_arg(0) \ + : "+m" (var)); \ + break; \ + case 8: \ + asm(op "q "__percpu_arg(0) \ + : "+m" (var)); \ + break; \ + default: __bad_percpu_size(); \ + } \ +}) + /* * percpu_read() makes gcc load the percpu variable every time it is * accessed while percpu_read_stable() allows the value to be cached. @@ -152,6 +175,7 @@ do { \ #define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val) #define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val) #define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val) +#define percpu_inc(var) percpu_unary_op("inc", per_cpu__##var) /* This is not atomic against other CPUs -- CPU preemption needs to be off */ #define x86_test_and_clear_bit_percpu(bit, var) \ diff -uprN linux-2.6.32/arch/x86/include/asm/perf_event.h linux-2.6.32.ovz/arch/x86/include/asm/perf_event.h --- linux-2.6.32/arch/x86/include/asm/perf_event.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/perf_event.h 2015-03-10 13:24:14.000000000 -0700 @@ -5,11 +5,10 @@ * Performance event hw details: */ -#define X86_PMC_MAX_GENERIC 8 -#define X86_PMC_MAX_FIXED 3 +#define INTEL_PMC_MAX_GENERIC 32 +#define INTEL_PMC_MAX_FIXED 3 +#define INTEL_PMC_IDX_FIXED 32 -#define X86_PMC_IDX_GENERIC 0 -#define X86_PMC_IDX_FIXED 32 #define X86_PMC_IDX_MAX 64 #define MSR_ARCH_PERFMON_PERFCTR0 0xc1 @@ -18,23 +17,58 @@ #define MSR_ARCH_PERFMON_EVENTSEL0 0x186 #define MSR_ARCH_PERFMON_EVENTSEL1 0x187 -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) -#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) -#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) -#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) +#define ARCH_PERFMON_EVENTSEL_EVENT 0x000000FFULL +#define ARCH_PERFMON_EVENTSEL_UMASK 0x0000FF00ULL +#define ARCH_PERFMON_EVENTSEL_USR (1ULL << 16) +#define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17) +#define ARCH_PERFMON_EVENTSEL_EDGE (1ULL << 18) +#define ARCH_PERFMON_EVENTSEL_PIN_CONTROL (1ULL << 19) +#define ARCH_PERFMON_EVENTSEL_INT (1ULL << 20) +#define ARCH_PERFMON_EVENTSEL_ANY (1ULL << 21) +#define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22) +#define ARCH_PERFMON_EVENTSEL_INV (1ULL << 23) +#define ARCH_PERFMON_EVENTSEL_CMASK 0xFF000000ULL + +#define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36) +#define AMD_PERFMON_EVENTSEL_GUESTONLY (1ULL << 40) +#define AMD_PERFMON_EVENTSEL_HOSTONLY (1ULL << 41) + +#define AMD64_EVENTSEL_INT_CORE_SEL_SHIFT 37 +#define AMD64_EVENTSEL_INT_CORE_SEL_MASK \ + (0xFULL << AMD64_EVENTSEL_INT_CORE_SEL_SHIFT) + +#define HSW_IN_TX (1ULL << 32) +#define HSW_IN_TX_CHECKPOINTED (1ULL << 33) + +#define AMD64_EVENTSEL_EVENT \ + (ARCH_PERFMON_EVENTSEL_EVENT | (0x0FULL << 32)) +#define INTEL_ARCH_EVENT_MASK \ + (ARCH_PERFMON_EVENTSEL_UMASK | ARCH_PERFMON_EVENTSEL_EVENT) + +#define X86_RAW_EVENT_MASK \ + (ARCH_PERFMON_EVENTSEL_EVENT | \ + ARCH_PERFMON_EVENTSEL_UMASK | \ + ARCH_PERFMON_EVENTSEL_EDGE | \ + ARCH_PERFMON_EVENTSEL_INV | \ + ARCH_PERFMON_EVENTSEL_CMASK) +#define AMD64_RAW_EVENT_MASK \ + (X86_RAW_EVENT_MASK | \ + AMD64_EVENTSEL_EVENT) +#define AMD64_RAW_EVENT_MASK_NB \ + (AMD64_EVENTSEL_EVENT | \ + ARCH_PERFMON_EVENTSEL_UMASK) +#define AMD64_NUM_COUNTERS_NB 4 +#define AMD64_NUM_COUNTERS 4 +#define AMD64_NUM_COUNTERS_CORE 6 -/* - * Includes eventsel and unit mask as well: - */ -#define ARCH_PERFMON_EVENT_MASK 0xffff - -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) -#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 +#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 +#define ARCH_PERFMON_EVENTS_COUNT 7 /* * Intel "Architectural Performance Monitoring" CPUID @@ -43,21 +77,44 @@ union cpuid10_eax { struct { unsigned int version_id:8; - unsigned int num_events:8; + unsigned int num_counters:8; unsigned int bit_width:8; unsigned int mask_length:8; } split; unsigned int full; }; +union cpuid10_ebx { + struct { + unsigned int no_unhalted_core_cycles:1; + unsigned int no_instructions_retired:1; + unsigned int no_unhalted_reference_cycles:1; + unsigned int no_llc_reference:1; + unsigned int no_llc_misses:1; + unsigned int no_branch_instruction_retired:1; + unsigned int no_branch_misses_retired:1; + } split; + unsigned int full; +}; + union cpuid10_edx { struct { - unsigned int num_events_fixed:4; - unsigned int reserved:28; + unsigned int num_counters_fixed:5; + unsigned int bit_width_fixed:8; + unsigned int reserved:19; } split; unsigned int full; }; +struct x86_pmu_capability { + int version; + int num_counters_gp; + int num_counters_fixed; + int bit_width_gp; + int bit_width_fixed; + unsigned int events_mask; + int events_mask_len; +}; /* * Fixed-purpose performance events: @@ -66,23 +123,24 @@ union cpuid10_edx { /* * All 3 fixed-mode PMCs are configured via this single MSR: */ -#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d +#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d /* * The counts are available in three separate MSRs: */ /* Instr_Retired.Any: */ -#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 -#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) +#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 +#define INTEL_PMC_IDX_FIXED_INSTRUCTIONS (INTEL_PMC_IDX_FIXED + 0) /* CPU_CLK_Unhalted.Core: */ -#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a -#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) +#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a +#define INTEL_PMC_IDX_FIXED_CPU_CYCLES (INTEL_PMC_IDX_FIXED + 1) /* CPU_CLK_Unhalted.Ref: */ -#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b -#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) +#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b +#define INTEL_PMC_IDX_FIXED_REF_CYCLES (INTEL_PMC_IDX_FIXED + 2) +#define INTEL_PMC_MSK_FIXED_REF_CYCLES (1ULL << INTEL_PMC_IDX_FIXED_REF_CYCLES) /* * We model BTS tracing as another fixed-mode PMC. @@ -91,18 +149,129 @@ union cpuid10_edx { * values are used by actual fixed events and higher values are used * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. */ -#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) +#define INTEL_PMC_IDX_FIXED_BTS (INTEL_PMC_IDX_FIXED + 16) + +/* + * IBS cpuid feature detection + */ + +#define IBS_CPUID_FEATURES 0x8000001b + +/* + * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but + * bit 0 is used to indicate the existence of IBS. + */ +#define IBS_CAPS_AVAIL (1U<<0) +#define IBS_CAPS_FETCHSAM (1U<<1) +#define IBS_CAPS_OPSAM (1U<<2) +#define IBS_CAPS_RDWROPCNT (1U<<3) +#define IBS_CAPS_OPCNT (1U<<4) +#define IBS_CAPS_BRNTRGT (1U<<5) +#define IBS_CAPS_OPCNTEXT (1U<<6) + +#define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ + | IBS_CAPS_FETCHSAM \ + | IBS_CAPS_OPSAM) + +/* + * IBS APIC setup + */ +#define IBSCTL 0x1cc +#define IBSCTL_LVT_OFFSET_VALID (1ULL<<8) +#define IBSCTL_LVT_OFFSET_MASK 0x0F +/* + * IBS randomization macros + */ +#define IBS_RANDOM_BITS 12 +#define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1) +#define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5)) + +#ifdef CONFIG_X86_LOCAL_APIC +extern u32 get_ibs_caps(void); +#else +static inline u32 get_ibs_caps(void) { return 0; } +#endif + +/* IbsFetchCtl bits/masks */ +#define IBS_FETCH_RAND_EN (1ULL<<57) +#define IBS_FETCH_VAL (1ULL<<49) +#define IBS_FETCH_ENABLE (1ULL<<48) +#define IBS_FETCH_CNT 0xFFFF0000ULL +#define IBS_FETCH_MAX_CNT 0x0000FFFFULL + +/* IbsOpCtl bits */ +#define IBS_OP_CNT_CTL (1ULL<<19) +#define IBS_OP_VAL (1ULL<<18) +#define IBS_OP_ENABLE (1ULL<<17) +#define IBS_OP_MAX_CNT 0x0000FFFFULL +#define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */ #ifdef CONFIG_PERF_EVENTS -extern void init_hw_perf_events(void); extern void perf_events_lapic_init(void); -#define PERF_EVENT_INDEX_OFFSET 0 +/* + * Abuse bit 3 of the cpu eflags register to indicate proper PEBS IP fixups. + * This flag is otherwise unused and ABI specified to be 0, so nobody should + * care what we do with it. + */ +#define PERF_EFLAGS_EXACT (1UL << 3) + +struct pt_regs; +extern unsigned long perf_instruction_pointer(struct pt_regs *regs); +extern unsigned long perf_misc_flags(struct pt_regs *regs); +#define perf_misc_flags(regs) perf_misc_flags(regs) + +#include +/* + * We abuse bit 3 from flags to pass exact information, see perf_misc_flags + * and the comment with PERF_EFLAGS_EXACT. + */ +#define perf_arch_fetch_caller_regs(regs, __ip) { \ + (regs)->ip = (__ip); \ + (regs)->bp = caller_frame_pointer(); \ + (regs)->cs = __KERNEL_CS; \ + regs->flags = 0; \ + asm volatile( \ + _ASM_MOV "%%"_ASM_SP ", %0\n" \ + : "=m" ((regs)->sp) \ + :: "memory" \ + ); \ +} + +struct perf_guest_switch_msr { + unsigned msr; + u64 host, guest; +}; + +extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); +extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); +extern void perf_check_microcode(void); #else -static inline void init_hw_perf_events(void) { } +static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +{ + *nr = 0; + return NULL; +} + +static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) +{ + memset(cap, 0, sizeof(*cap)); +} + static inline void perf_events_lapic_init(void) { } +static inline void perf_check_microcode(void) { } #endif +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_AMD) + extern void amd_pmu_enable_virt(void); + extern void amd_pmu_disable_virt(void); +#else + static inline void amd_pmu_enable_virt(void) { } + static inline void amd_pmu_disable_virt(void) { } +#endif + +#define arch_perf_out_copy_user copy_from_user_nmi + #endif /* _ASM_X86_PERF_EVENT_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/perf_event_p4.h linux-2.6.32.ovz/arch/x86/include/asm/perf_event_p4.h --- linux-2.6.32/arch/x86/include/asm/perf_event_p4.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/perf_event_p4.h 2015-03-10 13:22:38.000000000 -0700 @@ -0,0 +1,876 @@ +/* + * Netburst Performance Events (P4, old Xeon) + */ + +#ifndef PERF_EVENT_P4_H +#define PERF_EVENT_P4_H + +#include +#include + +/* + * NetBurst has performance MSRs shared between + * threads if HT is turned on, ie for both logical + * processors (mem: in turn in Atom with HT support + * perf-MSRs are not shared and every thread has its + * own perf-MSRs set) + */ +#define ARCH_P4_TOTAL_ESCR (46) +#define ARCH_P4_RESERVED_ESCR (2) /* IQ_ESCR(0,1) not always present */ +#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR) +#define ARCH_P4_MAX_CCCR (18) + +#define ARCH_P4_CNTRVAL_BITS (40) +#define ARCH_P4_CNTRVAL_MASK ((1ULL << ARCH_P4_CNTRVAL_BITS) - 1) +#define ARCH_P4_UNFLAGGED_BIT ((1ULL) << (ARCH_P4_CNTRVAL_BITS - 1)) + +#define P4_ESCR_EVENT_MASK 0x7e000000U +#define P4_ESCR_EVENT_SHIFT 25 +#define P4_ESCR_EVENTMASK_MASK 0x01fffe00U +#define P4_ESCR_EVENTMASK_SHIFT 9 +#define P4_ESCR_TAG_MASK 0x000001e0U +#define P4_ESCR_TAG_SHIFT 5 +#define P4_ESCR_TAG_ENABLE 0x00000010U +#define P4_ESCR_T0_OS 0x00000008U +#define P4_ESCR_T0_USR 0x00000004U +#define P4_ESCR_T1_OS 0x00000002U +#define P4_ESCR_T1_USR 0x00000001U + +#define P4_ESCR_EVENT(v) ((v) << P4_ESCR_EVENT_SHIFT) +#define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT) +#define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT) + +#define P4_CCCR_OVF 0x80000000U +#define P4_CCCR_CASCADE 0x40000000U +#define P4_CCCR_OVF_PMI_T0 0x04000000U +#define P4_CCCR_OVF_PMI_T1 0x08000000U +#define P4_CCCR_FORCE_OVF 0x02000000U +#define P4_CCCR_EDGE 0x01000000U +#define P4_CCCR_THRESHOLD_MASK 0x00f00000U +#define P4_CCCR_THRESHOLD_SHIFT 20 +#define P4_CCCR_COMPLEMENT 0x00080000U +#define P4_CCCR_COMPARE 0x00040000U +#define P4_CCCR_ESCR_SELECT_MASK 0x0000e000U +#define P4_CCCR_ESCR_SELECT_SHIFT 13 +#define P4_CCCR_ENABLE 0x00001000U +#define P4_CCCR_THREAD_SINGLE 0x00010000U +#define P4_CCCR_THREAD_BOTH 0x00020000U +#define P4_CCCR_THREAD_ANY 0x00030000U +#define P4_CCCR_RESERVED 0x00000fffU + +#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT) +#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT) + +#define P4_GEN_ESCR_EMASK(class, name, bit) \ + class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT) +#define P4_ESCR_EMASK_BIT(class, name) class##__##name + +/* + * config field is 64bit width and consists of + * HT << 63 | ESCR << 32 | CCCR + * where HT is HyperThreading bit (since ESCR + * has it reserved we may use it for own purpose) + * + * note that this is NOT the addresses of respective + * ESCR and CCCR but rather an only packed value should + * be unpacked and written to a proper addresses + * + * the base idea is to pack as much info as possible + */ +#define p4_config_pack_escr(v) (((u64)(v)) << 32) +#define p4_config_pack_cccr(v) (((u64)(v)) & 0xffffffffULL) +#define p4_config_unpack_escr(v) (((u64)(v)) >> 32) +#define p4_config_unpack_cccr(v) (((u64)(v)) & 0xffffffffULL) + +#define p4_config_unpack_emask(v) \ + ({ \ + u32 t = p4_config_unpack_escr((v)); \ + t = t & P4_ESCR_EVENTMASK_MASK; \ + t = t >> P4_ESCR_EVENTMASK_SHIFT; \ + t; \ + }) + +#define p4_config_unpack_event(v) \ + ({ \ + u32 t = p4_config_unpack_escr((v)); \ + t = t & P4_ESCR_EVENT_MASK; \ + t = t >> P4_ESCR_EVENT_SHIFT; \ + t; \ + }) + +#define P4_CONFIG_HT_SHIFT 63 +#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) + +/* + * If an event has alias it should be marked + * with a special bit. (Don't forget to check + * P4_PEBS_CONFIG_MASK and related bits on + * modification.) + */ +#define P4_CONFIG_ALIASABLE (1 << 9) + +/* + * The bits we allow to pass for RAW events + */ +#define P4_CONFIG_MASK_ESCR \ + P4_ESCR_EVENT_MASK | \ + P4_ESCR_EVENTMASK_MASK | \ + P4_ESCR_TAG_MASK | \ + P4_ESCR_TAG_ENABLE + +#define P4_CONFIG_MASK_CCCR \ + P4_CCCR_EDGE | \ + P4_CCCR_THRESHOLD_MASK | \ + P4_CCCR_COMPLEMENT | \ + P4_CCCR_COMPARE | \ + P4_CCCR_THREAD_ANY | \ + P4_CCCR_RESERVED + +/* some dangerous bits are reserved for kernel internals */ +#define P4_CONFIG_MASK \ + (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \ + (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR)) + +/* + * In case of event aliasing we need to preserve some + * caller bits, otherwise the mapping won't be complete. + */ +#define P4_CONFIG_EVENT_ALIAS_MASK \ + (p4_config_pack_escr(P4_CONFIG_MASK_ESCR) | \ + p4_config_pack_cccr(P4_CCCR_EDGE | \ + P4_CCCR_THRESHOLD_MASK | \ + P4_CCCR_COMPLEMENT | \ + P4_CCCR_COMPARE)) + +#define P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS \ + ((P4_CONFIG_HT) | \ + p4_config_pack_escr(P4_ESCR_T0_OS | \ + P4_ESCR_T0_USR | \ + P4_ESCR_T1_OS | \ + P4_ESCR_T1_USR) | \ + p4_config_pack_cccr(P4_CCCR_OVF | \ + P4_CCCR_CASCADE | \ + P4_CCCR_FORCE_OVF | \ + P4_CCCR_THREAD_ANY | \ + P4_CCCR_OVF_PMI_T0 | \ + P4_CCCR_OVF_PMI_T1 | \ + P4_CONFIG_ALIASABLE)) + +static inline bool p4_is_event_cascaded(u64 config) +{ + u32 cccr = p4_config_unpack_cccr(config); + return !!(cccr & P4_CCCR_CASCADE); +} + +static inline int p4_ht_config_thread(u64 config) +{ + return !!(config & P4_CONFIG_HT); +} + +static inline u64 p4_set_ht_bit(u64 config) +{ + return config | P4_CONFIG_HT; +} + +static inline u64 p4_clear_ht_bit(u64 config) +{ + return config & ~P4_CONFIG_HT; +} + +static inline int p4_ht_active(void) +{ +#ifdef CONFIG_SMP + return smp_num_siblings > 1; +#endif + return 0; +} + +static inline int p4_ht_thread(int cpu) +{ +#ifdef CONFIG_SMP + if (smp_num_siblings == 2) + return cpu != cpumask_first(__get_cpu_var(cpu_sibling_map)); +#endif + return 0; +} + +static inline int p4_should_swap_ts(u64 config, int cpu) +{ + return p4_ht_config_thread(config) ^ p4_ht_thread(cpu); +} + +static inline u32 p4_default_cccr_conf(int cpu) +{ + /* + * Note that P4_CCCR_THREAD_ANY is "required" on + * non-HT machines (on HT machines we count TS events + * regardless the state of second logical processor + */ + u32 cccr = P4_CCCR_THREAD_ANY; + + if (!p4_ht_thread(cpu)) + cccr |= P4_CCCR_OVF_PMI_T0; + else + cccr |= P4_CCCR_OVF_PMI_T1; + + return cccr; +} + +static inline u32 p4_default_escr_conf(int cpu, int exclude_os, int exclude_usr) +{ + u32 escr = 0; + + if (!p4_ht_thread(cpu)) { + if (!exclude_os) + escr |= P4_ESCR_T0_OS; + if (!exclude_usr) + escr |= P4_ESCR_T0_USR; + } else { + if (!exclude_os) + escr |= P4_ESCR_T1_OS; + if (!exclude_usr) + escr |= P4_ESCR_T1_USR; + } + + return escr; +} + +/* + * This are the events which should be used in "Event Select" + * field of ESCR register, they are like unique keys which allow + * the kernel to determinate which CCCR and COUNTER should be + * used to track an event + */ +enum P4_EVENTS { + P4_EVENT_TC_DELIVER_MODE, + P4_EVENT_BPU_FETCH_REQUEST, + P4_EVENT_ITLB_REFERENCE, + P4_EVENT_MEMORY_CANCEL, + P4_EVENT_MEMORY_COMPLETE, + P4_EVENT_LOAD_PORT_REPLAY, + P4_EVENT_STORE_PORT_REPLAY, + P4_EVENT_MOB_LOAD_REPLAY, + P4_EVENT_PAGE_WALK_TYPE, + P4_EVENT_BSQ_CACHE_REFERENCE, + P4_EVENT_IOQ_ALLOCATION, + P4_EVENT_IOQ_ACTIVE_ENTRIES, + P4_EVENT_FSB_DATA_ACTIVITY, + P4_EVENT_BSQ_ALLOCATION, + P4_EVENT_BSQ_ACTIVE_ENTRIES, + P4_EVENT_SSE_INPUT_ASSIST, + P4_EVENT_PACKED_SP_UOP, + P4_EVENT_PACKED_DP_UOP, + P4_EVENT_SCALAR_SP_UOP, + P4_EVENT_SCALAR_DP_UOP, + P4_EVENT_64BIT_MMX_UOP, + P4_EVENT_128BIT_MMX_UOP, + P4_EVENT_X87_FP_UOP, + P4_EVENT_TC_MISC, + P4_EVENT_GLOBAL_POWER_EVENTS, + P4_EVENT_TC_MS_XFER, + P4_EVENT_UOP_QUEUE_WRITES, + P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, + P4_EVENT_RETIRED_BRANCH_TYPE, + P4_EVENT_RESOURCE_STALL, + P4_EVENT_WC_BUFFER, + P4_EVENT_B2B_CYCLES, + P4_EVENT_BNR, + P4_EVENT_SNOOP, + P4_EVENT_RESPONSE, + P4_EVENT_FRONT_END_EVENT, + P4_EVENT_EXECUTION_EVENT, + P4_EVENT_REPLAY_EVENT, + P4_EVENT_INSTR_RETIRED, + P4_EVENT_UOPS_RETIRED, + P4_EVENT_UOP_TYPE, + P4_EVENT_BRANCH_RETIRED, + P4_EVENT_MISPRED_BRANCH_RETIRED, + P4_EVENT_X87_ASSIST, + P4_EVENT_MACHINE_CLEAR, + P4_EVENT_INSTR_COMPLETED, +}; + +#define P4_OPCODE(event) event##_OPCODE +#define P4_OPCODE_ESEL(opcode) ((opcode & 0x00ff) >> 0) +#define P4_OPCODE_EVNT(opcode) ((opcode & 0xff00) >> 8) +#define P4_OPCODE_PACK(event, sel) (((event) << 8) | sel) + +/* + * Comments below the event represent ESCR restriction + * for this event and counter index per ESCR + * + * MSR_P4_IQ_ESCR0 and MSR_P4_IQ_ESCR1 are available only on early + * processor builds (family 0FH, models 01H-02H). These MSRs + * are not available on later versions, so that we don't use + * them completely + * + * Also note that CCCR1 do not have P4_CCCR_ENABLE bit properly + * working so that we should not use this CCCR and respective + * counter as result + */ +enum P4_EVENT_OPCODES { + P4_OPCODE(P4_EVENT_TC_DELIVER_MODE) = P4_OPCODE_PACK(0x01, 0x01), + /* + * MSR_P4_TC_ESCR0: 4, 5 + * MSR_P4_TC_ESCR1: 6, 7 + */ + + P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST) = P4_OPCODE_PACK(0x03, 0x00), + /* + * MSR_P4_BPU_ESCR0: 0, 1 + * MSR_P4_BPU_ESCR1: 2, 3 + */ + + P4_OPCODE(P4_EVENT_ITLB_REFERENCE) = P4_OPCODE_PACK(0x18, 0x03), + /* + * MSR_P4_ITLB_ESCR0: 0, 1 + * MSR_P4_ITLB_ESCR1: 2, 3 + */ + + P4_OPCODE(P4_EVENT_MEMORY_CANCEL) = P4_OPCODE_PACK(0x02, 0x05), + /* + * MSR_P4_DAC_ESCR0: 8, 9 + * MSR_P4_DAC_ESCR1: 10, 11 + */ + + P4_OPCODE(P4_EVENT_MEMORY_COMPLETE) = P4_OPCODE_PACK(0x08, 0x02), + /* + * MSR_P4_SAAT_ESCR0: 8, 9 + * MSR_P4_SAAT_ESCR1: 10, 11 + */ + + P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY) = P4_OPCODE_PACK(0x04, 0x02), + /* + * MSR_P4_SAAT_ESCR0: 8, 9 + * MSR_P4_SAAT_ESCR1: 10, 11 + */ + + P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY) = P4_OPCODE_PACK(0x05, 0x02), + /* + * MSR_P4_SAAT_ESCR0: 8, 9 + * MSR_P4_SAAT_ESCR1: 10, 11 + */ + + P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY) = P4_OPCODE_PACK(0x03, 0x02), + /* + * MSR_P4_MOB_ESCR0: 0, 1 + * MSR_P4_MOB_ESCR1: 2, 3 + */ + + P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE) = P4_OPCODE_PACK(0x01, 0x04), + /* + * MSR_P4_PMH_ESCR0: 0, 1 + * MSR_P4_PMH_ESCR1: 2, 3 + */ + + P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE) = P4_OPCODE_PACK(0x0c, 0x07), + /* + * MSR_P4_BSU_ESCR0: 0, 1 + * MSR_P4_BSU_ESCR1: 2, 3 + */ + + P4_OPCODE(P4_EVENT_IOQ_ALLOCATION) = P4_OPCODE_PACK(0x03, 0x06), + /* + * MSR_P4_FSB_ESCR0: 0, 1 + * MSR_P4_FSB_ESCR1: 2, 3 + */ + + P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES) = P4_OPCODE_PACK(0x1a, 0x06), + /* + * MSR_P4_FSB_ESCR1: 2, 3 + */ + + P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY) = P4_OPCODE_PACK(0x17, 0x06), + /* + * MSR_P4_FSB_ESCR0: 0, 1 + * MSR_P4_FSB_ESCR1: 2, 3 + */ + + P4_OPCODE(P4_EVENT_BSQ_ALLOCATION) = P4_OPCODE_PACK(0x05, 0x07), + /* + * MSR_P4_BSU_ESCR0: 0, 1 + */ + + P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES) = P4_OPCODE_PACK(0x06, 0x07), + /* + * NOTE: no ESCR name in docs, it's guessed + * MSR_P4_BSU_ESCR1: 2, 3 + */ + + P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST) = P4_OPCODE_PACK(0x34, 0x01), + /* + * MSR_P4_FIRM_ESCR0: 8, 9 + * MSR_P4_FIRM_ESCR1: 10, 11 + */ + + P4_OPCODE(P4_EVENT_PACKED_SP_UOP) = P4_OPCODE_PACK(0x08, 0x01), + /* + * MSR_P4_FIRM_ESCR0: 8, 9 + * MSR_P4_FIRM_ESCR1: 10, 11 + */ + + P4_OPCODE(P4_EVENT_PACKED_DP_UOP) = P4_OPCODE_PACK(0x0c, 0x01), + /* + * MSR_P4_FIRM_ESCR0: 8, 9 + * MSR_P4_FIRM_ESCR1: 10, 11 + */ + + P4_OPCODE(P4_EVENT_SCALAR_SP_UOP) = P4_OPCODE_PACK(0x0a, 0x01), + /* + * MSR_P4_FIRM_ESCR0: 8, 9 + * MSR_P4_FIRM_ESCR1: 10, 11 + */ + + P4_OPCODE(P4_EVENT_SCALAR_DP_UOP) = P4_OPCODE_PACK(0x0e, 0x01), + /* + * MSR_P4_FIRM_ESCR0: 8, 9 + * MSR_P4_FIRM_ESCR1: 10, 11 + */ + + P4_OPCODE(P4_EVENT_64BIT_MMX_UOP) = P4_OPCODE_PACK(0x02, 0x01), + /* + * MSR_P4_FIRM_ESCR0: 8, 9 + * MSR_P4_FIRM_ESCR1: 10, 11 + */ + + P4_OPCODE(P4_EVENT_128BIT_MMX_UOP) = P4_OPCODE_PACK(0x1a, 0x01), + /* + * MSR_P4_FIRM_ESCR0: 8, 9 + * MSR_P4_FIRM_ESCR1: 10, 11 + */ + + P4_OPCODE(P4_EVENT_X87_FP_UOP) = P4_OPCODE_PACK(0x04, 0x01), + /* + * MSR_P4_FIRM_ESCR0: 8, 9 + * MSR_P4_FIRM_ESCR1: 10, 11 + */ + + P4_OPCODE(P4_EVENT_TC_MISC) = P4_OPCODE_PACK(0x06, 0x01), + /* + * MSR_P4_TC_ESCR0: 4, 5 + * MSR_P4_TC_ESCR1: 6, 7 + */ + + P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS) = P4_OPCODE_PACK(0x13, 0x06), + /* + * MSR_P4_FSB_ESCR0: 0, 1 + * MSR_P4_FSB_ESCR1: 2, 3 + */ + + P4_OPCODE(P4_EVENT_TC_MS_XFER) = P4_OPCODE_PACK(0x05, 0x00), + /* + * MSR_P4_MS_ESCR0: 4, 5 + * MSR_P4_MS_ESCR1: 6, 7 + */ + + P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES) = P4_OPCODE_PACK(0x09, 0x00), + /* + * MSR_P4_MS_ESCR0: 4, 5 + * MSR_P4_MS_ESCR1: 6, 7 + */ + + P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE) = P4_OPCODE_PACK(0x05, 0x02), + /* + * MSR_P4_TBPU_ESCR0: 4, 5 + * MSR_P4_TBPU_ESCR1: 6, 7 + */ + + P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE) = P4_OPCODE_PACK(0x04, 0x02), + /* + * MSR_P4_TBPU_ESCR0: 4, 5 + * MSR_P4_TBPU_ESCR1: 6, 7 + */ + + P4_OPCODE(P4_EVENT_RESOURCE_STALL) = P4_OPCODE_PACK(0x01, 0x01), + /* + * MSR_P4_ALF_ESCR0: 12, 13, 16 + * MSR_P4_ALF_ESCR1: 14, 15, 17 + */ + + P4_OPCODE(P4_EVENT_WC_BUFFER) = P4_OPCODE_PACK(0x05, 0x05), + /* + * MSR_P4_DAC_ESCR0: 8, 9 + * MSR_P4_DAC_ESCR1: 10, 11 + */ + + P4_OPCODE(P4_EVENT_B2B_CYCLES) = P4_OPCODE_PACK(0x16, 0x03), + /* + * MSR_P4_FSB_ESCR0: 0, 1 + * MSR_P4_FSB_ESCR1: 2, 3 + */ + + P4_OPCODE(P4_EVENT_BNR) = P4_OPCODE_PACK(0x08, 0x03), + /* + * MSR_P4_FSB_ESCR0: 0, 1 + * MSR_P4_FSB_ESCR1: 2, 3 + */ + + P4_OPCODE(P4_EVENT_SNOOP) = P4_OPCODE_PACK(0x06, 0x03), + /* + * MSR_P4_FSB_ESCR0: 0, 1 + * MSR_P4_FSB_ESCR1: 2, 3 + */ + + P4_OPCODE(P4_EVENT_RESPONSE) = P4_OPCODE_PACK(0x04, 0x03), + /* + * MSR_P4_FSB_ESCR0: 0, 1 + * MSR_P4_FSB_ESCR1: 2, 3 + */ + + P4_OPCODE(P4_EVENT_FRONT_END_EVENT) = P4_OPCODE_PACK(0x08, 0x05), + /* + * MSR_P4_CRU_ESCR2: 12, 13, 16 + * MSR_P4_CRU_ESCR3: 14, 15, 17 + */ + + P4_OPCODE(P4_EVENT_EXECUTION_EVENT) = P4_OPCODE_PACK(0x0c, 0x05), + /* + * MSR_P4_CRU_ESCR2: 12, 13, 16 + * MSR_P4_CRU_ESCR3: 14, 15, 17 + */ + + P4_OPCODE(P4_EVENT_REPLAY_EVENT) = P4_OPCODE_PACK(0x09, 0x05), + /* + * MSR_P4_CRU_ESCR2: 12, 13, 16 + * MSR_P4_CRU_ESCR3: 14, 15, 17 + */ + + P4_OPCODE(P4_EVENT_INSTR_RETIRED) = P4_OPCODE_PACK(0x02, 0x04), + /* + * MSR_P4_CRU_ESCR0: 12, 13, 16 + * MSR_P4_CRU_ESCR1: 14, 15, 17 + */ + + P4_OPCODE(P4_EVENT_UOPS_RETIRED) = P4_OPCODE_PACK(0x01, 0x04), + /* + * MSR_P4_CRU_ESCR0: 12, 13, 16 + * MSR_P4_CRU_ESCR1: 14, 15, 17 + */ + + P4_OPCODE(P4_EVENT_UOP_TYPE) = P4_OPCODE_PACK(0x02, 0x02), + /* + * MSR_P4_RAT_ESCR0: 12, 13, 16 + * MSR_P4_RAT_ESCR1: 14, 15, 17 + */ + + P4_OPCODE(P4_EVENT_BRANCH_RETIRED) = P4_OPCODE_PACK(0x06, 0x05), + /* + * MSR_P4_CRU_ESCR2: 12, 13, 16 + * MSR_P4_CRU_ESCR3: 14, 15, 17 + */ + + P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED) = P4_OPCODE_PACK(0x03, 0x04), + /* + * MSR_P4_CRU_ESCR0: 12, 13, 16 + * MSR_P4_CRU_ESCR1: 14, 15, 17 + */ + + P4_OPCODE(P4_EVENT_X87_ASSIST) = P4_OPCODE_PACK(0x03, 0x05), + /* + * MSR_P4_CRU_ESCR2: 12, 13, 16 + * MSR_P4_CRU_ESCR3: 14, 15, 17 + */ + + P4_OPCODE(P4_EVENT_MACHINE_CLEAR) = P4_OPCODE_PACK(0x02, 0x05), + /* + * MSR_P4_CRU_ESCR2: 12, 13, 16 + * MSR_P4_CRU_ESCR3: 14, 15, 17 + */ + + P4_OPCODE(P4_EVENT_INSTR_COMPLETED) = P4_OPCODE_PACK(0x07, 0x04), + /* + * MSR_P4_CRU_ESCR0: 12, 13, 16 + * MSR_P4_CRU_ESCR1: 14, 15, 17 + */ +}; + +/* + * a caller should use P4_ESCR_EMASK_NAME helper to + * pick the EventMask needed, for example + * + * P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) + */ +enum P4_ESCR_EMASKS { + P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DD, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DB, 1), + P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, DI, 2), + P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BD, 3), + P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BB, 4), + P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, BI, 5), + P4_GEN_ESCR_EMASK(P4_EVENT_TC_DELIVER_MODE, ID, 6), + + P4_GEN_ESCR_EMASK(P4_EVENT_BPU_FETCH_REQUEST, TCMISS, 0), + + P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, MISS, 1), + P4_GEN_ESCR_EMASK(P4_EVENT_ITLB_REFERENCE, HIT_UK, 2), + + P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL, 2), + P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_CANCEL, 64K_CONF, 3), + + P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, LSC, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_MEMORY_COMPLETE, SSC, 1), + + P4_GEN_ESCR_EMASK(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD, 1), + + P4_GEN_ESCR_EMASK(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST, 1), + + P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STA, 1), + P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, NO_STD, 3), + P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA, 4), + P4_GEN_ESCR_EMASK(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR, 5), + + P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, DTMISS, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_PAGE_WALK_TYPE, ITMISS, 1), + + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE, 1), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM, 2), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS, 3), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE, 4), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM, 5), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS, 8), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS, 9), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS, 10), + + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, DEFAULT, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_READ, 5), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE, 6), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_UC, 7), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WC, 8), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WT, 9), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WP, 10), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, MEM_WB, 11), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OWN, 13), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, OTHER, 14), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ALLOCATION, PREFETCH, 15), + + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ, 5), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE, 6), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC, 7), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC, 8), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT, 9), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP, 10), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB, 11), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN, 13), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER, 14), + P4_GEN_ESCR_EMASK(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH, 15), + + P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN, 1), + P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER, 2), + P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV, 3), + P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN, 4), + P4_GEN_ESCR_EMASK(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER, 5), + + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1, 1), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0, 2), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1, 3), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE, 5), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE, 6), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE, 7), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE, 8), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE, 9), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE, 10), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0, 11), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1, 12), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2, 13), + + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1, 1), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0, 2), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1, 3), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE, 5), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE, 6), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE, 7), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE, 8), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE, 9), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE, 10), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0, 11), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1, 12), + P4_GEN_ESCR_EMASK(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2, 13), + + P4_GEN_ESCR_EMASK(P4_EVENT_SSE_INPUT_ASSIST, ALL, 15), + + P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_SP_UOP, ALL, 15), + + P4_GEN_ESCR_EMASK(P4_EVENT_PACKED_DP_UOP, ALL, 15), + + P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_SP_UOP, ALL, 15), + + P4_GEN_ESCR_EMASK(P4_EVENT_SCALAR_DP_UOP, ALL, 15), + + P4_GEN_ESCR_EMASK(P4_EVENT_64BIT_MMX_UOP, ALL, 15), + + P4_GEN_ESCR_EMASK(P4_EVENT_128BIT_MMX_UOP, ALL, 15), + + P4_GEN_ESCR_EMASK(P4_EVENT_X87_FP_UOP, ALL, 15), + + P4_GEN_ESCR_EMASK(P4_EVENT_TC_MISC, FLUSH, 4), + + P4_GEN_ESCR_EMASK(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING, 0), + + P4_GEN_ESCR_EMASK(P4_EVENT_TC_MS_XFER, CISC, 0), + + P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER, 1), + P4_GEN_ESCR_EMASK(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM, 2), + + P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL, 1), + P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL, 2), + P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN, 3), + P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT, 4), + + P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL, 1), + P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, CALL, 2), + P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN, 3), + P4_GEN_ESCR_EMASK(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT, 4), + + P4_GEN_ESCR_EMASK(P4_EVENT_RESOURCE_STALL, SBFULL, 5), + + P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_EVICTS, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS, 1), + + P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, NBOGUS, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_FRONT_END_EVENT, BOGUS, 1), + + P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS0, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS1, 1), + P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS2, 2), + P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, NBOGUS3, 3), + P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS0, 4), + P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS1, 5), + P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS2, 6), + P4_GEN_ESCR_EMASK(P4_EVENT_EXECUTION_EVENT, BOGUS3, 7), + + P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, NBOGUS, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_REPLAY_EVENT, BOGUS, 1), + + P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, NBOGUSTAG, 1), + P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSNTAG, 2), + P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_RETIRED, BOGUSTAG, 3), + + P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, NBOGUS, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_UOPS_RETIRED, BOGUS, 1), + + P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGLOADS, 1), + P4_GEN_ESCR_EMASK(P4_EVENT_UOP_TYPE, TAGSTORES, 2), + + P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNP, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMNM, 1), + P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTP, 2), + P4_GEN_ESCR_EMASK(P4_EVENT_BRANCH_RETIRED, MMTM, 3), + + P4_GEN_ESCR_EMASK(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS, 0), + + P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSU, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, FPSO, 1), + P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAO, 2), + P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, POAU, 3), + P4_GEN_ESCR_EMASK(P4_EVENT_X87_ASSIST, PREA, 4), + + P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, CLEAR, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, MOCLEAR, 1), + P4_GEN_ESCR_EMASK(P4_EVENT_MACHINE_CLEAR, SMCLEAR, 2), + + P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, NBOGUS, 0), + P4_GEN_ESCR_EMASK(P4_EVENT_INSTR_COMPLETED, BOGUS, 1), +}; + +/* + * Note we have UOP and PEBS bits reserved for now + * just in case if we will need them once + */ +#define P4_PEBS_CONFIG_ENABLE (1 << 7) +#define P4_PEBS_CONFIG_UOP_TAG (1 << 8) +#define P4_PEBS_CONFIG_METRIC_MASK 0x3f +#define P4_PEBS_CONFIG_MASK 0xff + +/* + * mem: Only counters MSR_IQ_COUNTER4 (16) and + * MSR_IQ_COUNTER5 (17) are allowed for PEBS sampling + */ +#define P4_PEBS_ENABLE 0x02000000U +#define P4_PEBS_ENABLE_UOP_TAG 0x01000000U + +#define p4_config_unpack_metric(v) (((u64)(v)) & P4_PEBS_CONFIG_METRIC_MASK) +#define p4_config_unpack_pebs(v) (((u64)(v)) & P4_PEBS_CONFIG_MASK) + +#define p4_config_pebs_has(v, mask) (p4_config_unpack_pebs(v) & (mask)) + +enum P4_PEBS_METRIC { + P4_PEBS_METRIC__none, + + P4_PEBS_METRIC__1stl_cache_load_miss_retired, + P4_PEBS_METRIC__2ndl_cache_load_miss_retired, + P4_PEBS_METRIC__dtlb_load_miss_retired, + P4_PEBS_METRIC__dtlb_store_miss_retired, + P4_PEBS_METRIC__dtlb_all_miss_retired, + P4_PEBS_METRIC__tagged_mispred_branch, + P4_PEBS_METRIC__mob_load_replay_retired, + P4_PEBS_METRIC__split_load_retired, + P4_PEBS_METRIC__split_store_retired, + + P4_PEBS_METRIC__max +}; + +/* + * Notes on internal configuration of ESCR+CCCR tuples + * + * Since P4 has quite the different architecture of + * performance registers in compare with "architectural" + * once and we have on 64 bits to keep configuration + * of performance event, the following trick is used. + * + * 1) Since both ESCR and CCCR registers have only low + * 32 bits valuable, we pack them into a single 64 bit + * configuration. Low 32 bits of such config correspond + * to low 32 bits of CCCR register and high 32 bits + * correspond to low 32 bits of ESCR register. + * + * 2) The meaning of every bit of such config field can + * be found in Intel SDM but it should be noted that + * we "borrow" some reserved bits for own usage and + * clean them or set to a proper value when we do + * a real write to hardware registers. + * + * 3) The format of bits of config is the following + * and should be either 0 or set to some predefined + * values: + * + * Low 32 bits + * ----------- + * 0-6: P4_PEBS_METRIC enum + * 7-11: reserved + * 12: reserved (Enable) + * 13-15: reserved (ESCR select) + * 16-17: Active Thread + * 18: Compare + * 19: Complement + * 20-23: Threshold + * 24: Edge + * 25: reserved (FORCE_OVF) + * 26: reserved (OVF_PMI_T0) + * 27: reserved (OVF_PMI_T1) + * 28-29: reserved + * 30: reserved (Cascade) + * 31: reserved (OVF) + * + * High 32 bits + * ------------ + * 0: reserved (T1_USR) + * 1: reserved (T1_OS) + * 2: reserved (T0_USR) + * 3: reserved (T0_OS) + * 4: Tag Enable + * 5-8: Tag Value + * 9-24: Event Mask (may use P4_ESCR_EMASK_BIT helper) + * 25-30: enum P4_EVENTS + * 31: reserved (HT thread) + */ + +#endif /* PERF_EVENT_P4_H */ + diff -uprN linux-2.6.32/arch/x86/include/asm/perf_regs.h linux-2.6.32.ovz/arch/x86/include/asm/perf_regs.h --- linux-2.6.32/arch/x86/include/asm/perf_regs.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/perf_regs.h 2015-03-10 13:24:07.000000000 -0700 @@ -0,0 +1,33 @@ +#ifndef _ASM_X86_PERF_REGS_H +#define _ASM_X86_PERF_REGS_H + +enum perf_event_x86_regs { + PERF_REG_X86_AX, + PERF_REG_X86_BX, + PERF_REG_X86_CX, + PERF_REG_X86_DX, + PERF_REG_X86_SI, + PERF_REG_X86_DI, + PERF_REG_X86_BP, + PERF_REG_X86_SP, + PERF_REG_X86_IP, + PERF_REG_X86_FLAGS, + PERF_REG_X86_CS, + PERF_REG_X86_SS, + PERF_REG_X86_DS, + PERF_REG_X86_ES, + PERF_REG_X86_FS, + PERF_REG_X86_GS, + PERF_REG_X86_R8, + PERF_REG_X86_R9, + PERF_REG_X86_R10, + PERF_REG_X86_R11, + PERF_REG_X86_R12, + PERF_REG_X86_R13, + PERF_REG_X86_R14, + PERF_REG_X86_R15, + + PERF_REG_X86_32_MAX = PERF_REG_X86_GS + 1, + PERF_REG_X86_64_MAX = PERF_REG_X86_R15 + 1, +}; +#endif /* _ASM_X86_PERF_REGS_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/pgalloc.h linux-2.6.32.ovz/arch/x86/include/asm/pgalloc.h --- linux-2.6.32/arch/x86/include/asm/pgalloc.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/pgalloc.h 2015-03-10 13:21:16.000000000 -0700 @@ -23,6 +23,11 @@ static inline void paravirt_release_pud( #endif /* + * Flags to use when allocating a user page table page. + */ +extern gfp_t __userpte_alloc_gfp; + +/* * Allocate and free page tables. */ extern pgd_t *pgd_alloc(struct mm_struct *); diff -uprN linux-2.6.32/arch/x86/include/asm/pgtable-2level.h linux-2.6.32.ovz/arch/x86/include/asm/pgtable-2level.h --- linux-2.6.32/arch/x86/include/asm/pgtable-2level.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/pgtable-2level.h 2015-03-10 13:21:12.000000000 -0700 @@ -13,11 +13,13 @@ */ static inline void native_set_pte(pte_t *ptep , pte_t pte) { + mm_track_pte(ptep); *ptep = pte; } static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) { + mm_track_pmd(pmdp); *pmdp = pmd; } @@ -34,12 +36,14 @@ static inline void native_pmd_clear(pmd_ static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp) { + mm_track_pte(xp); *xp = native_make_pte(0); } #ifdef CONFIG_SMP static inline pte_t native_ptep_get_and_clear(pte_t *xp) { + mm_track_pte(xp); return __pte(xchg(&xp->pte_low, 0)); } #else diff -uprN linux-2.6.32/arch/x86/include/asm/pgtable_32.h linux-2.6.32.ovz/arch/x86/include/asm/pgtable_32.h --- linux-2.6.32/arch/x86/include/asm/pgtable_32.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/pgtable_32.h 2015-03-10 13:22:14.000000000 -0700 @@ -22,11 +22,13 @@ #include #include #include +#include struct mm_struct; struct vm_area_struct; extern pgd_t swapper_pg_dir[1024]; +extern pgd_t trampoline_pg_dir[1024]; static inline void pgtable_cache_init(void) { } static inline void check_pgt_cache(void) { } @@ -54,10 +56,10 @@ extern void set_pmd_pfn(unsigned long, u in_irq() ? KM_IRQ_PTE : \ KM_PTE0) #define pte_offset_map(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \ pte_index((address))) #define pte_offset_map_nested(dir, address) \ - ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ + ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \ pte_index((address))) #define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) #define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) diff -uprN linux-2.6.32/arch/x86/include/asm/pgtable-3level.h linux-2.6.32.ovz/arch/x86/include/asm/pgtable-3level.h --- linux-2.6.32/arch/x86/include/asm/pgtable-3level.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/pgtable-3level.h 2015-03-10 13:23:15.000000000 -0700 @@ -26,23 +26,78 @@ */ static inline void native_set_pte(pte_t *ptep, pte_t pte) { + mm_track_pte(ptep); ptep->pte_high = pte.pte_high; smp_wmb(); ptep->pte_low = pte.pte_low; } +#define __HAVE_ARCH_READ_PMD_ATOMIC +/* + * pte_offset_map_lock on 32bit PAE kernels was reading the pmd_t with + * a "*pmdp" dereference done by gcc. Problem is, in certain places + * where pte_offset_map_lock is called, concurrent page faults are + * allowed, if the mmap_sem is hold for reading. An example is mincore + * vs page faults vs MADV_DONTNEED. On the page fault side + * pmd_populate rightfully does a set_64bit, but if we're reading the + * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen + * because gcc will not read the 64bit of the pmd atomically. To fix + * this all places running pmd_offset_map_lock() while holding the + * mmap_sem in read mode, shall read the pmdp pointer using this + * function to know if the pmd is null nor not, and in turn to know if + * they can run pmd_offset_map_lock or pmd_trans_huge or other pmd + * operations. + * + * Without THP if the mmap_sem is hold for reading, the + * pmd can only transition from null to not null while read_pmd_atomic runs. + * So there's no need of literally reading it atomically. + * + * With THP if the mmap_sem is hold for reading, the pmd can become + * THP or null or point to a pte (and in turn become "stable") at any + * time under read_pmd_atomic, so it's mandatory to read it atomically + * with cmpxchg8b. + */ +#ifndef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t read_pmd_atomic(pmd_t *pmdp) +{ + pmdval_t ret; + u32 *tmp = (u32 *)pmdp; + + ret = (pmdval_t) (*tmp); + if (ret) { + /* + * If the low part is null, we must not read the high part + * or we can end up with a partial pmd. + */ + smp_rmb(); + ret |= ((pmdval_t)*(tmp + 1)) << 32; + } + + return (pmd_t) { ret }; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline pmd_t read_pmd_atomic(pmd_t *pmdp) +{ + pmdval_t val = (pmdval_t)atomic64_read((atomic64_t *)pmdp); + return (pmd_t) { val }; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) { + mm_track_pte(ptep); set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); } static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) { + mm_track_pmd(pmdp); set_64bit((unsigned long long *)(pmdp), native_pmd_val(pmd)); } static inline void native_set_pud(pud_t *pudp, pud_t pud) { + mm_track_pud(pudp); set_64bit((unsigned long long *)(pudp), native_pud_val(pud)); } @@ -54,6 +109,7 @@ static inline void native_set_pud(pud_t static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { + mm_track_pte(ptep); ptep->pte_low = 0; smp_wmb(); ptep->pte_high = 0; @@ -62,6 +118,9 @@ static inline void native_pte_clear(stru static inline void native_pmd_clear(pmd_t *pmd) { u32 *tmp = (u32 *)pmd; + + mm_track_pmd(pmd); + *tmp = 0; smp_wmb(); *(tmp + 1) = 0; @@ -69,8 +128,8 @@ static inline void native_pmd_clear(pmd_ static inline void pud_clear(pud_t *pudp) { - unsigned long pgd; + mm_track_pud(pudp); set_pud(pudp, __pud(0)); /* @@ -79,13 +138,10 @@ static inline void pud_clear(pud_t *pudp * section 8.1: in PAE mode we explicitly have to flush the * TLB via cr3 if the top-level pgd is changed... * - * Make sure the pud entry we're updating is within the - * current pgd to avoid unnecessary TLB flushes. - */ - pgd = read_cr3(); - if (__pa(pudp) >= pgd && __pa(pudp) < - (pgd + sizeof(pgd_t)*PTRS_PER_PGD)) - write_cr3(pgd); + * Currently all places where pud_clear() is called either have + * flush_tlb_mm() followed or don't need TLB flush (x86_64 code or + * pud_clear_bad()), so we don't need TLB flush here. + */ } #ifdef CONFIG_SMP @@ -93,6 +149,8 @@ static inline pte_t native_ptep_get_and_ { pte_t res; + mm_track_pte(ptep); + /* xchg acts as a barrier before the setting of the high bits */ res.pte_low = xchg(&ptep->pte_low, 0); res.pte_high = ptep->pte_high; diff -uprN linux-2.6.32/arch/x86/include/asm/pgtable_64.h linux-2.6.32.ovz/arch/x86/include/asm/pgtable_64.h --- linux-2.6.32/arch/x86/include/asm/pgtable_64.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/pgtable_64.h 2015-03-10 13:22:27.000000000 -0700 @@ -13,6 +13,7 @@ #include #include #include +#include extern pud_t level3_kernel_pgt[512]; extern pud_t level3_ident_pgt[512]; @@ -46,11 +47,13 @@ void set_pte_vaddr_pud(pud_t *pud_page, static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { + mm_track_pte(ptep); *ptep = native_make_pte(0); } static inline void native_set_pte(pte_t *ptep, pte_t pte) { + mm_track_pte(ptep); *ptep = pte; } @@ -62,11 +65,14 @@ static inline void native_set_pte_atomic static inline pte_t native_ptep_get_and_clear(pte_t *xp) { #ifdef CONFIG_SMP + mm_track_pte(xp); return native_make_pte(xchg(&xp->pte, 0)); #else /* native_local_ptep_get_and_clear, but duplicated because of cyclic dependency */ - pte_t ret = *xp; + pte_t ret; + mm_track_pte(xp); + ret = *xp; native_pte_clear(NULL, 0, xp); return ret; #endif @@ -74,6 +80,7 @@ static inline pte_t native_ptep_get_and_ static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) { + mm_track_pmd(pmdp); *pmdp = pmd; } @@ -82,8 +89,22 @@ static inline void native_pmd_clear(pmd_ native_set_pmd(pmd, native_make_pmd(0)); } +static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) +{ +#ifdef CONFIG_SMP + return native_make_pmd(xchg(&xp->pmd, 0)); +#else + /* native_local_pmdp_get_and_clear, + but duplicated because of cyclic dependency */ + pmd_t ret = *xp; + native_pmd_clear(xp); + return ret; +#endif +} + static inline void native_set_pud(pud_t *pudp, pud_t pud) { + mm_track_pud(pudp); *pudp = pud; } @@ -94,6 +115,7 @@ static inline void native_pud_clear(pud_ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { + mm_track_pgd(pgdp); *pgdp = pgd; } @@ -168,6 +190,121 @@ extern void cleanup_highmap(void); #define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) #define __HAVE_ARCH_PTE_SAME + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_splitting(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_SPLITTING; +} + +static inline int pmd_trans_huge(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_PSE; +} + +static inline int has_transparent_hugepage(void) +{ + return cpu_has_pse; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) + +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +extern int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); + +#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +extern int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + + +#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH +extern void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMD_WRITE +static inline int pmd_write(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_RW; +} + +#define __HAVE_ARCH_PMDP_GET_AND_CLEAR +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + pmd_t pmd = native_pmdp_get_and_clear(pmdp); + pmd_update(mm, addr, pmdp); + return pmd; +} + +#define __HAVE_ARCH_PMDP_SET_WRPROTECT +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + clear_bit(_PAGE_BIT_RW, (unsigned long *)&pmdp->pmd); + pmd_update(mm, addr, pmdp); +} + +static inline int pmd_young(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_ACCESSED; +} + +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +{ + pmdval_t v = native_pmd_val(pmd); + + return native_make_pmd(v | set); +} + +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +{ + pmdval_t v = native_pmd_val(pmd); + + return native_make_pmd(v & ~clear); +} + +static inline pmd_t pmd_mkold(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mkdirty(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_DIRTY); +} + +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_PSE); +} + +static inline pmd_t pmd_mkyoung(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_mkwrite(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_PRESENT); +} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/pgtable.h linux-2.6.32.ovz/arch/x86/include/asm/pgtable.h --- linux-2.6.32/arch/x86/include/asm/pgtable.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/pgtable.h 2015-03-10 13:23:28.000000000 -0700 @@ -16,6 +16,8 @@ #ifndef __ASSEMBLY__ +#include + /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. @@ -26,11 +28,14 @@ extern unsigned long empty_zero_page[PAG extern spinlock_t pgd_lock; extern struct list_head pgd_list; +extern struct mm_struct *pgd_page_get_mm(struct page *page); + #ifdef CONFIG_PARAVIRT #include #else /* !CONFIG_PARAVIRT */ #define set_pte(ptep, pte) native_set_pte(ptep, pte) #define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) +#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd) #define set_pte_atomic(ptep, pte) \ native_set_pte_atomic(ptep, pte) @@ -55,6 +60,8 @@ extern struct list_head pgd_list; #define pte_update(mm, addr, ptep) do { } while (0) #define pte_update_defer(mm, addr, ptep) do { } while (0) +#define pmd_update(mm, addr, ptep) do { } while (0) +#define pmd_update_defer(mm, addr, ptep) do { } while (0) #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) @@ -82,7 +89,7 @@ extern struct list_head pgd_list; */ static inline int pte_dirty(pte_t pte) { - return pte_flags(pte) & _PAGE_DIRTY; + return pte_flags(pte) & (_PAGE_DIRTY | _PAGE_SOFTDIRTY); } static inline int pte_young(pte_t pte) @@ -134,8 +141,7 @@ static inline unsigned long pmd_pfn(pmd_ static inline int pmd_large(pmd_t pte) { - return (pmd_flags(pte) & (_PAGE_PSE | _PAGE_PRESENT)) == - (_PAGE_PSE | _PAGE_PRESENT); + return pmd_flags(pte) & _PAGE_PSE; } static inline pte_t pte_set_flags(pte_t pte, pteval_t set) @@ -154,7 +160,7 @@ static inline pte_t pte_clear_flags(pte_ static inline pte_t pte_mkclean(pte_t pte) { - return pte_clear_flags(pte, _PAGE_DIRTY); + return pte_clear_flags(pte, (_PAGE_DIRTY | _PAGE_SOFTDIRTY)); } static inline pte_t pte_mkold(pte_t pte) @@ -252,6 +258,16 @@ static inline pte_t pte_modify(pte_t pte return __pte(val); } +static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) +{ + pmdval_t val = pmd_val(pmd); + + val &= _HPAGE_CHG_MASK; + val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK; + + return __pmd(val); +} + /* mprotect needs to preserve PAT bits when updating vm_page_prot */ #define pgprot_modify pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) @@ -270,9 +286,9 @@ static inline int is_new_memtype_allowed unsigned long new_flags) { /* - * PAT type is always WB for ISA. So no need to check. + * PAT type is always WB for untracked ranges, so no need to check. */ - if (is_ISA_range(paddr, paddr + size - 1)) + if (x86_platform.is_untracked_pat_range(paddr, paddr + size)) return 1; /* @@ -295,15 +311,15 @@ pmd_t *populate_extra_pmd(unsigned long pte_t *populate_extra_pte(unsigned long vaddr); #endif /* __ASSEMBLY__ */ +#ifndef __ASSEMBLY__ +#include + #ifdef CONFIG_X86_32 # include "pgtable_32.h" #else # include "pgtable_64.h" #endif -#ifndef __ASSEMBLY__ -#include - static inline int pte_none(pte_t pte) { return !pte.pte; @@ -327,7 +343,13 @@ static inline int pte_hidden(pte_t pte) static inline int pmd_present(pmd_t pmd) { - return pmd_flags(pmd) & _PAGE_PRESENT; + /* + * Checking for _PAGE_PSE is needed too because + * split_huge_page will temporarily clear the present bit (but + * the _PAGE_PSE flag will remain set at all times while the + * _PAGE_PRESENT bit is clear). + */ + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); } static inline int pmd_none(pmd_t pmd) @@ -346,7 +368,7 @@ static inline unsigned long pmd_page_vad * Currently stuck as a macro due to indirect forward reference to * linux/mmzone.h's __section_mem_map_addr() definition: */ -#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) +#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT) /* * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] @@ -526,6 +548,12 @@ static inline void native_set_pte_at(str native_set_pte(ptep, pte); } +static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp , pmd_t pmd) +{ + native_set_pmd(pmdp, pmd); +} + #ifndef CONFIG_PARAVIRT /* * Rules for using pte_update - it must be called after any PTE update which diff -uprN linux-2.6.32/arch/x86/include/asm/pgtable_types.h linux-2.6.32.ovz/arch/x86/include/asm/pgtable_types.h --- linux-2.6.32/arch/x86/include/asm/pgtable_types.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/pgtable_types.h 2015-03-10 13:22:22.000000000 -0700 @@ -22,6 +22,8 @@ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 +#define _PAGE_BIT_SOFTDIRTY _PAGE_BIT_HIDDEN +#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ /* If _PAGE_BIT_PRESENT is clear, we use these: */ @@ -45,6 +47,8 @@ #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) +#define _PAGE_SOFTDIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFTDIRTY) +#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING) #define __HAVE_ARCH_PTE_SPECIAL #ifdef CONFIG_KMEMCHECK @@ -69,7 +73,9 @@ /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ - _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) + _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ + _PAGE_SOFTDIRTY) +#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) #define _PAGE_CACHE_WB (0) diff -uprN linux-2.6.32/arch/x86/include/asm/probe_roms.h linux-2.6.32.ovz/arch/x86/include/asm/probe_roms.h --- linux-2.6.32/arch/x86/include/asm/probe_roms.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/probe_roms.h 2015-03-10 13:22:14.000000000 -0700 @@ -0,0 +1,8 @@ +#ifndef _PROBE_ROMS_H_ +#define _PROBE_ROMS_H_ +struct pci_dev; + +extern void __iomem *pci_map_biosrom(struct pci_dev *pdev); +extern void pci_unmap_biosrom(void __iomem *rom); +extern size_t pci_biosrom_size(struct pci_dev *pdev); +#endif diff -uprN linux-2.6.32/arch/x86/include/asm/processor-flags.h linux-2.6.32.ovz/arch/x86/include/asm/processor-flags.h --- linux-2.6.32/arch/x86/include/asm/processor-flags.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/processor-flags.h 2015-03-10 13:23:40.000000000 -0700 @@ -43,6 +43,7 @@ */ #define X86_CR3_PWT 0x00000008 /* Page Write Through */ #define X86_CR3_PCD 0x00000010 /* Page Cache Disable */ +#define X86_CR3_PCID_MASK 0x00000fff /* PCID Mask */ /* * Intel CPU features in CR4 @@ -59,7 +60,10 @@ #define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */ #define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ #define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ +#define X86_CR4_RDWRGSFS 0x00010000 /* enable RDWRGSFS support */ +#define X86_CR4_PCIDE 0x00020000 /* enable PCID support */ #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ +#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */ /* * x86-64 Task Priority Register, CR8 diff -uprN linux-2.6.32/arch/x86/include/asm/processor.h linux-2.6.32.ovz/arch/x86/include/asm/processor.h --- linux-2.6.32/arch/x86/include/asm/processor.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/processor.h 2015-06-23 04:16:16.000000000 -0700 @@ -21,7 +21,6 @@ struct mm_struct; #include #include #include -#include #include #include @@ -29,6 +28,7 @@ struct mm_struct; #include #include #include +#include /* * Default implementation of macro that returns current @@ -112,7 +112,22 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; #endif + /* RHEL6: deprecated and should use x86_hyper instead */ unsigned int x86_hyper_vendor; +#ifndef __GENKSYMS__ + /* RHEL6: + There are only 4 bytes of space before the end of this struct. */ +#ifdef CONFIG_SMP + /* Compute unit id */ + u8 compute_unit_id; +#endif /* CONFIG_SMP */ + u32 microcode; +#endif /* !__GENKSYMS__ */ +} __attribute__((__aligned__(SMP_CACHE_BYTES))); + +/* This struct never should be on a kabi whitelist. */ +struct cpuinfo_x86_rh { + __u32 x86_capability[RH_EXT_NCAPINTS]; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define X86_VENDOR_INTEL 0 @@ -126,23 +141,32 @@ struct cpuinfo_x86 { #define X86_VENDOR_UNKNOWN 0xff -#define X86_HYPER_VENDOR_NONE 0 -#define X86_HYPER_VENDOR_VMWARE 1 - /* * capabilities of CPUs */ extern struct cpuinfo_x86 boot_cpu_data; +extern struct cpuinfo_x86_rh boot_cpu_data_rh; extern struct cpuinfo_x86 new_cpu_data; +extern struct cpuinfo_x86_rh new_cpu_data_rh; extern struct tss_struct doublefault_tss; -extern __u32 cpu_caps_cleared[NCAPINTS]; -extern __u32 cpu_caps_set[NCAPINTS]; +extern __u32 cpu_caps_cleared[RHNCAPINTS]; +extern __u32 cpu_caps_set[RHNCAPINTS]; #ifdef CONFIG_SMP DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); +DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86_rh, cpu_info_rh); #define cpu_data(cpu) per_cpu(cpu_info, cpu) #define current_cpu_data __get_cpu_var(cpu_info) +#define cpu_data_rh(cpu) per_cpu(cpu_info_rh, cpu) +#define current_cpu_data_rh __get_cpu_var(cpu_info_rh) +static inline struct cpuinfo_x86_rh *get_cpuinfo_x86_rh(struct cpuinfo_x86 *c) +{ + if (c == &boot_cpu_data) + return &boot_cpu_data_rh; + else + return &cpu_data_rh(c->cpu_index); +} #else #define cpu_data(cpu) boot_cpu_data #define current_cpu_data boot_cpu_data @@ -161,6 +185,9 @@ static inline int hlt_works(int cpu) #define cache_line_size() (boot_cpu_data.x86_cache_alignment) +#define __HAVE_ARCH_ALIGN_STACK +extern unsigned long arch_align_stack(unsigned long sp); + extern void cpu_detect(struct cpuinfo_x86 *c); extern struct pt_regs *idle_regs(struct pt_regs *); @@ -180,12 +207,13 @@ static inline void native_cpuid(unsigned unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. */ - asm("cpuid" + asm volatile("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) - : "0" (*eax), "2" (*ecx)); + : "0" (*eax), "2" (*ecx) + : "memory"); } static inline void load_cr3(pgd_t *pgdir) @@ -379,6 +407,10 @@ union thread_xstate { struct xsave_struct xsave; }; +struct fpu { + union thread_xstate *state; +}; + #ifdef CONFIG_X86_64 DECLARE_PER_CPU(struct orig_ist, orig_ist); @@ -471,10 +503,6 @@ struct thread_struct { unsigned long iopl; /* Max allowed port in the bitmap, in bytes: */ unsigned io_bitmap_max; -/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */ - unsigned long debugctlmsr; - /* Debug Store context; see asm/ds.h */ - struct ds_context *ds_ctx; }; static inline unsigned long native_get_debugreg(int regno) @@ -765,29 +793,6 @@ extern unsigned long boot_option_idle_o extern unsigned long idle_halt; extern unsigned long idle_nomwait; -/* - * on systems with caches, caches must be flashed as the absolute - * last instruction before going into a suspended halt. Otherwise, - * dirty data can linger in the cache and become stale on resume, - * leading to strange errors. - * - * perform a variety of operations to guarantee that the compiler - * will not reorder instructions. wbinvd itself is serializing - * so the processor will not reorder. - * - * Systems without cache can just go into halt. - */ -static inline void wbinvd_halt(void) -{ - mb(); - /* check for clflush to determine if wbinvd is legal */ - if (cpu_has_clflush) - asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory"); - else - while (1) - halt(); -} - extern void enable_sep_cpu(void); extern int sysenter_setup(void); @@ -801,7 +806,7 @@ extern void cpu_init(void); static inline unsigned long get_debugctlmsr(void) { - unsigned long debugctlmsr = 0; + unsigned long debugctlmsr = 0; #ifndef CONFIG_X86_DEBUGCTLMSR if (boot_cpu_data.x86 < 6) @@ -809,21 +814,6 @@ static inline unsigned long get_debugctl #endif rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); - return debugctlmsr; -} - -static inline unsigned long get_debugctlmsr_on_cpu(int cpu) -{ - u64 debugctlmsr = 0; - u32 val1, val2; - -#ifndef CONFIG_X86_DEBUGCTLMSR - if (boot_cpu_data.x86 < 6) - return 0; -#endif - rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2); - debugctlmsr = val1 | ((u64)val2 << 32); - return debugctlmsr; } @@ -836,18 +826,6 @@ static inline void update_debugctlmsr(un wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); } -static inline void update_debugctlmsr_on_cpu(int cpu, - unsigned long debugctlmsr) -{ -#ifndef CONFIG_X86_DEBUGCTLMSR - if (boot_cpu_data.x86 < 6) - return; -#endif - wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, - (u32)((u64)debugctlmsr), - (u32)((u64)debugctlmsr >> 32)); -} - /* * from system description table in BIOS. Mostly for MCA use, but * others may find it useful: @@ -974,8 +952,7 @@ extern unsigned long thread_saved_pc(str /* This decides where the kernel will search for a free chunk of vm * space during mmap's. */ -#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \ - 0xc0000000 : 0xFFFFe000) +#define IA32_PAGE_OFFSET 0xc0000000 #define TASK_SIZE (test_thread_flag(TIF_IA32) ? \ IA32_PAGE_OFFSET : TASK_SIZE_MAX) @@ -1052,4 +1029,24 @@ unsigned long calc_aperfmperf_ratio(stru return ratio; } +static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) +{ + uint32_t base, eax, signature[3]; + + for (base = 0x40000000; base < 0x40010000; base += 0x100) { + cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); + + if (!memcmp(sig, signature, 12) && + (leaves == 0 || ((eax - base) >= leaves))) + return base; + } + + return 0; +} + +extern void (*set_cpuid_faulting_cb)(bool enable); +extern void set_cpuid_faulting(bool enable); + +extern void get_cpu_cap_masked(u32 *val); + #endif /* _ASM_X86_PROCESSOR_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/proto.h linux-2.6.32.ovz/arch/x86/include/asm/proto.h --- linux-2.6.32/arch/x86/include/asm/proto.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/proto.h 2015-03-10 13:22:25.000000000 -0700 @@ -22,14 +22,4 @@ extern int reboot_force; long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); -/* - * This looks more complex than it should be. But we need to - * get the type for the ~ right in round_down (it needs to be - * as wide as the result!), and we want to evaluate the macro - * arguments just once each. - */ -#define __round_mask(x,y) ((__typeof__(x))((y)-1)) -#define round_up(x,y) ((((x)-1) | __round_mask(x,y))+1) -#define round_down(x,y) ((x) & ~__round_mask(x,y)) - #endif /* _ASM_X86_PROTO_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/ptrace-abi.h linux-2.6.32.ovz/arch/x86/include/asm/ptrace-abi.h --- linux-2.6.32/arch/x86/include/asm/ptrace-abi.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/ptrace-abi.h 2015-03-10 13:21:31.000000000 -0700 @@ -82,61 +82,6 @@ #ifndef __ASSEMBLY__ #include - -/* configuration/status structure used in PTRACE_BTS_CONFIG and - PTRACE_BTS_STATUS commands. -*/ -struct ptrace_bts_config { - /* requested or actual size of BTS buffer in bytes */ - __u32 size; - /* bitmask of below flags */ - __u32 flags; - /* buffer overflow signal */ - __u32 signal; - /* actual size of bts_struct in bytes */ - __u32 bts_size; -}; -#endif /* __ASSEMBLY__ */ - -#define PTRACE_BTS_O_TRACE 0x1 /* branch trace */ -#define PTRACE_BTS_O_SCHED 0x2 /* scheduling events w/ jiffies */ -#define PTRACE_BTS_O_SIGNAL 0x4 /* send SIG on buffer overflow - instead of wrapping around */ -#define PTRACE_BTS_O_ALLOC 0x8 /* (re)allocate buffer */ - -#define PTRACE_BTS_CONFIG 40 -/* Configure branch trace recording. - ADDR points to a struct ptrace_bts_config. - DATA gives the size of that buffer. - A new buffer is allocated, if requested in the flags. - An overflow signal may only be requested for new buffers. - Returns the number of bytes read. -*/ -#define PTRACE_BTS_STATUS 41 -/* Return the current configuration in a struct ptrace_bts_config - pointed to by ADDR; DATA gives the size of that buffer. - Returns the number of bytes written. -*/ -#define PTRACE_BTS_SIZE 42 -/* Return the number of available BTS records for draining. - DATA and ADDR are ignored. -*/ -#define PTRACE_BTS_GET 43 -/* Get a single BTS record. - DATA defines the index into the BTS array, where 0 is the newest - entry, and higher indices refer to older entries. - ADDR is pointing to struct bts_struct (see asm/ds.h). -*/ -#define PTRACE_BTS_CLEAR 44 -/* Clear the BTS buffer. - DATA and ADDR are ignored. -*/ -#define PTRACE_BTS_DRAIN 45 -/* Read all available BTS records and clear the buffer. - ADDR points to an array of struct bts_struct. - DATA gives the size of that buffer. - BTS records are read from oldest to newest. - Returns number of BTS records drained. -*/ +#endif #endif /* _ASM_X86_PTRACE_ABI_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/ptrace.h linux-2.6.32.ovz/arch/x86/include/asm/ptrace.h --- linux-2.6.32/arch/x86/include/asm/ptrace.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/ptrace.h 2015-06-23 04:16:16.000000000 -0700 @@ -7,6 +7,7 @@ #ifdef __KERNEL__ #include +#include #endif #ifndef __ASSEMBLY__ @@ -216,6 +217,67 @@ static inline unsigned long user_stack_p return regs->sp; } +/* Query offset/name of register from its name/offset */ +extern int regs_query_register_offset(const char *name); +extern const char *regs_query_register_name(unsigned int offset); +#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss)) + +/** + * regs_get_register() - get register value from its offset + * @regs: pt_regs from which register value is gotten. + * @offset: offset number of the register. + * + * regs_get_register returns the value of a register whose offset from @regs + * is @offset. The @offset is the offset of the register in struct pt_regs. + * If @offset is bigger than MAX_REG_OFFSET, this returns 0. + */ +static inline unsigned long regs_get_register(struct pt_regs *regs, + unsigned int offset) +{ + if (unlikely(offset > MAX_REG_OFFSET)) + return 0; + return *(unsigned long *)((unsigned long)regs + offset); +} + +/** + * regs_within_kernel_stack() - check the address in the stack + * @regs: pt_regs which contains kernel stack pointer. + * @addr: address which is checked. + * + * regs_within_kenel_stack() checks @addr is within the kernel stack page(s). + * If @addr is within the kernel stack, it returns true. If not, returns false. + */ +static inline int regs_within_kernel_stack(struct pt_regs *regs, + unsigned long addr) +{ + return ((addr & ~(THREAD_SIZE - 1)) == + (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))); +} + +/** + * regs_get_kernel_stack_nth() - get Nth entry of the stack + * @regs: pt_regs which contains kernel stack pointer. + * @n: stack entry number. + * + * regs_get_kernel_stack_nth() returns @n th entry of the kernel stack which + * is specifined by @regs. If the @n th entry is NOT in the kernel stack, + * this returns 0. + */ +static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, + unsigned int n) +{ + unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs); + addr += n; + if (regs_within_kernel_stack(regs, (unsigned long)addr)) + return *addr; + else + return 0; +} + +/* Get Nth argument at function call */ +extern unsigned long regs_get_argument_nth(struct pt_regs *regs, + unsigned int n); + /* * These are defined as per linux/ptrace.h, which see. */ @@ -230,18 +292,30 @@ extern void user_enable_block_step(struc #define arch_has_block_step() (boot_cpu_data.x86 >= 6) #endif +#define ARCH_HAS_USER_SINGLE_STEP_INFO + +/* + * When hitting ptrace_stop(), we cannot return using SYSRET because + * that does not restore the full CPU state, only a minimal set. The + * ptracer can change arbitrary register values, which is usually okay + * because the usual ptrace stops run off the signal delivery path which + * forces IRET; however, ptrace_event() stops happen in arbitrary places + * in the kernel and don't force IRET path. + * + * So force IRET path after a ptrace stop. + */ +#define arch_ptrace_stop_needed(code, info) \ +({ \ + set_thread_flag(TIF_NOTIFY_RESUME); \ + false; \ +}) + struct user_desc; extern int do_get_thread_area(struct task_struct *p, int idx, struct user_desc __user *info); extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); -#ifdef CONFIG_X86_PTRACE_BTS -extern void ptrace_bts_untrace(struct task_struct *tsk); - -#define arch_ptrace_untrace(tsk) ptrace_bts_untrace(tsk) -#endif /* CONFIG_X86_PTRACE_BTS */ - #endif /* __KERNEL__ */ #endif /* !__ASSEMBLY__ */ diff -uprN linux-2.6.32/arch/x86/include/asm/pvclock-abi.h linux-2.6.32.ovz/arch/x86/include/asm/pvclock-abi.h --- linux-2.6.32/arch/x86/include/asm/pvclock-abi.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/pvclock-abi.h 2015-03-10 13:23:40.000000000 -0700 @@ -29,7 +29,8 @@ struct pvclock_vcpu_time_info { u64 system_time; u32 tsc_to_system_mul; s8 tsc_shift; - u8 pad[3]; + u8 flags; + u8 pad[2]; } __attribute__((__packed__)); /* 32 bytes */ struct pvclock_wall_clock { @@ -38,5 +39,7 @@ struct pvclock_wall_clock { u32 nsec; } __attribute__((__packed__)); +#define PVCLOCK_TSC_STABLE_BIT (1 << 0) +#define PVCLOCK_GUEST_STOPPED (1 << 1) #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PVCLOCK_ABI_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/pvclock.h linux-2.6.32.ovz/arch/x86/include/asm/pvclock.h --- linux-2.6.32/arch/x86/include/asm/pvclock.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/pvclock.h 2015-03-10 13:22:24.000000000 -0700 @@ -6,9 +6,54 @@ /* some helper functions for xen and kvm pv clock sources */ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); +void pvclock_set_flags(u8 flags); unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src); void pvclock_read_wallclock(struct pvclock_wall_clock *wall, struct pvclock_vcpu_time_info *vcpu, struct timespec *ts); +void pvclock_resume(void); + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#else + ulong tmp; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif defined(__x86_64__) + __asm__ ( + "mul %[mul_frac] ; shrd $32, %[hi], %[lo]" + : [lo]"=a"(product), + [hi]"=d"(tmp) + : "0"(delta), + [mul_frac]"rm"((u64)mul_frac)); +#else +#error implement me! +#endif + + return product; +} #endif /* _ASM_X86_PVCLOCK_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/required-features.h linux-2.6.32.ovz/arch/x86/include/asm/required-features.h --- linux-2.6.32/arch/x86/include/asm/required-features.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/required-features.h 2015-03-10 13:22:35.000000000 -0700 @@ -84,5 +84,7 @@ #define REQUIRED_MASK5 0 #define REQUIRED_MASK6 0 #define REQUIRED_MASK7 0 +#define REQUIRED_MASK8 0 +#define REQUIRED_MASK9 0 #endif /* _ASM_X86_REQUIRED_FEATURES_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/rwsem.h linux-2.6.32.ovz/arch/x86/include/asm/rwsem.h --- linux-2.6.32/arch/x86/include/asm/rwsem.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/rwsem.h 2015-03-10 13:21:14.000000000 -0700 @@ -41,6 +41,7 @@ #include #include #include +#include struct rwsem_waiter; @@ -55,17 +56,28 @@ extern asmregparm struct rw_semaphore * /* * the semaphore definition + * + * The bias values and the counter type limits the number of + * potential readers/writers to 32767 for 32 bits and 2147483647 + * for 64 bits. */ -#define RWSEM_UNLOCKED_VALUE 0x00000000 -#define RWSEM_ACTIVE_BIAS 0x00000001 -#define RWSEM_ACTIVE_MASK 0x0000ffff -#define RWSEM_WAITING_BIAS (-0x00010000) +#ifdef CONFIG_X86_64 +# define RWSEM_ACTIVE_MASK 0xffffffffL +#else +# define RWSEM_ACTIVE_MASK 0x0000ffffL +#endif + +#define RWSEM_UNLOCKED_VALUE 0x00000000L +#define RWSEM_ACTIVE_BIAS 0x00000001L +#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) +typedef signed long rwsem_count_t; + struct rw_semaphore { - signed long count; + rwsem_count_t count; spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -105,7 +117,7 @@ do { \ static inline void __down_read(struct rw_semaphore *sem) { asm volatile("# beginning down_read\n\t" - LOCK_PREFIX " incl (%%eax)\n\t" + LOCK_PREFIX _ASM_INC "(%1)\n\t" /* adds 0x00000001, returns the old value */ " jns 1f\n" " call call_rwsem_down_read_failed\n" @@ -121,14 +133,14 @@ static inline void __down_read(struct rw */ static inline int __down_read_trylock(struct rw_semaphore *sem) { - __s32 result, tmp; + rwsem_count_t result, tmp; asm volatile("# beginning __down_read_trylock\n\t" - " movl %0,%1\n\t" + " mov %0,%1\n\t" "1:\n\t" - " movl %1,%2\n\t" - " addl %3,%2\n\t" + " mov %1,%2\n\t" + " add %3,%2\n\t" " jle 2f\n\t" - LOCK_PREFIX " cmpxchgl %2,%0\n\t" + LOCK_PREFIX " cmpxchg %2,%0\n\t" " jnz 1b\n\t" "2:\n\t" "# ending __down_read_trylock\n\t" @@ -143,13 +155,13 @@ static inline int __down_read_trylock(st */ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) { - int tmp; + rwsem_count_t tmp; tmp = RWSEM_ACTIVE_WRITE_BIAS; asm volatile("# beginning down_write\n\t" - LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" + LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtract 0x0000ffff, returns the old value */ - " testl %%edx,%%edx\n\t" + " test %1,%1\n\t" /* was the count 0 before? */ " jz 1f\n" " call call_rwsem_down_write_failed\n" @@ -170,9 +182,9 @@ static inline void __down_write(struct r */ static inline int __down_write_trylock(struct rw_semaphore *sem) { - signed long ret = cmpxchg(&sem->count, - RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); + rwsem_count_t ret = cmpxchg(&sem->count, + RWSEM_UNLOCKED_VALUE, + RWSEM_ACTIVE_WRITE_BIAS); if (ret == RWSEM_UNLOCKED_VALUE) return 1; return 0; @@ -183,9 +195,9 @@ static inline int __down_write_trylock(s */ static inline void __up_read(struct rw_semaphore *sem) { - __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; + rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS; asm volatile("# beginning __up_read\n\t" - LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" + LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 1, returns the old value */ " jns 1f\n\t" " call call_rwsem_wake\n" @@ -201,18 +213,18 @@ static inline void __up_read(struct rw_s */ static inline void __up_write(struct rw_semaphore *sem) { + rwsem_count_t tmp; asm volatile("# beginning __up_write\n\t" - " movl %2,%%edx\n\t" - LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t" + LOCK_PREFIX " xadd %1,(%2)\n\t" /* tries to transition 0xffff0001 -> 0x00000000 */ " jz 1f\n" " call call_rwsem_wake\n" "1:\n\t" "# ending __up_write\n" - : "+m" (sem->count) - : "a" (sem), "i" (-RWSEM_ACTIVE_WRITE_BIAS) - : "memory", "cc", "edx"); + : "+m" (sem->count), "=d" (tmp) + : "a" (sem), "1" (-RWSEM_ACTIVE_WRITE_BIAS) + : "memory", "cc"); } /* @@ -221,33 +233,38 @@ static inline void __up_write(struct rw_ static inline void __downgrade_write(struct rw_semaphore *sem) { asm volatile("# beginning __downgrade_write\n\t" - LOCK_PREFIX " addl %2,(%%eax)\n\t" - /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ + LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t" + /* + * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386) + * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64) + */ " jns 1f\n\t" " call call_rwsem_downgrade_wake\n" "1:\n\t" "# ending __downgrade_write\n" : "+m" (sem->count) - : "a" (sem), "i" (-RWSEM_WAITING_BIAS) + : "a" (sem), "er" (-RWSEM_WAITING_BIAS) : "memory", "cc"); } /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(rwsem_count_t delta, + struct rw_semaphore *sem) { - asm volatile(LOCK_PREFIX "addl %1,%0" + asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0" : "+m" (sem->count) - : "ir" (delta)); + : "er" (delta)); } /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta, + struct rw_semaphore *sem) { - int tmp = delta; + rwsem_count_t tmp = delta; asm volatile(LOCK_PREFIX "xadd %0,%1" : "+r" (tmp), "+m" (sem->count) diff -uprN linux-2.6.32/arch/x86/include/asm/sections.h linux-2.6.32.ovz/arch/x86/include/asm/sections.h --- linux-2.6.32/arch/x86/include/asm/sections.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/sections.h 2015-03-10 13:21:47.000000000 -0700 @@ -2,7 +2,15 @@ #define _ASM_X86_SECTIONS_H #include +#ifndef __GENKSYMS__ +#include +#endif extern char __brk_base[], __brk_limit[]; +extern struct exception_table_entry __stop___ex_table[]; + +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +extern char __end_rodata_hpage_align[]; +#endif #endif /* _ASM_X86_SECTIONS_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/setup.h linux-2.6.32.ovz/arch/x86/include/asm/setup.h --- linux-2.6.32/arch/x86/include/asm/setup.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/setup.h 2015-03-10 13:22:30.000000000 -0700 @@ -95,10 +95,11 @@ void *extend_brk(size_t size, size_t ali : : "i" (sz)); \ } +extern void probe_roms(void); + #ifdef __i386__ void __init i386_start_kernel(void); -extern void probe_roms(void); #else void __init x86_64_start_kernel(char *real_mode); diff -uprN linux-2.6.32/arch/x86/include/asm/smp.h linux-2.6.32.ovz/arch/x86/include/asm/smp.h --- linux-2.6.32/arch/x86/include/asm/smp.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/smp.h 2015-03-10 13:22:38.000000000 -0700 @@ -21,6 +21,15 @@ extern int smp_num_siblings; extern unsigned int num_processors; +static inline bool cpu_has_ht_siblings(void) +{ + bool has_siblings = false; +#ifdef CONFIG_SMP + has_siblings = cpu_has_ht && smp_num_siblings > 1; +#endif + return has_siblings; +} + DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); DECLARE_PER_CPU(u16, cpu_llc_id); @@ -50,7 +59,7 @@ struct smp_ops { void (*smp_prepare_cpus)(unsigned max_cpus); void (*smp_cpus_done)(unsigned max_cpus); - void (*smp_send_stop)(void); + void (*stop_other_cpus)(int wait); void (*smp_send_reschedule)(int cpu); int (*cpu_up)(unsigned cpu); @@ -73,7 +82,12 @@ extern struct smp_ops smp_ops; static inline void smp_send_stop(void) { - smp_ops.smp_send_stop(); + smp_ops.stop_other_cpus(0); +} + +static inline void stop_other_cpus(void) +{ + smp_ops.stop_other_cpus(1); } static inline void smp_prepare_boot_cpu(void) @@ -135,6 +149,8 @@ int native_cpu_disable(void); void native_cpu_die(unsigned int cpu); void native_play_dead(void); void play_dead_common(void); +void wbinvd_on_cpu(int cpu); +int wbinvd_on_all_cpus(void); void native_send_call_func_ipi(const struct cpumask *mask); void native_send_call_func_single_ipi(int cpu); @@ -147,6 +163,13 @@ static inline int num_booting_cpus(void) { return cpumask_weight(cpu_callout_mask); } +#else /* !CONFIG_SMP */ +#define wbinvd_on_cpu(cpu) wbinvd() +static inline int wbinvd_on_all_cpus (void) +{ + wbinvd(); + return 0; +} #endif /* CONFIG_SMP */ extern unsigned disabled_cpus __cpuinitdata; diff -uprN linux-2.6.32/arch/x86/include/asm/spinlock.h linux-2.6.32.ovz/arch/x86/include/asm/spinlock.h --- linux-2.6.32/arch/x86/include/asm/spinlock.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/spinlock.h 2015-03-10 13:21:33.000000000 -0700 @@ -34,8 +34,10 @@ * (PPro errata 66, 92) */ # define UNLOCK_LOCK_PREFIX LOCK_PREFIX +# define UNLOCK_LOCK_ALT_PREFIX "nop;" #else # define UNLOCK_LOCK_PREFIX +# define UNLOCK_LOCK_ALT_PREFIX #endif /* @@ -60,19 +62,27 @@ static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) { - short inc = 0x0100; + short inc; asm volatile ( + "1:\t\n" + "mov $0x100, %0\n\t" LOCK_PREFIX "xaddw %w0, %1\n" - "1:\t" + "2:\t" "cmpb %h0, %b0\n\t" - "je 2f\n\t" + "je 4f\n\t" + "3:\t\n" "rep ; nop\n\t" + ALTERNATIVE( "movb %1, %b0\n\t" /* don't need lfence here, because loads are in-order */ - "jmp 1b\n" - "2:" - : "+Q" (inc), "+m" (lock->slock) + "jmp 2b\n", + "", X86_FEATURE_UNFAIR_SPINLOCK)"\n\t" + "cmpw $0, %1\n\t" + "jne 3b\n\t" + "jmp 1b\n\t" + "4:" + : "=Q" (inc), "+m" (lock->slock) : : "memory", "cc"); } @@ -98,31 +108,41 @@ static __always_inline int __ticket_spin static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) { - asm volatile(UNLOCK_LOCK_PREFIX "incb %0" - : "+m" (lock->slock) - : - : "memory", "cc"); + asm volatile( + ALTERNATIVE(UNLOCK_LOCK_PREFIX"incb (%0);"ASM_NOP3, + UNLOCK_LOCK_ALT_PREFIX"movw $0, (%0)", + X86_FEATURE_UNFAIR_SPINLOCK) + : + : "Q" (&lock->slock) + : "memory", "cc"); } #else #define TICKET_SHIFT 16 static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) { - int inc = 0x00010000; + int inc; int tmp; - asm volatile(LOCK_PREFIX "xaddl %0, %1\n" + asm volatile("1:\t\n" + "mov $0x10000, %0\n\t" + LOCK_PREFIX "xaddl %0, %1\n" "movzwl %w0, %2\n\t" "shrl $16, %0\n\t" - "1:\t" + "2:\t" "cmpl %0, %2\n\t" - "je 2f\n\t" + "je 4f\n\t" + "3:\n\t" "rep ; nop\n\t" + ALTERNATIVE( "movzwl %1, %2\n\t" /* don't need lfence here, because loads are in-order */ - "jmp 1b\n" - "2:" - : "+r" (inc), "+m" (lock->slock), "=&r" (tmp) + "jmp 2b\n\t", "", X86_FEATURE_UNFAIR_SPINLOCK)"\n\t" + "cmp $0, %1\n\t" + "jne 3b\n\t" + "jmp 1b\n\t" + "4:" + : "=&r" (inc), "+m" (lock->slock), "=&r" (tmp) : : "memory", "cc"); } @@ -151,9 +171,11 @@ static __always_inline int __ticket_spin static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) { - asm volatile(UNLOCK_LOCK_PREFIX "incw %0" - : "+m" (lock->slock) + asm volatile(ALTERNATIVE(UNLOCK_LOCK_PREFIX "incw (%0);"ASM_NOP3, + UNLOCK_LOCK_ALT_PREFIX"movl $0, (%0)", + X86_FEATURE_UNFAIR_SPINLOCK) : + : "Q" (&lock->slock) : "memory", "cc"); } #endif diff -uprN linux-2.6.32/arch/x86/include/asm/stacktrace.h linux-2.6.32.ovz/arch/x86/include/asm/stacktrace.h --- linux-2.6.32/arch/x86/include/asm/stacktrace.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/stacktrace.h 2015-03-10 13:22:06.000000000 -0700 @@ -1,9 +1,38 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ + #ifndef _ASM_X86_STACKTRACE_H #define _ASM_X86_STACKTRACE_H +#include +#include + extern int kstack_depth_to_print; -int x86_is_stack_id(int id, char *name); +struct thread_info; +struct stacktrace_ops; + +typedef unsigned long (*walk_stack_t)(struct thread_info *tinfo, + unsigned long *stack, + unsigned long bp, + const struct stacktrace_ops *ops, + void *data, + unsigned long *end, + int *graph); + +extern unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph); + +extern unsigned long +print_context_stack_bp(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph); /* Generic stack tracer with callbacks */ @@ -14,10 +43,79 @@ struct stacktrace_ops { void (*address)(void *data, unsigned long address, int reliable); /* On negative return stop dumping */ int (*stack)(void *data, char *name); + walk_stack_t walk_stack; }; void dump_trace(struct task_struct *tsk, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, + unsigned long *stack, const struct stacktrace_ops *ops, void *data); +#ifdef CONFIG_X86_32 +#define STACKSLOTS_PER_LINE 8 +#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) +#else +#define STACKSLOTS_PER_LINE 4 +#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) +#endif + +#ifdef CONFIG_FRAME_POINTER +static inline unsigned long +stack_frame(struct task_struct *task, struct pt_regs *regs) +{ + unsigned long bp; + + if (regs) + return regs->bp; + + if (task == current) { + /* Grab bp right from our regs */ + get_bp(bp); + return bp; + } + + /* bp is the last reg pushed by switch_to */ + return *(unsigned long *)task->thread.sp; +} +#else +static inline unsigned long +stack_frame(struct task_struct *task, struct pt_regs *regs) +{ + return 0; +} +#endif + +extern void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, char *log_lvl); + +extern void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, char *log_lvl); + +extern unsigned int code_bytes; + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + +struct stack_frame_ia32 { + u32 next_frame; + u32 return_address; +}; + +static inline unsigned long caller_frame_pointer(void) +{ + struct stack_frame *frame; + + get_bp(frame); + +#ifdef CONFIG_FRAME_POINTER + frame = frame->next_frame; +#endif + + return (unsigned long)frame; +} + #endif /* _ASM_X86_STACKTRACE_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/string_32.h linux-2.6.32.ovz/arch/x86/include/asm/string_32.h --- linux-2.6.32/arch/x86/include/asm/string_32.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/string_32.h 2015-03-10 13:21:34.000000000 -0700 @@ -177,10 +177,15 @@ static inline void *__memcpy3d(void *to, */ #ifndef CONFIG_KMEMCHECK + +#if (__GNUC__ >= 4) +#define memcpy(t, f, n) __builtin_memcpy(t, f, n) +#else #define memcpy(t, f, n) \ (__builtin_constant_p((n)) \ ? __constant_memcpy((t), (f), (n)) \ : __memcpy((t), (f), (n))) +#endif #else /* * kmemcheck becomes very happy if we use the REP instructions unconditionally, @@ -316,11 +321,15 @@ void *__constant_c_and_count_memset(void : __memset_generic((s), (c), (count))) #define __HAVE_ARCH_MEMSET +#if (__GNUC__ >= 4) +#define memset(s, c, count) __builtin_memset(s, c, count) +#else #define memset(s, c, count) \ (__builtin_constant_p(c) \ ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \ (count)) \ : __memset((s), (c), (count))) +#endif /* * find the first occurrence of byte 'c', or 1 past the area if none diff -uprN linux-2.6.32/arch/x86/include/asm/svm.h linux-2.6.32.ovz/arch/x86/include/asm/svm.h --- linux-2.6.32/arch/x86/include/asm/svm.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/svm.h 2015-03-10 13:23:56.000000000 -0700 @@ -1,6 +1,135 @@ #ifndef __SVM_H #define __SVM_H +#define SVM_EXIT_READ_CR0 0x000 +#define SVM_EXIT_READ_CR3 0x003 +#define SVM_EXIT_READ_CR4 0x004 +#define SVM_EXIT_READ_CR8 0x008 +#define SVM_EXIT_WRITE_CR0 0x010 +#define SVM_EXIT_WRITE_CR3 0x013 +#define SVM_EXIT_WRITE_CR4 0x014 +#define SVM_EXIT_WRITE_CR8 0x018 +#define SVM_EXIT_READ_DR0 0x020 +#define SVM_EXIT_READ_DR1 0x021 +#define SVM_EXIT_READ_DR2 0x022 +#define SVM_EXIT_READ_DR3 0x023 +#define SVM_EXIT_READ_DR4 0x024 +#define SVM_EXIT_READ_DR5 0x025 +#define SVM_EXIT_READ_DR6 0x026 +#define SVM_EXIT_READ_DR7 0x027 +#define SVM_EXIT_WRITE_DR0 0x030 +#define SVM_EXIT_WRITE_DR1 0x031 +#define SVM_EXIT_WRITE_DR2 0x032 +#define SVM_EXIT_WRITE_DR3 0x033 +#define SVM_EXIT_WRITE_DR4 0x034 +#define SVM_EXIT_WRITE_DR5 0x035 +#define SVM_EXIT_WRITE_DR6 0x036 +#define SVM_EXIT_WRITE_DR7 0x037 +#define SVM_EXIT_EXCP_BASE 0x040 +#define SVM_EXIT_INTR 0x060 +#define SVM_EXIT_NMI 0x061 +#define SVM_EXIT_SMI 0x062 +#define SVM_EXIT_INIT 0x063 +#define SVM_EXIT_VINTR 0x064 +#define SVM_EXIT_CR0_SEL_WRITE 0x065 +#define SVM_EXIT_IDTR_READ 0x066 +#define SVM_EXIT_GDTR_READ 0x067 +#define SVM_EXIT_LDTR_READ 0x068 +#define SVM_EXIT_TR_READ 0x069 +#define SVM_EXIT_IDTR_WRITE 0x06a +#define SVM_EXIT_GDTR_WRITE 0x06b +#define SVM_EXIT_LDTR_WRITE 0x06c +#define SVM_EXIT_TR_WRITE 0x06d +#define SVM_EXIT_RDTSC 0x06e +#define SVM_EXIT_RDPMC 0x06f +#define SVM_EXIT_PUSHF 0x070 +#define SVM_EXIT_POPF 0x071 +#define SVM_EXIT_CPUID 0x072 +#define SVM_EXIT_RSM 0x073 +#define SVM_EXIT_IRET 0x074 +#define SVM_EXIT_SWINT 0x075 +#define SVM_EXIT_INVD 0x076 +#define SVM_EXIT_PAUSE 0x077 +#define SVM_EXIT_HLT 0x078 +#define SVM_EXIT_INVLPG 0x079 +#define SVM_EXIT_INVLPGA 0x07a +#define SVM_EXIT_IOIO 0x07b +#define SVM_EXIT_MSR 0x07c +#define SVM_EXIT_TASK_SWITCH 0x07d +#define SVM_EXIT_FERR_FREEZE 0x07e +#define SVM_EXIT_SHUTDOWN 0x07f +#define SVM_EXIT_VMRUN 0x080 +#define SVM_EXIT_VMMCALL 0x081 +#define SVM_EXIT_VMLOAD 0x082 +#define SVM_EXIT_VMSAVE 0x083 +#define SVM_EXIT_STGI 0x084 +#define SVM_EXIT_CLGI 0x085 +#define SVM_EXIT_SKINIT 0x086 +#define SVM_EXIT_RDTSCP 0x087 +#define SVM_EXIT_ICEBP 0x088 +#define SVM_EXIT_WBINVD 0x089 +#define SVM_EXIT_MONITOR 0x08a +#define SVM_EXIT_MWAIT 0x08b +#define SVM_EXIT_MWAIT_COND 0x08c +#define SVM_EXIT_XSETBV 0x08d +#define SVM_EXIT_NPF 0x400 + +#define SVM_EXIT_ERR -1 + +#define SVM_EXIT_REASONS \ + { SVM_EXIT_READ_CR0, "read_cr0" }, \ + { SVM_EXIT_READ_CR3, "read_cr3" }, \ + { SVM_EXIT_READ_CR4, "read_cr4" }, \ + { SVM_EXIT_READ_CR8, "read_cr8" }, \ + { SVM_EXIT_WRITE_CR0, "write_cr0" }, \ + { SVM_EXIT_WRITE_CR3, "write_cr3" }, \ + { SVM_EXIT_WRITE_CR4, "write_cr4" }, \ + { SVM_EXIT_WRITE_CR8, "write_cr8" }, \ + { SVM_EXIT_READ_DR0, "read_dr0" }, \ + { SVM_EXIT_READ_DR1, "read_dr1" }, \ + { SVM_EXIT_READ_DR2, "read_dr2" }, \ + { SVM_EXIT_READ_DR3, "read_dr3" }, \ + { SVM_EXIT_WRITE_DR0, "write_dr0" }, \ + { SVM_EXIT_WRITE_DR1, "write_dr1" }, \ + { SVM_EXIT_WRITE_DR2, "write_dr2" }, \ + { SVM_EXIT_WRITE_DR3, "write_dr3" }, \ + { SVM_EXIT_WRITE_DR5, "write_dr5" }, \ + { SVM_EXIT_WRITE_DR7, "write_dr7" }, \ + { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \ + { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \ + { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ + { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ + { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ + { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ + { SVM_EXIT_INTR, "interrupt" }, \ + { SVM_EXIT_NMI, "nmi" }, \ + { SVM_EXIT_SMI, "smi" }, \ + { SVM_EXIT_INIT, "init" }, \ + { SVM_EXIT_VINTR, "vintr" }, \ + { SVM_EXIT_CPUID, "cpuid" }, \ + { SVM_EXIT_INVD, "invd" }, \ + { SVM_EXIT_HLT, "hlt" }, \ + { SVM_EXIT_INVLPG, "invlpg" }, \ + { SVM_EXIT_INVLPGA, "invlpga" }, \ + { SVM_EXIT_IOIO, "io" }, \ + { SVM_EXIT_MSR, "msr" }, \ + { SVM_EXIT_TASK_SWITCH, "task_switch" }, \ + { SVM_EXIT_SHUTDOWN, "shutdown" }, \ + { SVM_EXIT_VMRUN, "vmrun" }, \ + { SVM_EXIT_VMMCALL, "hypercall" }, \ + { SVM_EXIT_VMLOAD, "vmload" }, \ + { SVM_EXIT_VMSAVE, "vmsave" }, \ + { SVM_EXIT_STGI, "stgi" }, \ + { SVM_EXIT_CLGI, "clgi" }, \ + { SVM_EXIT_SKINIT, "skinit" }, \ + { SVM_EXIT_WBINVD, "wbinvd" }, \ + { SVM_EXIT_MONITOR, "monitor" }, \ + { SVM_EXIT_MWAIT, "mwait" }, \ + { SVM_EXIT_XSETBV, "xsetbv" }, \ + { SVM_EXIT_NPF, "npf" } + +#ifdef __KERNEL__ + enum { INTERCEPT_INTR, INTERCEPT_NMI, @@ -47,6 +176,7 @@ enum { INTERCEPT_MONITOR, INTERCEPT_MWAIT, INTERCEPT_MWAIT_COND, + INTERCEPT_XSETBV, }; @@ -57,7 +187,8 @@ struct __attribute__ ((__packed__)) vmcb u16 intercept_dr_write; u32 intercept_exceptions; u64 intercept; - u8 reserved_1[44]; + u8 reserved_1[42]; + u16 pause_filter_count; u64 iopm_base_pa; u64 msrpm_base_pa; u64 tsc_offset; @@ -80,12 +211,19 @@ struct __attribute__ ((__packed__)) vmcb u32 event_inj_err; u64 nested_cr3; u64 lbr_ctl; - u8 reserved_5[832]; + u32 clean; + u32 reserved_5; + u64 next_rip; + u8 insn_len; + u8 insn_bytes[15]; + u8 reserved_6[800]; }; #define TLB_CONTROL_DO_NOTHING 0 #define TLB_CONTROL_FLUSH_ALL_ASID 1 +#define TLB_CONTROL_FLUSH_ASID 3 +#define TLB_CONTROL_FLUSH_ASID_LOCAL 7 #define V_TPR_MASK 0x0f @@ -237,80 +375,9 @@ struct __attribute__ ((__packed__)) vmcb #define SVM_EXITINFOSHIFT_TS_REASON_IRET 36 #define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 +#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 -#define SVM_EXIT_READ_CR0 0x000 -#define SVM_EXIT_READ_CR3 0x003 -#define SVM_EXIT_READ_CR4 0x004 -#define SVM_EXIT_READ_CR8 0x008 -#define SVM_EXIT_WRITE_CR0 0x010 -#define SVM_EXIT_WRITE_CR3 0x013 -#define SVM_EXIT_WRITE_CR4 0x014 -#define SVM_EXIT_WRITE_CR8 0x018 -#define SVM_EXIT_READ_DR0 0x020 -#define SVM_EXIT_READ_DR1 0x021 -#define SVM_EXIT_READ_DR2 0x022 -#define SVM_EXIT_READ_DR3 0x023 -#define SVM_EXIT_READ_DR4 0x024 -#define SVM_EXIT_READ_DR5 0x025 -#define SVM_EXIT_READ_DR6 0x026 -#define SVM_EXIT_READ_DR7 0x027 -#define SVM_EXIT_WRITE_DR0 0x030 -#define SVM_EXIT_WRITE_DR1 0x031 -#define SVM_EXIT_WRITE_DR2 0x032 -#define SVM_EXIT_WRITE_DR3 0x033 -#define SVM_EXIT_WRITE_DR4 0x034 -#define SVM_EXIT_WRITE_DR5 0x035 -#define SVM_EXIT_WRITE_DR6 0x036 -#define SVM_EXIT_WRITE_DR7 0x037 -#define SVM_EXIT_EXCP_BASE 0x040 -#define SVM_EXIT_INTR 0x060 -#define SVM_EXIT_NMI 0x061 -#define SVM_EXIT_SMI 0x062 -#define SVM_EXIT_INIT 0x063 -#define SVM_EXIT_VINTR 0x064 -#define SVM_EXIT_CR0_SEL_WRITE 0x065 -#define SVM_EXIT_IDTR_READ 0x066 -#define SVM_EXIT_GDTR_READ 0x067 -#define SVM_EXIT_LDTR_READ 0x068 -#define SVM_EXIT_TR_READ 0x069 -#define SVM_EXIT_IDTR_WRITE 0x06a -#define SVM_EXIT_GDTR_WRITE 0x06b -#define SVM_EXIT_LDTR_WRITE 0x06c -#define SVM_EXIT_TR_WRITE 0x06d -#define SVM_EXIT_RDTSC 0x06e -#define SVM_EXIT_RDPMC 0x06f -#define SVM_EXIT_PUSHF 0x070 -#define SVM_EXIT_POPF 0x071 -#define SVM_EXIT_CPUID 0x072 -#define SVM_EXIT_RSM 0x073 -#define SVM_EXIT_IRET 0x074 -#define SVM_EXIT_SWINT 0x075 -#define SVM_EXIT_INVD 0x076 -#define SVM_EXIT_PAUSE 0x077 -#define SVM_EXIT_HLT 0x078 -#define SVM_EXIT_INVLPG 0x079 -#define SVM_EXIT_INVLPGA 0x07a -#define SVM_EXIT_IOIO 0x07b -#define SVM_EXIT_MSR 0x07c -#define SVM_EXIT_TASK_SWITCH 0x07d -#define SVM_EXIT_FERR_FREEZE 0x07e -#define SVM_EXIT_SHUTDOWN 0x07f -#define SVM_EXIT_VMRUN 0x080 -#define SVM_EXIT_VMMCALL 0x081 -#define SVM_EXIT_VMLOAD 0x082 -#define SVM_EXIT_VMSAVE 0x083 -#define SVM_EXIT_STGI 0x084 -#define SVM_EXIT_CLGI 0x085 -#define SVM_EXIT_SKINIT 0x086 -#define SVM_EXIT_RDTSCP 0x087 -#define SVM_EXIT_ICEBP 0x088 -#define SVM_EXIT_WBINVD 0x089 -#define SVM_EXIT_MONITOR 0x08a -#define SVM_EXIT_MWAIT 0x08b -#define SVM_EXIT_MWAIT_COND 0x08c -#define SVM_EXIT_NPF 0x400 - -#define SVM_EXIT_ERR -1 +#define SVM_EXITINFO_REG_MASK 0x0F #define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ @@ -323,3 +390,4 @@ struct __attribute__ ((__packed__)) vmcb #endif +#endif diff -uprN linux-2.6.32/arch/x86/include/asm/syscalls.h linux-2.6.32.ovz/arch/x86/include/asm/syscalls.h --- linux-2.6.32/arch/x86/include/asm/syscalls.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/syscalls.h 2015-03-10 13:21:11.000000000 -0700 @@ -55,8 +55,6 @@ struct sel_arg_struct; struct oldold_utsname; struct old_utsname; -asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, unsigned long); asmlinkage int old_mmap(struct mmap_arg_struct __user *); asmlinkage int old_select(struct sel_arg_struct __user *); asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); diff -uprN linux-2.6.32/arch/x86/include/asm/sys_ia32.h linux-2.6.32.ovz/arch/x86/include/asm/sys_ia32.h --- linux-2.6.32/arch/x86/include/asm/sys_ia32.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/sys_ia32.h 2015-03-10 13:21:11.000000000 -0700 @@ -62,9 +62,6 @@ asmlinkage long sys32_pwrite(unsigned in asmlinkage long sys32_personality(unsigned long); asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); -asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, unsigned long); - struct oldold_utsname; struct old_utsname; asmlinkage long sys32_olduname(struct oldold_utsname __user *); diff -uprN linux-2.6.32/arch/x86/include/asm/system.h linux-2.6.32.ovz/arch/x86/include/asm/system.h --- linux-2.6.32/arch/x86/include/asm/system.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/system.h 2015-03-10 13:22:21.000000000 -0700 @@ -23,6 +23,7 @@ struct task_struct *__switch_to(struct t struct tss_struct; void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); +extern void show_regs_common(void); #ifdef CONFIG_X86_32 diff -uprN linux-2.6.32/arch/x86/include/asm/thread_info.h linux-2.6.32.ovz/arch/x86/include/asm/thread_info.h --- linux-2.6.32/arch/x86/include/asm/thread_info.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/thread_info.h 2015-06-23 04:16:16.000000000 -0700 @@ -83,19 +83,19 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ +#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ -#define TIF_ABI_PENDING 19 #define TIF_MEMDIE 20 #define TIF_DEBUG 21 /* uses debug registers */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ #define TIF_FREEZE 23 /* is freezing for suspend */ #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ -#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ -#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ +#define TIF_BLOCKSTEP 25 /* set when we want DEBUGCTLMSR_BTF */ #define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ +#define TIF_RESUME 29 #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -107,18 +107,18 @@ struct thread_info { #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) +#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) -#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING) #define _TIF_DEBUG (1 << TIF_DEBUG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FREEZE (1 << TIF_FREEZE) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) -#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) -#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) +#define _TIF_BLOCKSTEP (1 << TIF_BLOCKSTEP) #define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) +#define _TIF_RESUME (1<status & TS_POLLING) diff -uprN linux-2.6.32/arch/x86/include/asm/timer.h linux-2.6.32.ovz/arch/x86/include/asm/timer.h --- linux-2.6.32/arch/x86/include/asm/timer.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/timer.h 2015-03-10 13:24:01.000000000 -0700 @@ -38,6 +38,22 @@ extern int no_timer_check; * (mathieu.desnoyers@polymtl.ca) * * -johnstul@us.ibm.com "math is hard, lets go shopping!" + * + * In: + * + * ns = cycles * cyc2ns_scale / SC + * + * Although we may still have enough bits to store the value of ns, + * in some cases, we may not have enough bits to store cycles * cyc2ns_scale, + * leading to an incorrect result. + * + * To avoid this, we can decompose 'cycles' into quotient and remainder + * of division by SC. Then, + * + * ns = (quot * SC + rem) * cyc2ns_scale / SC + * = quot * cyc2ns_scale + (rem * cyc2ns_scale) / SC + * + * - sqazi@google.com */ DECLARE_PER_CPU(unsigned long, cyc2ns); @@ -49,7 +65,8 @@ static inline unsigned long long __cycle { int cpu = smp_processor_id(); unsigned long long ns = per_cpu(cyc2ns_offset, cpu); - ns += cyc * per_cpu(cyc2ns, cpu) >> CYC2NS_SCALE_FACTOR; + ns += mult_frac(cyc, per_cpu(cyc2ns, cpu), + (1UL << CYC2NS_SCALE_FACTOR)); return ns; } diff -uprN linux-2.6.32/arch/x86/include/asm/tlbflush.h linux-2.6.32.ovz/arch/x86/include/asm/tlbflush.h --- linux-2.6.32/arch/x86/include/asm/tlbflush.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/tlbflush.h 2015-06-23 04:16:16.000000000 -0700 @@ -105,6 +105,7 @@ static inline void flush_tlb_page(struct if (vma->vm_mm == current->active_mm) __flush_tlb_one(addr); } +EXPORT_SYMBOL(flush_tlb_page); static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) diff -uprN linux-2.6.32/arch/x86/include/asm/topology.h linux-2.6.32.ovz/arch/x86/include/asm/topology.h --- linux-2.6.32/arch/x86/include/asm/topology.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/topology.h 2015-03-10 13:24:08.000000000 -0700 @@ -35,11 +35,16 @@ # endif #endif -/* Node not present */ -#define NUMA_NO_NODE (-1) +/* + * to preserve the visibility of NUMA_NO_NODE definition, + * moved to there from here. May be used independent of + * CONFIG_NUMA. + */ +#include #ifdef CONFIG_NUMA #include + #include #ifdef CONFIG_X86_32 @@ -194,7 +199,8 @@ static inline void arch_fix_phys_package } struct pci_bus; -void x86_pci_root_bus_res_quirks(struct pci_bus *b); +int x86_pci_root_bus_node(int bus); +void x86_pci_root_bus_resources(int bus, struct list_head *resources); #ifdef CONFIG_SMP #define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \ @@ -202,17 +208,4 @@ void x86_pci_root_bus_res_quirks(struct #define smt_capable() (smp_num_siblings > 1) #endif -#ifdef CONFIG_NUMA -extern int get_mp_bus_to_node(int busnum); -extern void set_mp_bus_to_node(int busnum, int node); -#else -static inline int get_mp_bus_to_node(int busnum) -{ - return 0; -} -static inline void set_mp_bus_to_node(int busnum, int node) -{ -} -#endif - #endif /* _ASM_X86_TOPOLOGY_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/trace/irq_vectors.h linux-2.6.32.ovz/arch/x86/include/asm/trace/irq_vectors.h --- linux-2.6.32/arch/x86/include/asm/trace/irq_vectors.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/trace/irq_vectors.h 2015-06-23 04:16:16.000000000 -0700 @@ -0,0 +1,114 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM irq_vectors + +#if !defined(_TRACE_IRQ_VECTORS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_IRQ_VECTORS_H + +#include + +extern void trace_irq_vector_regfunc(void); +extern void trace_irq_vector_unregfunc(void); + +DECLARE_EVENT_CLASS(x86_irq_vector, + + TP_PROTO(int vector), + + TP_ARGS(vector), + + TP_STRUCT__entry( + __field( int, vector ) + ), + + TP_fast_assign( + __entry->vector = vector; + ), + + TP_printk("vector=%d", __entry->vector) ); + +#define DEFINE_IRQ_VECTOR_EVENT(name) \ +DEFINE_EVENT_FN(x86_irq_vector, name##_entry, \ + TP_PROTO(int vector), \ + TP_ARGS(vector), \ + trace_irq_vector_regfunc, \ + trace_irq_vector_unregfunc); \ +DEFINE_EVENT_FN(x86_irq_vector, name##_exit, \ + TP_PROTO(int vector), \ + TP_ARGS(vector), \ + trace_irq_vector_regfunc, \ + trace_irq_vector_unregfunc); + + +/* + * local_timer - called when entering/exiting a local timer interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(local_timer); + +/* + * reschedule - called when entering/exiting a reschedule vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(reschedule); + +/* + * spurious_apic - called when entering/exiting a spurious apic vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(spurious_apic); + +/* + * error_apic - called when entering/exiting an error apic vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(error_apic); + +/* + * x86_platform_ipi - called when entering/exiting a x86 platform ipi interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(x86_platform_ipi); + +/* + * irq_work - called when entering/exiting a irq work interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(irq_work); + +/* + * call_function - called when entering/exiting a call function interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(call_function); + +/* + * call_function_single - called when entering/exiting a call function + * single interrupt vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(call_function_single); + +/* + * threshold_apic - called when entering/exiting a threshold apic interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(threshold_apic); + +/* + * thermal_apic - called when entering/exiting a thermal apic interrupt + * vector handler + */ +DEFINE_IRQ_VECTOR_EVENT(thermal_apic); + +/* + * monitor_ipi - called when ... + */ +DEFINE_IRQ_VECTOR_EVENT(monitor_ipi); + +/* + * monitor_posted_interrupt - called when ... + */ +DEFINE_IRQ_VECTOR_EVENT(monitor_posted_interrupt); + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE irq_vectors +#endif /* _TRACE_IRQ_VECTORS_H */ + +/* This part must be outside protection */ +#include diff -uprN linux-2.6.32/arch/x86/include/asm/trace_clock.h linux-2.6.32.ovz/arch/x86/include/asm/trace_clock.h --- linux-2.6.32/arch/x86/include/asm/trace_clock.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/trace_clock.h 2015-03-10 13:23:58.000000000 -0700 @@ -0,0 +1,20 @@ +#ifndef _ASM_X86_TRACE_CLOCK_H +#define _ASM_X86_TRACE_CLOCK_H + +#include +#include + +#ifdef CONFIG_X86_TSC + +extern u64 notrace trace_clock_x86_tsc(void); + +# define ARCH_TRACE_CLOCKS \ + { trace_clock_x86_tsc, "x86-tsc", .in_ns = 0 }, + +#else /* !CONFIG_X86_TSC */ + +#define ARCH_TRACE_CLOCKS + +#endif + +#endif /* _ASM_X86_TRACE_CLOCK_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/trampoline.h linux-2.6.32.ovz/arch/x86/include/asm/trampoline.h --- linux-2.6.32/arch/x86/include/asm/trampoline.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/trampoline.h 2015-03-10 13:22:14.000000000 -0700 @@ -13,15 +13,18 @@ extern unsigned char *trampoline_base; extern unsigned long init_rsp; extern unsigned long initial_code; +extern unsigned long initial_page_table; extern unsigned long initial_gs; #define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) #define TRAMPOLINE_BASE 0x6000 extern unsigned long setup_trampoline(void); +extern void __init setup_trampoline_page_table(void); extern void __init reserve_trampoline_memory(void); #else static inline void reserve_trampoline_memory(void) {}; +extern void __init setup_trampoline_page_table(void) {}; #endif /* CONFIG_X86_TRAMPOLINE */ #endif /* __ASSEMBLY__ */ diff -uprN linux-2.6.32/arch/x86/include/asm/traps.h linux-2.6.32.ovz/arch/x86/include/asm/traps.h --- linux-2.6.32/arch/x86/include/asm/traps.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/traps.h 2015-06-23 04:16:16.000000000 -0700 @@ -1,6 +1,8 @@ #ifndef _ASM_X86_TRAPS_H #define _ASM_X86_TRAPS_H +#include + #include #include /* TRAP_TRACE, ... */ @@ -86,4 +88,31 @@ asmlinkage void smp_thermal_interrupt(vo asmlinkage void mce_threshold_interrupt(void); #endif +void do_cpuid_fault(struct pt_regs *); + +/* Interrupts/Exceptions */ +enum { + X86_TRAP_DE = 0, /* 0, Divide-by-zero */ + X86_TRAP_DB, /* 1, Debug */ + X86_TRAP_NMI, /* 2, Non-maskable Interrupt */ + X86_TRAP_BP, /* 3, Breakpoint */ + X86_TRAP_OF, /* 4, Overflow */ + X86_TRAP_BR, /* 5, Bound Range Exceeded */ + X86_TRAP_UD, /* 6, Invalid Opcode */ + X86_TRAP_NM, /* 7, Device Not Available */ + X86_TRAP_DF, /* 8, Double Fault */ + X86_TRAP_OLD_MF, /* 9, Coprocessor Segment Overrun */ + X86_TRAP_TS, /* 10, Invalid TSS */ + X86_TRAP_NP, /* 11, Segment Not Present */ + X86_TRAP_SS, /* 12, Stack Segment Fault */ + X86_TRAP_GP, /* 13, General Protection Fault */ + X86_TRAP_PF, /* 14, Page Fault */ + X86_TRAP_SPURIOUS, /* 15, Spurious Interrupt */ + X86_TRAP_MF, /* 16, x87 Floating-Point Exception */ + X86_TRAP_AC, /* 17, Alignment Check */ + X86_TRAP_MC, /* 18, Machine Check */ + X86_TRAP_XF, /* 19, SIMD Floating-Point Exception */ + X86_TRAP_IRET = 32, /* 32, IRET Exception */ +}; + #endif /* _ASM_X86_TRAPS_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/tsc.h linux-2.6.32.ovz/arch/x86/include/asm/tsc.h --- linux-2.6.32/arch/x86/include/asm/tsc.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/tsc.h 2015-06-23 04:16:16.000000000 -0700 @@ -24,7 +24,7 @@ static inline cycles_t get_cycles(void) unsigned long long ret = 0; #ifndef CONFIG_X86_TSC - if (!cpu_has_tsc) + if (WARN_ON_ONCE(!cpu_has_tsc)) return 0; #endif rdtscll(ret); @@ -51,6 +51,8 @@ extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern unsigned long native_calibrate_tsc(void); +extern int tsc_clocksource_reliable; + /* * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: @@ -59,5 +61,7 @@ extern void check_tsc_sync_source(int cp extern void check_tsc_sync_target(void); extern int notsc_setup(char *); +extern void tsc_save_sched_clock_state(void); +extern void tsc_restore_sched_clock_state(void); #endif /* _ASM_X86_TSC_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/types.h linux-2.6.32.ovz/arch/x86/include/asm/types.h --- linux-2.6.32/arch/x86/include/asm/types.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/types.h 2015-03-10 13:23:18.000000000 -0700 @@ -1,20 +1,12 @@ #ifndef _ASM_X86_TYPES_H #define _ASM_X86_TYPES_H -#define dma_addr_t dma_addr_t - #include #ifdef __KERNEL__ #ifndef __ASSEMBLY__ typedef u64 dma64_addr_t; -#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G) -/* DMA addresses come in 32-bit and 64-bit flavours. */ -typedef u64 dma_addr_t; -#else -typedef u32 dma_addr_t; -#endif #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff -uprN linux-2.6.32/arch/x86/include/asm/uaccess_64.h linux-2.6.32.ovz/arch/x86/include/asm/uaccess_64.h --- linux-2.6.32/arch/x86/include/asm/uaccess_64.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/uaccess_64.h 2015-03-10 13:22:35.000000000 -0700 @@ -17,6 +17,10 @@ /* Handles exceptions in both to and from, but doesn't do access_ok */ __must_check unsigned long copy_user_generic(void *to, const void *from, unsigned len); +__must_check unsigned long +copy_user_generic_string(void *to, const void *from, unsigned len); +__must_check unsigned long +copy_user_generic_unrolled(void *to, const void *from, unsigned len); __must_check unsigned long copy_to_user(void __user *to, const void *from, unsigned len); diff -uprN linux-2.6.32/arch/x86/include/asm/uaccess.h linux-2.6.32.ovz/arch/x86/include/asm/uaccess.h --- linux-2.6.32/arch/x86/include/asm/uaccess.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/uaccess.h 2015-03-10 13:23:53.000000000 -0700 @@ -556,6 +556,9 @@ struct __large_struct { unsigned long bu #endif /* CONFIG_X86_WP_WORKS_OK */ +extern unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n); + /* * movsl can be slow when source and dest are not both 8-byte aligned */ diff -uprN linux-2.6.32/arch/x86/include/asm/unistd_32.h linux-2.6.32.ovz/arch/x86/include/asm/unistd_32.h --- linux-2.6.32/arch/x86/include/asm/unistd_32.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/unistd_32.h 2015-06-23 04:16:16.000000000 -0700 @@ -342,10 +342,36 @@ #define __NR_pwritev 334 #define __NR_rt_tgsigqueueinfo 335 #define __NR_perf_event_open 336 +#define __NR_recvmmsg 337 +/* #define __NR_fanotify_init 338 */ +/* #define __NR_fanotify_mark 339 */ +/* #define __NR_prlimit64 340 */ +#define __NR_name_to_handle_at 341 +#define __NR_open_by_handle_at 342 +#define __NR_clock_adjtime 343 +#define __NR_syncfs 344 +#define __NR_sendmmsg 345 +#define __NR_setns 346 +#define __NR_process_vm_readv 347 +#define __NR_process_vm_writev 348 +#define __NR_fairsched_mknod 500 /* FairScheduler syscalls */ +#define __NR_fairsched_rmnod 501 +#define __NR_fairsched_chwt 502 +#define __NR_fairsched_mvpr 503 +#define __NR_fairsched_rate 504 +#define __NR_fairsched_vcpus 505 +#define __NR_fairsched_cpumask 506 +#define __NR_fairsched_nodemask 507 +#define __NR_getluid 510 +#define __NR_setluid 511 +#define __NR_setublimit 512 +#define __NR_ubstat 513 +#define __NR_lchmod 516 +#define __NR_lutime 517 #ifdef __KERNEL__ -#define NR_syscalls 337 +#define NR_syscalls 518 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff -uprN linux-2.6.32/arch/x86/include/asm/unistd_64.h linux-2.6.32.ovz/arch/x86/include/asm/unistd_64.h --- linux-2.6.32/arch/x86/include/asm/unistd_64.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/unistd_64.h 2015-06-23 04:16:16.000000000 -0700 @@ -661,6 +661,61 @@ __SYSCALL(__NR_pwritev, sys_pwritev) __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) #define __NR_perf_event_open 298 __SYSCALL(__NR_perf_event_open, sys_perf_event_open) +#define __NR_recvmmsg 299 +__SYSCALL(__NR_recvmmsg, sys_recvmmsg) +#define __NR_fanotify_init 300 +__SYSCALL(__NR_fanotify_init, sys_ni_syscall) +#define __NR_fanotify_mark 301 +__SYSCALL(__NR_fanotify_mark, sys_ni_syscall) +#define __NR_prlimit64 302 +__SYSCALL(__NR_prlimit64, sys_ni_syscall) +#define __NR_name_to_handle_at 303 +__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at) +#define __NR_open_by_handle_at 304 +__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) +#define __NR_clock_adjtime 305 +__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime) +#define __NR_syncfs 306 +__SYSCALL(__NR_syncfs, sys_syncfs) +#define __NR_sendmmsg 307 +__SYSCALL(__NR_sendmmsg, sys_sendmmsg) +#define __NR_setns 308 +__SYSCALL(__NR_setns, sys_setns) +#define __NR_get_cpu 309 +__SYSCALL(__NR_get_cpu, sys_ni_syscall) +#define __NR_process_vm_readv 310 +__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv) +#define __NR_process_vm_writev 311 +__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev) +#define __NR_fairsched_nodemask 497 +__SYSCALL(__NR_fairsched_nodemask, sys_fairsched_nodemask) +#define __NR_fairsched_cpumask 498 +__SYSCALL(__NR_fairsched_cpumask, sys_fairsched_cpumask) +#define __NR_fairsched_vcpus 499 +__SYSCALL(__NR_fairsched_vcpus, sys_fairsched_vcpus) +#define __NR_getluid 500 +__SYSCALL(__NR_getluid, sys_getluid) +#define __NR_setluid 501 +__SYSCALL(__NR_setluid, sys_setluid) +#define __NR_setublimit 502 +__SYSCALL(__NR_setublimit, sys_setublimit) +#define __NR_ubstat 503 +__SYSCALL(__NR_ubstat, sys_ubstat) +#define __NR_fairsched_mknod 504 /* FairScheduler syscalls */ +__SYSCALL(__NR_fairsched_mknod, sys_fairsched_mknod) +#define __NR_fairsched_rmnod 505 +__SYSCALL(__NR_fairsched_rmnod, sys_fairsched_rmnod) +#define __NR_fairsched_chwt 506 +__SYSCALL(__NR_fairsched_chwt, sys_fairsched_chwt) +#define __NR_fairsched_mvpr 507 +__SYSCALL(__NR_fairsched_mvpr, sys_fairsched_mvpr) +#define __NR_fairsched_rate 508 +__SYSCALL(__NR_fairsched_rate, sys_fairsched_rate) +#define __NR_lchmod 509 +__SYSCALL(__NR_lchmod, sys_lchmod) +#define __NR_lutime 510 +__SYSCALL(__NR_lutime, sys_lutime) + #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR @@ -685,6 +740,7 @@ __SYSCALL(__NR_perf_event_open, sys_perf #define __ARCH_WANT_SYS_RT_SIGSUSPEND #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_COMPAT_SYS_TIME +#define __ARCH_WANT_SYS_RT_SIGSUSPEND #endif /* __NO_STUBS */ #ifdef __KERNEL__ diff -uprN linux-2.6.32/arch/x86/include/asm/user.h linux-2.6.32.ovz/arch/x86/include/asm/user.h --- linux-2.6.32/arch/x86/include/asm/user.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/user.h 2015-03-10 13:21:24.000000000 -0700 @@ -1,5 +1,63 @@ +#ifndef _ASM_X86_USER_H +#define _ASM_X86_USER_H + #ifdef CONFIG_X86_32 # include "user_32.h" #else # include "user_64.h" #endif + +#include + +struct user_ymmh_regs { + /* 16 * 16 bytes for each YMMH-reg */ + __u32 ymmh_space[64]; +}; + +struct user_xsave_hdr { + __u64 xstate_bv; + __u64 reserved1[2]; + __u64 reserved2[5]; +}; + +/* + * The structure layout of user_xstateregs, used for exporting the + * extended register state through ptrace and core-dump (NT_X86_XSTATE note) + * interfaces will be same as the memory layout of xsave used by the processor + * (except for the bytes 464..511, which can be used by the software) and hence + * the size of this structure varies depending on the features supported by the + * processor and OS. The size of the structure that users need to use can be + * obtained by doing: + * cpuid_count(0xd, 0, &eax, &ptrace_xstateregs_struct_size, &ecx, &edx); + * i.e., cpuid.(eax=0xd,ecx=0).ebx will be the size that user (debuggers, etc.) + * need to use. + * + * For now, only the first 8 bytes of the software usable bytes[464..471] will + * be used and will be set to OS enabled xstate mask (which is same as the + * 64bit mask returned by the xgetbv's xCR0). Users (analyzing core dump + * remotely, etc.) can use this mask as well as the mask saved in the + * xstate_hdr bytes and interpret what states the processor/OS supports + * and what states are in modified/initialized conditions for the + * particular process/thread. + * + * Also when the user modifies certain state FP/SSE/etc through the + * ptrace interface, they must ensure that the xsave_hdr.xstate_bv + * bytes[512..519] of the memory layout are updated correspondingly. + * i.e., for example when FP state is modified to a non-init state, + * xsave_hdr.xstate_bv's bit 0 must be set to '1', when SSE is modified to + * non-init state, xsave_hdr.xstate_bv's bit 1 must to be set to '1', etc. + */ +#define USER_XSTATE_FX_SW_WORDS 6 +#define USER_XSTATE_XCR0_WORD 0 + +struct user_xstateregs { + struct { + __u64 fpx_space[58]; + __u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS]; + } i387; + struct user_xsave_hdr xsave_hdr; + struct user_ymmh_regs ymmh; + /* further processor state extensions go here */ +}; + +#endif /* _ASM_X86_USER_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/uv/bios.h linux-2.6.32.ovz/arch/x86/include/asm/uv/bios.h --- linux-2.6.32/arch/x86/include/asm/uv/bios.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/uv/bios.h 2015-03-10 13:21:37.000000000 -0700 @@ -18,8 +18,8 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson + * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson */ #include @@ -36,7 +36,8 @@ enum uv_bios_cmd { UV_BIOS_WATCHLIST_ALLOC, UV_BIOS_WATCHLIST_FREE, UV_BIOS_MEMPROTECT, - UV_BIOS_GET_PARTITION_ADDR + UV_BIOS_GET_PARTITION_ADDR, + UV_BIOS_SET_LEGACY_VGA_TARGET }; /* @@ -76,15 +77,6 @@ union partition_info_u { }; }; -union uv_watchlist_u { - u64 val; - struct { - u64 blade : 16, - size : 32, - filler : 16; - }; -}; - enum uv_memprotect { UV_MEMPROT_RESTRICT_ACCESS, UV_MEMPROT_ALLOW_AMO, @@ -98,13 +90,14 @@ extern s64 uv_bios_call(enum uv_bios_cmd extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); -extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); +extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *); extern s64 uv_bios_freq_base(u64, u64 *); -extern int uv_bios_mq_watchlist_alloc(int, unsigned long, unsigned int, +extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int, unsigned long *); extern int uv_bios_mq_watchlist_free(int, int); extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); +extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus); extern void uv_bios_init(void); @@ -113,6 +106,7 @@ extern int uv_type; extern long sn_partition_id; extern long sn_coherency_id; extern long sn_region_size; +extern long system_serial_number; #define partition_coherence_id() (sn_coherency_id) extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ diff -uprN linux-2.6.32/arch/x86/include/asm/uv/uv_bau.h linux-2.6.32.ovz/arch/x86/include/asm/uv/uv_bau.h --- linux-2.6.32/arch/x86/include/asm/uv/uv_bau.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/uv/uv_bau.h 2015-03-10 13:24:02.000000000 -0700 @@ -5,7 +5,7 @@ * * SGI UV Broadcast Assist Unit definitions * - * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2008-2011 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_BAU_H @@ -26,55 +26,162 @@ * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512, * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on. * - * We will use 31 sets, one for sending BAU messages from each of the 32 - * cpu's on the node. + * We will use one set for sending BAU messages from each of the + * cpu's on the uvhub. * * TLB shootdown will use the first of the 8 descriptors of each set. * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set). */ -#define UV_ITEMS_PER_DESCRIPTOR 8 -#define UV_CPUS_PER_ACT_STATUS 32 +#define MAX_CPUS_PER_UVHUB 64 +#define MAX_CPUS_PER_SOCKET 32 +#define ADP_SZ 64 /* hardware-provided max. */ +#define UV_CPUS_PER_AS 32 /* hardware-provided max. */ +#define ITEMS_PER_DESC 8 +/* the 'throttle' to prevent the hardware stay-busy bug */ +#define MAX_BAU_CONCURRENT 3 #define UV_ACT_STATUS_MASK 0x3 #define UV_ACT_STATUS_SIZE 2 -#define UV_ADP_SIZE 32 #define UV_DISTRIBUTION_SIZE 256 #define UV_SW_ACK_NPENDING 8 -#define UV_NET_ENDPOINT_INTD 0x38 -#define UV_DESC_BASE_PNODE_SHIFT 49 +#define UV1_NET_ENDPOINT_INTD 0x38 +#define UV2_NET_ENDPOINT_INTD 0x28 +#define UV_NET_ENDPOINT_INTD (is_uv1_hub() ? \ + UV1_NET_ENDPOINT_INTD : UV2_NET_ENDPOINT_INTD) +#define UV_DESC_PSHIFT 49 #define UV_PAYLOADQ_PNODE_SHIFT 49 #define UV_PTC_BASENAME "sgi_uv/ptc_statistics" +#define UV_BAU_BASENAME "sgi_uv/bau_tunables" +#define UV_BAU_TUNABLES_DIR "sgi_uv" +#define UV_BAU_TUNABLES_FILE "bau_tunables" +#define WHITESPACE " \t\n" +#define uv_mmask ((1UL << uv_hub_info->m_val) - 1) #define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) +#define cpubit_isset(cpu, bau_local_cpumask) \ + test_bit((cpu), (bau_local_cpumask).bits) +/* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */ /* - * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 - */ -#define DESC_STATUS_IDLE 0 -#define DESC_STATUS_ACTIVE 1 -#define DESC_STATUS_DESTINATION_TIMEOUT 2 -#define DESC_STATUS_SOURCE_TIMEOUT 3 + * UV2: Bit 19 selects between + * (0): 10 microsecond timebase and + * (1): 80 microseconds + * we're using 560us, similar to UV1: 65 units of 10us + */ +#define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL) +#define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL) + +#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (is_uv1_hub() ? \ + UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \ + UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD) + +#define BAU_MISC_CONTROL_MULT_MASK 3 + +#define UVH_AGING_PRESCALE_SEL 0x000000b000UL +/* [30:28] URGENCY_7 an index into a table of times */ +#define BAU_URGENCY_7_SHIFT 28 +#define BAU_URGENCY_7_MASK 7 + +#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL +/* [45:40] BAU - BAU transaction timeout select - a multiplier */ +#define BAU_TRANS_SHIFT 40 +#define BAU_TRANS_MASK 0x3f + +/* + * shorten some awkward names + */ +#define AS_PUSH_SHIFT UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT +#define SOFTACK_MSHIFT UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT +#define SOFTACK_PSHIFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT +#define SOFTACK_TIMEOUT_PERIOD UV_INTD_SOFT_ACK_TIMEOUT_PERIOD +#define write_gmmr uv_write_global_mmr64 +#define write_lmmr uv_write_local_mmr +#define read_lmmr uv_read_local_mmr +#define read_gmmr uv_read_global_mmr64 /* - * source side threshholds at which message retries print a warning + * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 */ -#define SOURCE_TIMEOUT_LIMIT 20 -#define DESTINATION_TIMEOUT_LIMIT 20 +#define DS_IDLE 0 +#define DS_ACTIVE 1 +#define DS_DESTINATION_TIMEOUT 2 +#define DS_SOURCE_TIMEOUT 3 +/* + * bits put together from HRP_LB_BAU_SB_ACTIVATION_STATUS_0/1/2 + * values 1 and 3 will not occur + * Decoded meaning ERROR BUSY AUX ERR + * ------------------------------- ---- ----- ------- + * IDLE 0 0 0 + * BUSY (active) 0 1 0 + * SW Ack Timeout (destination) 1 0 0 + * SW Ack INTD rejected (strong NACK) 1 0 1 + * Source Side Time Out Detected 1 1 0 + * Destination Side PUT Failed 1 1 1 + */ +#define UV2H_DESC_IDLE 0 +#define UV2H_DESC_BUSY 2 +#define UV2H_DESC_DEST_TIMEOUT 4 +#define UV2H_DESC_DEST_STRONG_NACK 5 +#define UV2H_DESC_SOURCE_TIMEOUT 6 +#define UV2H_DESC_DEST_PUT_ERR 7 + +/* + * delay for 'plugged' timeout retries, in microseconds + */ +#define PLUGGED_DELAY 10 + +/* + * threshholds at which to use IPI to free resources + */ +/* after this # consecutive 'plugged' timeouts, use IPI to release resources */ +#define PLUGSB4RESET 100 +/* after this many consecutive timeouts, use IPI to release resources */ +#define TIMEOUTSB4RESET 1 +/* at this number uses of IPI to release resources, giveup the request */ +#define IPI_RESET_LIMIT 1 +/* after this # consecutive successes, bump up the throttle if it was lowered */ +#define COMPLETE_THRESHOLD 5 + +#define UV_LB_SUBNODEID 0x10 + +/* these two are the same for UV1 and UV2: */ +#define UV_SA_SHFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT +#define UV_SA_MASK UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK +/* 4 bits of software ack period */ +#define UV2_ACK_MASK 0x7UL +#define UV2_ACK_UNITS_SHFT 3 +#define UV2_LEG_SHFT UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT +#define UV2_EXT_SHFT UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT /* * number of entries in the destination side payload queue */ -#define DEST_Q_SIZE 17 +#define DEST_Q_SIZE 20 /* * number of destination side software ack resources */ #define DEST_NUM_RESOURCES 8 -#define MAX_CPUS_PER_NODE 32 /* * completion statuses for sending a TLB flush message */ -#define FLUSH_RETRY 1 -#define FLUSH_GIVEUP 2 -#define FLUSH_COMPLETE 3 +#define FLUSH_RETRY_PLUGGED 1 +#define FLUSH_RETRY_TIMEOUT 2 +#define FLUSH_GIVEUP 3 +#define FLUSH_COMPLETE 4 +#define FLUSH_RETRY_BUSYBUG 5 + +/* + * tuning the action when the numalink network is extremely delayed + */ +#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in + microseconds */ +#define CONGESTED_REPS 10 /* long delays averaged over + this many broadcasts */ +#define CONGESTED_PERIOD 30 /* time for the bau to be + disabled, in seconds */ +/* see msg_type: */ +#define MSG_NOOP 0 +#define MSG_REGULAR 1 +#define MSG_RETRY 2 /* * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) @@ -83,20 +190,20 @@ * The distribution specification (32 bytes) is interpreted as a 256-bit * distribution vector. Adjacent bits correspond to consecutive even numbered * nodeIDs. The result of adding the index of a given bit to the 15-bit - * 'base_dest_nodeid' field of the header corresponds to the + * 'base_dest_nasid' field of the header corresponds to the * destination nodeID associated with that specified bit. */ -struct bau_target_nodemask { - unsigned long bits[BITS_TO_LONGS(256)]; +struct pnmask { + unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)]; }; /* - * mask of cpu's on a node + * mask of cpu's on a uvhub * (during initialization we need to check that unsigned long has - * enough bits for max. cpu's per node) + * enough bits for max. cpu's per uvhub) */ struct bau_local_cpumask { - unsigned long bits; + unsigned long bits; }; /* @@ -117,94 +224,182 @@ struct bau_local_cpumask { * The payload is software-defined for INTD transactions */ struct bau_msg_payload { - unsigned long address; /* signifies a page or all TLB's - of the cpu */ + unsigned long address; /* signifies a page or all + TLB's of the cpu */ /* 64 bits */ - unsigned short sending_cpu; /* filled in by sender */ + unsigned short sending_cpu; /* filled in by sender */ /* 16 bits */ - unsigned short acknowledge_count;/* filled in by destination */ + unsigned short acknowledge_count; /* filled in by destination */ /* 16 bits */ - unsigned int reserved1:32; /* not usable */ + unsigned int reserved1:32; /* not usable */ }; /* - * Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) + * UV1 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) * see table 4.2.3.0.1 in broacast_assist spec. */ -struct bau_msg_header { - unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ +struct uv1_bau_msg_header { + unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ /* bits 5:0 */ - unsigned int base_dest_nodeid:15; /* nasid>>1 (pnode) of */ - /* bits 20:6 */ /* first bit in node_map */ - unsigned int command:8; /* message type */ + unsigned int base_dest_nasid:15; /* nasid of the first bit */ + /* bits 20:6 */ /* in uvhub map */ + unsigned int command:8; /* message type */ /* bits 28:21 */ - /* 0x38: SN3net EndPoint Message */ - unsigned int rsvd_1:3; /* must be zero */ + /* 0x38: SN3net EndPoint Message */ + unsigned int rsvd_1:3; /* must be zero */ /* bits 31:29 */ - /* int will align on 32 bits */ - unsigned int rsvd_2:9; /* must be zero */ + /* int will align on 32 bits */ + unsigned int rsvd_2:9; /* must be zero */ /* bits 40:32 */ - /* Suppl_A is 56-41 */ - unsigned int payload_2a:8;/* becomes byte 16 of msg */ - /* bits 48:41 */ /* not currently using */ - unsigned int payload_2b:8;/* becomes byte 17 of msg */ - /* bits 56:49 */ /* not currently using */ - /* Address field (96:57) is never used as an - address (these are address bits 42:3) */ - unsigned int rsvd_3:1; /* must be zero */ + /* Suppl_A is 56-41 */ + unsigned int sequence:16; /* message sequence number */ + /* bits 56:41 */ /* becomes bytes 16-17 of msg */ + /* Address field (96:57) is + never used as an address + (these are address bits + 42:3) */ + + unsigned int rsvd_3:1; /* must be zero */ /* bit 57 */ - /* address bits 27:4 are payload */ - /* these 24 bits become bytes 12-14 of msg */ - unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */ + /* address bits 27:4 are payload */ + /* these next 24 (58-81) bits become bytes 12-14 of msg */ + /* bits 65:58 land in byte 12 */ + unsigned int replied_to:1; /* sent as 0 by the source to + byte 12 */ /* bit 58 */ - - unsigned int payload_1a:5;/* not currently used */ - /* bits 63:59 */ - unsigned int payload_1b:8;/* not currently used */ - /* bits 71:64 */ - unsigned int payload_1c:8;/* not currently used */ - /* bits 79:72 */ - unsigned int payload_1d:2;/* not currently used */ + unsigned int msg_type:3; /* software type of the + message */ + /* bits 61:59 */ + unsigned int canceled:1; /* message canceled, resource + is to be freed*/ + /* bit 62 */ + unsigned int payload_1a:1; /* not currently used */ + /* bit 63 */ + unsigned int payload_1b:2; /* not currently used */ + /* bits 65:64 */ + + /* bits 73:66 land in byte 13 */ + unsigned int payload_1ca:6; /* not currently used */ + /* bits 71:66 */ + unsigned int payload_1c:2; /* not currently used */ + /* bits 73:72 */ + + /* bits 81:74 land in byte 14 */ + unsigned int payload_1d:6; /* not currently used */ + /* bits 79:74 */ + unsigned int payload_1e:2; /* not currently used */ /* bits 81:80 */ - unsigned int rsvd_4:7; /* must be zero */ + unsigned int rsvd_4:7; /* must be zero */ /* bits 88:82 */ - unsigned int sw_ack_flag:1;/* software acknowledge flag */ + unsigned int swack_flag:1; /* software acknowledge flag */ /* bit 89 */ - /* INTD trasactions at destination are to - wait for software acknowledge */ - unsigned int rsvd_5:6; /* must be zero */ + /* INTD trasactions at + destination are to wait for + software acknowledge */ + unsigned int rsvd_5:6; /* must be zero */ /* bits 95:90 */ - unsigned int rsvd_6:5; /* must be zero */ + unsigned int rsvd_6:5; /* must be zero */ /* bits 100:96 */ - unsigned int int_both:1;/* if 1, interrupt both sockets on the blade */ + unsigned int int_both:1; /* if 1, interrupt both sockets + on the uvhub */ /* bit 101*/ - unsigned int fairness:3;/* usually zero */ + unsigned int fairness:3; /* usually zero */ /* bits 104:102 */ - unsigned int multilevel:1; /* multi-level multicast format */ + unsigned int multilevel:1; /* multi-level multicast + format */ /* bit 105 */ - /* 0 for TLB: endpoint multi-unicast messages */ - unsigned int chaining:1;/* next descriptor is part of this activation*/ + /* 0 for TLB: endpoint multi-unicast messages */ + unsigned int chaining:1; /* next descriptor is part of + this activation*/ /* bit 106 */ - unsigned int rsvd_7:21; /* must be zero */ + unsigned int rsvd_7:21; /* must be zero */ /* bits 127:107 */ }; /* + * UV2 Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor) + * see figure 9-2 of harp_sys.pdf + */ +struct uv2_bau_msg_header { + unsigned int base_dest_nasid:15; /* nasid of the first bit */ + /* bits 14:0 */ /* in uvhub map */ + unsigned int dest_subnodeid:5; /* must be 0x10, for the LB */ + /* bits 19:15 */ + unsigned int rsvd_1:1; /* must be zero */ + /* bit 20 */ + /* Address bits 59:21 */ + /* bits 25:2 of address (44:21) are payload */ + /* these next 24 bits become bytes 12-14 of msg */ + /* bits 28:21 land in byte 12 */ + unsigned int replied_to:1; /* sent as 0 by the source to + byte 12 */ + /* bit 21 */ + unsigned int msg_type:3; /* software type of the + message */ + /* bits 24:22 */ + unsigned int canceled:1; /* message canceled, resource + is to be freed*/ + /* bit 25 */ + unsigned int payload_1:3; /* not currently used */ + /* bits 28:26 */ + + /* bits 36:29 land in byte 13 */ + unsigned int payload_2a:3; /* not currently used */ + unsigned int payload_2b:5; /* not currently used */ + /* bits 36:29 */ + + /* bits 44:37 land in byte 14 */ + unsigned int payload_3:8; /* not currently used */ + /* bits 44:37 */ + + unsigned int rsvd_2:7; /* reserved */ + /* bits 51:45 */ + unsigned int swack_flag:1; /* software acknowledge flag */ + /* bit 52 */ + unsigned int rsvd_3a:3; /* must be zero */ + unsigned int rsvd_3b:8; /* must be zero */ + unsigned int rsvd_3c:8; /* must be zero */ + unsigned int rsvd_3d:3; /* must be zero */ + /* bits 74:53 */ + unsigned int fairness:3; /* usually zero */ + /* bits 77:75 */ + + unsigned int sequence:16; /* message sequence number */ + /* bits 93:78 Suppl_A */ + unsigned int chaining:1; /* next descriptor is part of + this activation*/ + /* bit 94 */ + unsigned int multilevel:1; /* multi-level multicast + format */ + /* bit 95 */ + unsigned int rsvd_4:24; /* ordered / source node / + source subnode / aging + must be zero */ + /* bits 119:96 */ + unsigned int command:8; /* message type */ + /* bits 127:120 */ +}; + +/* * The activation descriptor: * The format of the message to send, plus all accompanying control * Should be 64 bytes */ struct bau_desc { - struct bau_target_nodemask distribution; + struct pnmask distribution; /* * message template, consisting of header and payload: */ - struct bau_msg_header header; - struct bau_msg_payload payload; + union bau_msg_header { + struct uv1_bau_msg_header uv1_hdr; + struct uv2_bau_msg_header uv2_hdr; + } header; + + struct bau_msg_payload payload; }; -/* +/* UV1: * -payload-- ---------header------ * bytes 0-11 bits 41-56 bits 58-81 * A B (2) C (3) @@ -214,6 +409,16 @@ struct bau_desc { * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector) * ------------payload queue----------- */ +/* UV2: + * -payload-- ---------header------ + * bytes 0-11 bits 70-78 bits 21-44 + * A B (2) C (3) + * + * A/B/C are moved to: + * A C B + * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector) + * ------------payload queue----------- + */ /* * The payload queue on the destination side is an array of these. @@ -221,111 +426,358 @@ struct bau_desc { * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17 * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120) * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from - * sw_ack_vector and payload_2) + * swack_vec and payload_2) * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software * Acknowledge Processing) also selects 32 byte (17 bytes usable) payload * operation." */ -struct bau_payload_queue_entry { - unsigned long address; /* signifies a page or all TLB's - of the cpu */ +struct bau_pq_entry { + unsigned long address; /* signifies a page or all TLB's + of the cpu */ /* 64 bits, bytes 0-7 */ - - unsigned short sending_cpu; /* cpu that sent the message */ + unsigned short sending_cpu; /* cpu that sent the message */ /* 16 bits, bytes 8-9 */ - - unsigned short acknowledge_count; /* filled in by destination */ + unsigned short acknowledge_count; /* filled in by destination */ /* 16 bits, bytes 10-11 */ - - unsigned short replied_to:1; /* sent as 0 by the source */ - /* 1 bit */ - unsigned short unused1:7; /* not currently using */ - /* 7 bits: byte 12) */ - - unsigned char unused2[2]; /* not currently using */ - /* bytes 13-14 */ - - unsigned char sw_ack_vector; /* filled in by the hardware */ + /* these next 3 bytes come from bits 58-81 of the message header */ + unsigned short replied_to:1; /* sent as 0 by the source */ + unsigned short msg_type:3; /* software message type */ + unsigned short canceled:1; /* sent as 0 by the source */ + unsigned short unused1:3; /* not currently using */ + /* byte 12 */ + unsigned char unused2a; /* not currently using */ + /* byte 13 */ + unsigned char unused2; /* not currently using */ + /* byte 14 */ + unsigned char swack_vec; /* filled in by the hardware */ /* byte 15 (bits 127:120) */ - - unsigned char unused4[3]; /* not currently using bytes 17-19 */ - /* bytes 17-19 */ - - int number_of_cpus; /* filled in at destination */ + unsigned short sequence; /* message sequence number */ + /* bytes 16-17 */ + unsigned char unused4[2]; /* not currently using bytes 18-19 */ + /* bytes 18-19 */ + int number_of_cpus; /* filled in at destination */ /* 32 bits, bytes 20-23 (aligned) */ - - unsigned char unused5[8]; /* not using */ + unsigned char unused5[8]; /* not using */ /* bytes 24-31 */ }; -/* - * one for every slot in the destination payload queue - */ -struct bau_msg_status { - struct bau_local_cpumask seen_by; /* map of cpu's */ +struct msg_desc { + struct bau_pq_entry *msg; + int msg_slot; + struct bau_pq_entry *queue_first; + struct bau_pq_entry *queue_last; +}; + +struct reset_args { + int sender; }; /* - * one for every slot in the destination software ack resources + * This structure is allocated per_cpu for UV TLB shootdown statistics. */ -struct bau_sw_ack_status { - struct bau_payload_queue_entry *msg; /* associated message */ - int watcher; /* cpu monitoring, or -1 */ +struct ptc_stats { + /* sender statistics */ + unsigned long s_giveup; /* number of fall backs to + IPI-style flushes */ + unsigned long s_requestor; /* number of shootdown + requests */ + unsigned long s_stimeout; /* source side timeouts */ + unsigned long s_dtimeout; /* destination side timeouts */ + unsigned long s_strongnacks; /* number of strong nack's */ + unsigned long s_time; /* time spent in sending side */ + unsigned long s_retriesok; /* successful retries */ + unsigned long s_ntargcpu; /* total number of cpu's + targeted */ + unsigned long s_ntargself; /* times the sending cpu was + targeted */ + unsigned long s_ntarglocals; /* targets of cpus on the local + blade */ + unsigned long s_ntargremotes; /* targets of cpus on remote + blades */ + unsigned long s_ntarglocaluvhub; /* targets of the local hub */ + unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */ + unsigned long s_ntarguvhub; /* total number of uvhubs + targeted */ + unsigned long s_ntarguvhub16; /* number of times target + hubs >= 16*/ + unsigned long s_ntarguvhub8; /* number of times target + hubs >= 8 */ + unsigned long s_ntarguvhub4; /* number of times target + hubs >= 4 */ + unsigned long s_ntarguvhub2; /* number of times target + hubs >= 2 */ + unsigned long s_ntarguvhub1; /* number of times target + hubs == 1 */ + unsigned long s_resets_plug; /* ipi-style resets from plug + state */ + unsigned long s_resets_timeout; /* ipi-style resets from + timeouts */ + unsigned long s_busy; /* status stayed busy past + s/w timer */ + unsigned long s_throttles; /* waits in throttle */ + unsigned long s_retry_messages; /* retry broadcasts */ + unsigned long s_bau_reenabled; /* for bau enable/disable */ + unsigned long s_bau_disabled; /* for bau enable/disable */ + unsigned long s_uv2_wars; /* uv2 workaround, perm. busy */ + unsigned long s_uv2_wars_hw; /* uv2 workaround, hiwater */ + unsigned long s_uv2_war_waits; /* uv2 workaround, long waits */ + /* destination statistics */ + unsigned long d_alltlb; /* times all tlb's on this + cpu were flushed */ + unsigned long d_onetlb; /* times just one tlb on this + cpu was flushed */ + unsigned long d_multmsg; /* interrupts with multiple + messages */ + unsigned long d_nomsg; /* interrupts with no message */ + unsigned long d_time; /* time spent on destination + side */ + unsigned long d_requestee; /* number of messages + processed */ + unsigned long d_retries; /* number of retry messages + processed */ + unsigned long d_canceled; /* number of messages canceled + by retries */ + unsigned long d_nocanceled; /* retries that found nothing + to cancel */ + unsigned long d_resets; /* number of ipi-style requests + processed */ + unsigned long d_rcanceled; /* number of messages canceled + by resets */ +}; + +struct tunables { + int *tunp; + int deflt; +}; + +struct hub_and_pnode { + short uvhub; + short pnode; +}; + +struct socket_desc { + short num_cpus; + short cpu_number[MAX_CPUS_PER_SOCKET]; +}; + +struct uvhub_desc { + unsigned short socket_mask; + short num_cpus; + short uvhub; + short pnode; + struct socket_desc socket[2]; }; /* - * one on every node and per-cpu; to locate the software tables + * one per-cpu; to locate the software tables */ struct bau_control { - struct bau_desc *descriptor_base; - struct bau_payload_queue_entry *bau_msg_head; - struct bau_payload_queue_entry *va_queue_first; - struct bau_payload_queue_entry *va_queue_last; - struct bau_msg_status *msg_statuses; - int *watching; /* pointer to array */ + struct bau_desc *descriptor_base; + struct bau_pq_entry *queue_first; + struct bau_pq_entry *queue_last; + struct bau_pq_entry *bau_msg_head; + struct bau_control *uvhub_master; + struct bau_control *socket_master; + struct ptc_stats *statp; + cpumask_t *cpumask; + unsigned long timeout_interval; + unsigned long set_bau_on_time; + atomic_t active_descriptor_count; + int plugged_tries; + int timeout_tries; + int ipi_attempts; + int conseccompletes; + int baudisabled; + int set_bau_off; + short cpu; + short osnode; + short uvhub_cpu; + short uvhub; + short uvhub_version; + short cpus_in_socket; + short cpus_in_uvhub; + short partition_base_pnode; + short using_desc; /* an index, like uvhub_cpu */ + unsigned int inuse_map; + unsigned short message_number; + unsigned short uvhub_quiesce; + short socket_acknowledge_count[DEST_Q_SIZE]; + cycles_t send_message; + spinlock_t uvhub_lock; + spinlock_t queue_lock; + /* tunables */ + int max_concurr; + int max_concurr_const; + int plugged_delay; + int plugsb4reset; + int timeoutsb4reset; + int ipi_reset_limit; + int complete_threshold; + int cong_response_us; + int cong_reps; + int cong_period; + unsigned long clocks_per_100_usec; + cycles_t period_time; + long period_requests; + struct hub_and_pnode *thp; }; -/* - * This structure is allocated per_cpu for UV TLB shootdown statistics. - */ -struct ptc_stats { - unsigned long ptc_i; /* number of IPI-style flushes */ - unsigned long requestor; /* number of nodes this cpu sent to */ - unsigned long requestee; /* times cpu was remotely requested */ - unsigned long alltlb; /* times all tlb's on this cpu were flushed */ - unsigned long onetlb; /* times just one tlb on this cpu was flushed */ - unsigned long s_retry; /* retries on source side timeouts */ - unsigned long d_retry; /* retries on destination side timeouts */ - unsigned long sflush; /* cycles spent in uv_flush_tlb_others */ - unsigned long dflush; /* cycles spent on destination side */ - unsigned long retriesok; /* successes on retries */ - unsigned long nomsg; /* interrupts with no message */ - unsigned long multmsg; /* interrupts with multiple messages */ - unsigned long ntargeted;/* nodes targeted */ -}; - -static inline int bau_node_isset(int node, struct bau_target_nodemask *dstp) +static inline unsigned long read_mmr_uv2_status(void) +{ + return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2); +} + +static inline void write_mmr_data_broadcast(int pnode, unsigned long mmr_image) +{ + write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image); +} + +static inline void write_mmr_descriptor_base(int pnode, unsigned long mmr_image) { - return constant_test_bit(node, &dstp->bits[0]); + write_gmmr(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, mmr_image); } -static inline void bau_node_set(int node, struct bau_target_nodemask *dstp) + +static inline void write_mmr_activation(unsigned long index) { - __set_bit(node, &dstp->bits[0]); + write_lmmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); } -static inline void bau_nodes_clear(struct bau_target_nodemask *dstp, int nbits) + +static inline void write_gmmr_activation(int pnode, unsigned long mmr_image) +{ + write_gmmr(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, mmr_image); +} + +static inline void write_mmr_payload_first(int pnode, unsigned long mmr_image) +{ + write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, mmr_image); +} + +static inline void write_mmr_payload_tail(int pnode, unsigned long mmr_image) +{ + write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, mmr_image); +} + +static inline void write_mmr_payload_last(int pnode, unsigned long mmr_image) +{ + write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, mmr_image); +} + +static inline void write_mmr_misc_control(int pnode, unsigned long mmr_image) +{ + write_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); +} + +static inline unsigned long read_mmr_misc_control(int pnode) +{ + return read_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL); +} + +static inline void write_mmr_sw_ack(unsigned long mr) +{ + uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); +} + +static inline void write_gmmr_sw_ack(int pnode, unsigned long mr) +{ + write_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr); +} + +static inline unsigned long read_mmr_sw_ack(void) +{ + return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); +} + +static inline unsigned long read_gmmr_sw_ack(int pnode) +{ + return read_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); +} + +static inline void write_mmr_data_config(int pnode, unsigned long mr) +{ + uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, mr); +} + +static inline int bau_uvhub_isset(int uvhub, struct pnmask *dstp) +{ + return constant_test_bit(uvhub, &dstp->bits[0]); +} +static inline void bau_uvhub_set(int pnode, struct pnmask *dstp) +{ + __set_bit(pnode, &dstp->bits[0]); +} +static inline void bau_uvhubs_clear(struct pnmask *dstp, + int nbits) { bitmap_zero(&dstp->bits[0], nbits); } +static inline int bau_uvhub_weight(struct pnmask *dstp) +{ + return bitmap_weight((unsigned long *)&dstp->bits[0], + UV_DISTRIBUTION_SIZE); +} static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits) { bitmap_zero(&dstp->bits, nbits); } -#define cpubit_isset(cpu, bau_local_cpumask) \ - test_bit((cpu), (bau_local_cpumask).bits) - extern void uv_bau_message_intr1(void); +#ifdef CONFIG_TRACING +#define trace_uv_bau_message_intr1 uv_bau_message_intr1 +#endif extern void uv_bau_timeout_intr1(void); +struct atomic_short { + short counter; +}; + +/* + * atomic_read_short - read a short atomic variable + * @v: pointer of type atomic_short + * + * Atomically reads the value of @v. + */ +static inline int atomic_read_short(const struct atomic_short *v) +{ + return v->counter; +} + +/* + * atom_asr - add and return a short int + * @i: short value to add + * @v: pointer of type atomic_short + * + * Atomically adds @i to @v and returns @i + @v + */ +static inline int atom_asr(short i, struct atomic_short *v) +{ + short __i = i; + asm volatile(LOCK_PREFIX "xaddw %0, %1" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; +} + +/* + * conditionally add 1 to *v, unless *v is >= u + * return 0 if we cannot add 1 to *v because it is >= u + * return 1 if we can add 1 to *v because it is < u + * the add is atomic + * + * This is close to atomic_add_unless(), but this allows the 'u' value + * to be lowered below the current 'v'. atomic_add_unless can only stop + * on equal. + */ +static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) +{ + spin_lock(lock); + if (atomic_read(v) >= u) { + spin_unlock(lock); + return 0; + } + atomic_inc(v); + spin_unlock(lock); + return 1; +} + #endif /* _ASM_X86_UV_UV_BAU_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/uv/uv.h linux-2.6.32.ovz/arch/x86/include/asm/uv/uv.h --- linux-2.6.32/arch/x86/include/asm/uv/uv.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/uv/uv.h 2015-03-10 13:22:15.000000000 -0700 @@ -11,6 +11,7 @@ struct mm_struct; extern enum uv_system_type get_uv_system_type(void); extern int is_uv_system(void); extern void uv_cpu_init(void); +extern void uv_nmi_init(void); extern void uv_system_init(void); extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, diff -uprN linux-2.6.32/arch/x86/include/asm/uv/uv_hub.h linux-2.6.32.ovz/arch/x86/include/asm/uv/uv_hub.h --- linux-2.6.32/arch/x86/include/asm/uv/uv_hub.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/uv/uv_hub.h 2015-03-10 13:24:14.000000000 -0700 @@ -5,7 +5,7 @@ * * SGI UV architectural definitions * - * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_HUB_H @@ -31,20 +31,27 @@ * contiguous (although various IO spaces may punch holes in * it).. * - * N - Number of bits in the node portion of a socket physical - * address. + * N - Number of bits in the node portion of a socket physical + * address. * - * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of - * routers always have low bit of 1, C/MBricks have low bit - * equal to 0. Most addressing macros that target UV hub chips - * right shift the NASID by 1 to exclude the always-zero bit. - * NASIDs contain up to 15 bits. + * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of + * routers always have low bit of 1, C/MBricks have low bit + * equal to 0. Most addressing macros that target UV hub chips + * right shift the NASID by 1 to exclude the always-zero bit. + * NASIDs contain up to 15 bits. * * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead * of nasids. * - * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant - * of the nasid for socket usage. + * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant + * of the nasid for socket usage. + * + * GPA - (global physical address) a socket physical address converted + * so that it can be used by the GRU as a global address. Socket + * physical addresses 1) need additional NASID (node) bits added + * to the high end of the address, and 2) unaliased if the + * partition does not have a physical address 0. In addition, on + * UV2 rev 1, GPAs need the gnode left shifted to bits 39 or 40. * * * NumaLink Global Physical Address Format: @@ -71,13 +78,15 @@ * * * APICID format - * NOTE!!!!!! This is the current format of the APICID. However, code - * should assume that this will change in the future. Use functions - * in this file for all APICID bit manipulations and conversion. - * - * 1111110000000000 - * 5432109876543210 - * pppppppppplc0cch + * NOTE!!!!!! This is the current format of the APICID. However, code + * should assume that this will change in the future. Use functions + * in this file for all APICID bit manipulations and conversion. + * + * 1111110000000000 + * 5432109876543210 + * pppppppppplc0cch Nehalem-EX (12 bits in hdw reg) + * ppppppppplcc0cch Westmere-EX (12 bits in hdw reg) + * pppppppppppcccch SandyBridge (15 bits in hdw reg) * sssssssssss * * p = pnode bits @@ -86,14 +95,15 @@ * h = hyperthread * s = bits that are in the SOCKET_ID CSR * - * Note: Processor only supports 12 bits in the APICID register. The ACPI + * Note: Processor may support fewer bits in the APICID register. The ACPI * tables hold all 16 bits. Software needs to be aware of this. * - * Unless otherwise specified, all references to APICID refer to - * the FULL value contained in ACPI tables, not the subset in the - * processor APICID register. + * Unless otherwise specified, all references to APICID refer to + * the FULL value contained in ACPI tables, not the subset in the + * processor APICID register. */ +#define UV2_HUB_KABI_HACKS 1 /* * Maximum number of bricks in all partitions and in all coherency domains. @@ -137,6 +147,47 @@ struct uv_hub_info_s { unsigned long global_mmr_base; unsigned long gpa_mask; unsigned int gnode_extra; +#ifndef UV2_HUB_KABI_HACKS + /* + * The following breaks the KABI. The fields exist in holes + * in the original structure. They are accessed thru + * macros that use casting hacks. + */ + unsigned char hub_revision; + unsigned char apic_pnode_shift; + unsigned char m_shift; + unsigned char n_lshift; +#endif + unsigned long gnode_upper; + unsigned long lowmem_remap_top; + unsigned long lowmem_remap_base; + unsigned short pnode; + unsigned short pnode_mask; + unsigned short coherency_domain_number; + unsigned short numa_blade_id; + unsigned char blade_processor_id; + unsigned char m_val; + unsigned char n_val; + struct uv_scir_s scir; +}; + +#ifdef UV2_HUB_KABI_HACKS +/* ---------------------- BEGIN UGLY HACK --------------------------*/ +/* Temp hacks to prevent breaking the KABI. Use casting macros + * to access unused space in the original definition of uv_hub_info_s. + */ +#define uv_hub_info_base (void *)(&uv_hub_info->global_mmr_base) +#define uv_cpu_hub_info_base(c) (void *)(&uv_cpu_hub_info(c)->global_mmr_base) + +#define uv_hub_info_n_lshift_offset 22 +#define uv_hub_info_m_shift_offset 23 + +struct uv_hub_info_k { + unsigned long global_mmr_base; + unsigned long gpa_mask; + unsigned int gnode_extra; + unsigned char hub_revision; /* breaks kABI */ + unsigned char apic_pnode_shift; /* breaks kABI */ unsigned long gnode_upper; unsigned long lowmem_remap_top; unsigned long lowmem_remap_base; @@ -150,27 +201,156 @@ struct uv_hub_info_s { struct uv_scir_s scir; }; +#define uv_cpu_hub_info_hub_revision(c) \ + (((struct uv_hub_info_k *)uv_cpu_hub_info(c))->hub_revision) + +#define uv_cpu_hub_info_apic_pnode_shift(c) \ + (((struct uv_hub_info_k *)uv_cpu_hub_info(c))->apic_pnode_shift) + +#define uv_cpu_hub_info_m_shift(c) \ + *((unsigned char *)(uv_cpu_hub_info_base(c) + uv_hub_info_m_shift_offset)) + +#define uv_cpu_hub_info_n_lshift(c) \ + *((unsigned char *)(uv_cpu_hub_info_base(c) + uv_hub_info_n_lshift_offset)) + +#define uv_hub_info_hub_revision \ + (((struct uv_hub_info_k *)uv_hub_info)->hub_revision) + +#define uv_hub_info_apic_pnode_shift \ + (((struct uv_hub_info_k *)uv_hub_info)->apic_pnode_shift) + +#define uv_hub_info_m_shift \ + *((unsigned char *)(uv_hub_info_base + uv_hub_info_m_shift_offset)) + +#define uv_hub_info_n_lshift \ + *((unsigned char *)(uv_hub_info_base + uv_hub_info_n_lshift_offset)) + +#endif + +/* ---------------------- END UGLY HACK --------------------------*/ + DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); -#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) +#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) #define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) /* + * Hub revisions less than UV2_HUB_REVISION_BASE are UV1 hubs. All UV2 + * hubs have revision numbers greater than or equal to UV2_HUB_REVISION_BASE. + * This is a software convention - NOT the hardware revision numbers in + * the hub chip. + */ +#define UV1_HUB_REVISION_BASE 1 +#define UV2_HUB_REVISION_BASE 3 +#define UV3_HUB_REVISION_BASE 5 + +static inline int is_uv1_hub(void) +{ + if (uv_hub_info_hub_revision == 0) + return 1; + return uv_hub_info_hub_revision < UV2_HUB_REVISION_BASE; +} + +static inline int is_uv2_hub(void) +{ + return ((uv_hub_info_hub_revision >= UV2_HUB_REVISION_BASE) && + (uv_hub_info_hub_revision < UV3_HUB_REVISION_BASE)); +} + +static inline int is_uv3_hub(void) +{ + return uv_hub_info_hub_revision >= UV3_HUB_REVISION_BASE; +} + +static inline int is_uv_hub(void) +{ + return uv_hub_info_hub_revision; +} + +/* code common to uv2 and uv3 only */ +static inline int is_uvx_hub(void) +{ + return uv_hub_info_hub_revision >= UV2_HUB_REVISION_BASE; +} + +#define UV_HUB_INFO_EXTRA_FIELDS L1_CACHE_BYTES-2 +struct uv_hub_info_extra_s { + unsigned char apic_pnode_shift; + unsigned char hub_type; + unsigned char future[UV_HUB_INFO_EXTRA_FIELDS]; +}; + +DECLARE_PER_CPU(struct uv_hub_info_extra_s, __uv_hub_info_extra); +#define uv_hub_info_extra (&__get_cpu_var(__uv_hub_info_extra)) +#define uv_cpu_hub_info_extra(cpu) (&per_cpu(__uv_hub_info_extra, cpu)) + +static inline int is_uv2_1_hub(void) +{ + if (uv_hub_info_hub_revision == 0) + return 0; + return uv_hub_info_hub_revision == UV2_HUB_REVISION_BASE; +} + +static inline int is_uv2_2_hub(void) +{ + if (uv_hub_info_hub_revision == 0) + return 0; + return uv_hub_info_hub_revision == UV2_HUB_REVISION_BASE + 1; +} + +union uvh_apicid { + unsigned long v; + struct uvh_apicid_s { + unsigned long local_apic_mask : 24; + unsigned long local_apic_shift : 5; + unsigned long unused1 : 3; + unsigned long pnode_mask : 24; + unsigned long pnode_shift : 5; + unsigned long unused2 : 3; + } s; +}; + +/* * Local & Global MMR space macros. - * Note: macros are intended to be used ONLY by inline functions - * in this file - not by other kernel code. - * n - NASID (full 15-bit global nasid) - * g - GNODE (full 15-bit global nasid, right shifted 1) - * p - PNODE (local part of nsids, right shifted 1) + * Note: macros are intended to be used ONLY by inline functions + * in this file - not by other kernel code. + * n - NASID (full 15-bit global nasid) + * g - GNODE (full 15-bit global nasid, right shifted 1) + * p - PNODE (local part of nsids, right shifted 1) */ #define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) #define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) #define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1) -#define UV_LOCAL_MMR_BASE 0xf4000000UL -#define UV_GLOBAL_MMR32_BASE 0xf8000000UL +#define UV1_LOCAL_MMR_BASE 0xf4000000UL +#define UV1_GLOBAL_MMR32_BASE 0xf8000000UL +#define UV1_LOCAL_MMR_SIZE (64UL * 1024 * 1024) +#define UV1_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024) + +#define UV2_LOCAL_MMR_BASE 0xfa000000UL +#define UV2_GLOBAL_MMR32_BASE 0xfc000000UL +#define UV2_LOCAL_MMR_SIZE (32UL * 1024 * 1024) +#define UV2_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024) + +#define UV3_LOCAL_MMR_BASE 0xfa000000UL +#define UV3_GLOBAL_MMR32_BASE 0xfc000000UL +#define UV3_LOCAL_MMR_SIZE (32UL * 1024 * 1024) +#define UV3_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024) + +#define UV_LOCAL_MMR_BASE (is_uv1_hub() ? UV1_LOCAL_MMR_BASE : \ + (is_uv2_hub() ? UV2_LOCAL_MMR_BASE : \ + UV3_LOCAL_MMR_BASE)) +#define UV_GLOBAL_MMR32_BASE (is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE :\ + (is_uv2_hub() ? UV2_GLOBAL_MMR32_BASE :\ + UV3_GLOBAL_MMR32_BASE)) +#define UV_LOCAL_MMR_SIZE (is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \ + (is_uv2_hub() ? UV2_LOCAL_MMR_SIZE : \ + UV3_LOCAL_MMR_SIZE)) +#define UV_GLOBAL_MMR32_SIZE (is_uv1_hub() ? UV1_GLOBAL_MMR32_SIZE :\ + (is_uv2_hub() ? UV2_GLOBAL_MMR32_SIZE :\ + UV3_GLOBAL_MMR32_SIZE)) #define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base) -#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024) -#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024) + +#define UV_GLOBAL_GRU_MMR_BASE 0x4000000 #define UV_GLOBAL_MMR32_PNODE_SHIFT 15 #define UV_GLOBAL_MMR64_PNODE_SHIFT 26 @@ -180,8 +360,11 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __ #define UV_GLOBAL_MMR64_PNODE_BITS(p) \ (((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) +#define UVH_APICID 0x002D0E00L #define UV_APIC_PNODE_SHIFT 6 +#define UV_APICID_HIBIT_MASK 0xffff0000 + /* Local Bus from cpu's perspective */ #define LOCAL_BUS_BASE 0x1c00000 #define LOCAL_BUS_SIZE (4 * 1024 * 1024) @@ -213,8 +396,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __ /* * Macros for converting between kernel virtual addresses, socket local physical * addresses, and UV global physical addresses. - * Note: use the standard __pa() & __va() macros for converting - * between socket virtual and socket physical addresses. + * Note: use the standard __pa() & __va() macros for converting + * between socket virtual and socket physical addresses. */ /* socket phys RAM --> UV global physical address */ @@ -222,7 +405,10 @@ static inline unsigned long uv_soc_phys_ { if (paddr < uv_hub_info->lowmem_remap_top) paddr |= uv_hub_info->lowmem_remap_base; - return paddr | uv_hub_info->gnode_upper; + paddr |= uv_hub_info->gnode_upper; + paddr = ((paddr << uv_hub_info_m_shift) >> uv_hub_info_m_shift) | + ((paddr >> uv_hub_info->m_val) << uv_hub_info_n_lshift); + return paddr; } @@ -232,10 +418,33 @@ static inline unsigned long uv_gpa(void return uv_soc_phys_ram_to_gpa(__pa(v)); } -/* gnode -> pnode */ +/* Top two bits indicate the requested address is in MMR space. */ +static inline int +uv_gpa_in_mmr_space(unsigned long gpa) +{ + return (gpa >> 62) == 0x3UL; +} + +/* UV global physical address --> socket phys RAM */ +static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa) +{ + unsigned long paddr; + unsigned long remap_base = uv_hub_info->lowmem_remap_base; + unsigned long remap_top = uv_hub_info->lowmem_remap_top; + + gpa = ((gpa << uv_hub_info_m_shift) >> uv_hub_info_m_shift) | + ((gpa >> uv_hub_info_n_lshift) << uv_hub_info->m_val); + paddr = gpa & uv_hub_info->gpa_mask; + if (paddr >= remap_base && paddr < remap_base + remap_top) + paddr -= remap_base; + return paddr; +} + + +/* gpa -> pnode */ static inline unsigned long uv_gpa_to_gnode(unsigned long gpa) { - return gpa >> uv_hub_info->m_val; + return gpa >> uv_hub_info_n_lshift; } /* gpa -> pnode */ @@ -246,6 +455,12 @@ static inline int uv_gpa_to_pnode(unsign return uv_gpa_to_gnode(gpa) & n_mask; } +/* gpa -> node offset*/ +static inline unsigned long uv_gpa_to_offset(unsigned long gpa) +{ + return (gpa << uv_hub_info_m_shift) >> uv_hub_info_m_shift; +} + /* pnode, offset --> socket virtual */ static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) { @@ -258,28 +473,36 @@ static inline void *uv_pnode_offset_to_v */ static inline int uv_apicid_to_pnode(int apicid) { - return (apicid >> UV_APIC_PNODE_SHIFT); + return (apicid >> uv_hub_info_apic_pnode_shift); +} + +/* + * Convert an apicid to the socket number on the blade + */ +static inline int uv_apicid_to_socket(int apicid) +{ + if (is_uv1_hub()) + return (apicid >> (uv_hub_info_apic_pnode_shift - 1)) & 1; + else + return 0; } /* * Access global MMRs using the low memory MMR32 space. This region supports * faster MMR access but not all MMRs are accessible in this space. */ -static inline unsigned long *uv_global_mmr32_address(int pnode, - unsigned long offset) +static inline unsigned long *uv_global_mmr32_address(int pnode, unsigned long offset) { return __va(UV_GLOBAL_MMR32_BASE | UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset); } -static inline void uv_write_global_mmr32(int pnode, unsigned long offset, - unsigned long val) +static inline void uv_write_global_mmr32(int pnode, unsigned long offset, unsigned long val) { writeq(val, uv_global_mmr32_address(pnode, offset)); } -static inline unsigned long uv_read_global_mmr32(int pnode, - unsigned long offset) +static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset) { return readq(uv_global_mmr32_address(pnode, offset)); } @@ -288,25 +511,42 @@ static inline unsigned long uv_read_glob * Access Global MMR space using the MMR space located at the top of physical * memory. */ -static inline unsigned long *uv_global_mmr64_address(int pnode, - unsigned long offset) +static inline volatile void __iomem *uv_global_mmr64_address(int pnode, unsigned long offset) { return __va(UV_GLOBAL_MMR64_BASE | UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset); } -static inline void uv_write_global_mmr64(int pnode, unsigned long offset, - unsigned long val) +static inline void uv_write_global_mmr64(int pnode, unsigned long offset, unsigned long val) { writeq(val, uv_global_mmr64_address(pnode, offset)); } -static inline unsigned long uv_read_global_mmr64(int pnode, - unsigned long offset) +static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset) { return readq(uv_global_mmr64_address(pnode, offset)); } +static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val) +{ + writeb(val, uv_global_mmr64_address(pnode, offset)); +} + +static inline unsigned char uv_read_global_mmr8(int pnode, unsigned long offset) +{ + return readb(uv_global_mmr64_address(pnode, offset)); +} + +/* + * Global MMR space addresses when referenced by the GRU. (GRU does + * NOT use socket addressing). + */ +static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset) +{ + return UV_GLOBAL_GRU_MMR_BASE | offset | + ((unsigned long)pnode << uv_hub_info->m_val); +} + /* * Access hub local MMRs. Faster than using global space but only local MMRs * are accessible. @@ -346,6 +586,16 @@ struct uv_blade_info { unsigned short pnode; short memory_nid; }; + +/* + * Upstream this is part of uv_blade_info, but that broke KABI for SGI modules, + * so we have a parallel data structure. + */ + +struct uv_blade_info_nmi { + spinlock_t nmi_lock; + unsigned long nmi_count; +}; extern struct uv_blade_info *uv_blade_info; extern short *uv_node_to_blade; extern short *uv_cpu_to_blade; @@ -426,14 +676,30 @@ static inline void uv_set_scir_bits(unsi } } +static inline unsigned long uv_scir_offset(int apicid) +{ + return SCIR_LOCAL_MMR_BASE | (apicid & 0x3f); +} + static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) { if (uv_cpu_hub_info(cpu)->scir.state != value) { + uv_write_global_mmr8(uv_cpu_to_pnode(cpu), + uv_cpu_hub_info(cpu)->scir.offset, value); uv_cpu_hub_info(cpu)->scir.state = value; - uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value); } } +extern unsigned int uv_apicid_hibits; +static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode) +{ + apicid |= uv_apicid_hibits; + return (1UL << UVH_IPI_INT_SEND_SHFT) | + ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) | + (mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | + (vector << UVH_IPI_INT_VECTOR_SHFT); +} + static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) { unsigned long val; @@ -442,12 +708,23 @@ static inline void uv_hub_send_ipi(int p if (vector == NMI_VECTOR) dmode = dest_NMI; - val = (1UL << UVH_IPI_INT_SEND_SHFT) | - ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) | - (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | - (vector << UVH_IPI_INT_VECTOR_SHFT); + val = uv_hub_ipi_value(apicid, vector, dmode); uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } +/* + * Get the minimum revision number of the hub chips within the partition. + * 1 - UV1 rev 1.0 initial silicon + * 2 - UV1 rev 2.0 production silicon + * 3 - UV2 rev 1.0 initial silicon + * 5 - UV3 rev 1.0 initial silicon + */ +static inline int uv_get_min_hub_revision_id(void) +{ + if (uv_hub_info_hub_revision == 0) + return 2; + return uv_hub_info_hub_revision; +} + #endif /* CONFIG_X86_64 */ #endif /* _ASM_X86_UV_UV_HUB_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/uv/uv_irq.h linux-2.6.32.ovz/arch/x86/include/asm/uv/uv_irq.h --- linux-2.6.32/arch/x86/include/asm/uv/uv_irq.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/uv/uv_irq.h 2015-03-10 13:21:10.000000000 -0700 @@ -25,12 +25,14 @@ struct uv_IO_APIC_route_entry { dest : 32; }; -extern struct irq_chip uv_irq_chip; - -extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long); -extern void arch_disable_uv_irq(int, unsigned long); +enum { + UV_AFFINITY_ALL, + UV_AFFINITY_NODE, + UV_AFFINITY_CPU +}; -extern int uv_setup_irq(char *, int, int, unsigned long); -extern void uv_teardown_irq(unsigned int, int, unsigned long); +extern int uv_irq_2_mmr_info(int, unsigned long *, int *); +extern int uv_setup_irq(char *, int, int, unsigned long, int); +extern void uv_teardown_irq(unsigned int); #endif /* _ASM_X86_UV_UV_IRQ_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/uv/uv_mmrs.h linux-2.6.32.ovz/arch/x86/include/asm/uv/uv_mmrs.h --- linux-2.6.32/arch/x86/include/asm/uv/uv_mmrs.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/uv/uv_mmrs.h 2015-03-10 13:24:14.000000000 -0700 @@ -1,4 +1,3 @@ - /* * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive @@ -6,284 +5,497 @@ * * SGI UV MMR definitions * - * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. */ #ifndef _ASM_X86_UV_UV_MMRS_H #define _ASM_X86_UV_UV_MMRS_H +/* + * This file contains MMR definitions for all UV hubs types. + * + * To minimize coding differences between hub types, the symbols are + * grouped by architecture types. + * + * UVH - definitions common to all UV hub types. + * UVXH - definitions common to all UV eXtended hub types (currently 2 & 3). + * UV1H - definitions specific to UV type 1 hub. + * UV2H - definitions specific to UV type 2 hub. + * UV3H - definitions specific to UV type 3 hub. + * + * So in general, MMR addresses and structures are identical on all hubs types. + * These MMRs are identified as: + * #define UVH_xxx
+ * union uvh_xxx { + * unsigned long v; + * struct uvh_int_cmpd_s { + * } s; + * }; + * + * If the MMR exists on all hub types but have different addresses: + * #define UV1Hxxx a + * #define UV2Hxxx b + * #define UV3Hxxx c + * #define UVHxxx (is_uv1_hub() ? UV1Hxxx : + * (is_uv2_hub() ? UV2Hxxx : + * UV3Hxxx)) + * + * If the MMR exists on all hub types > 1 but have different addresses: + * #define UV2Hxxx b + * #define UV3Hxxx c + * #define UVXHxxx (is_uv2_hub() ? UV2Hxxx : + * UV3Hxxx)) + * + * union uvh_xxx { + * unsigned long v; + * struct uvh_xxx_s { # Common fields only + * } s; + * struct uv1h_xxx_s { # Full UV1 definition (*) + * } s1; + * struct uv2h_xxx_s { # Full UV2 definition (*) + * } s2; + * struct uv3h_xxx_s { # Full UV3 definition (*) + * } s3; + * }; + * (* - if present and different than the common struct) + * + * Only essential differences are enumerated. For example, if the address is + * the same for all UV's, only a single #define is generated. Likewise, + * if the contents is the same for all hubs, only the "s" structure is + * generated. + * + * If the MMR exists on ONLY 1 type of hub, no generic definition is + * generated: + * #define UVnH_xxx + * union uvnh_xxx { + * unsigned long v; + * struct uvh_int_cmpd_s { + * } sn; + * }; + * + * (GEN Flags: mflags_opt= undefs=0 UV23=UVXH) + */ + #define UV_MMR_ENABLE (1UL << 63) +#define UV1_HUB_PART_NUMBER 0x88a5 +#define UV2_HUB_PART_NUMBER 0x8eb8 +#define UV2_HUB_PART_NUMBER_X 0x1111 +#define UV3_HUB_PART_NUMBER 0x9578 +#define UV3_HUB_PART_NUMBER_X 0x4321 + +/* Compat: Indicate which UV Hubs are supported. */ +#define UV2_HUB_IS_SUPPORTED 1 +#define UV3_HUB_IS_SUPPORTED 1 + +/* ========================================================================= */ +/* UVH_BAU_DATA_BROADCAST */ +/* ========================================================================= */ +#define UVH_BAU_DATA_BROADCAST 0x61688UL +#define UVH_BAU_DATA_BROADCAST_32 0x440 + +#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0 +#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL + +union uvh_bau_data_broadcast_u { + unsigned long v; + struct uvh_bau_data_broadcast_s { + unsigned long enable:1; /* RW */ + unsigned long rsvd_1_63:63; + } s; +}; + /* ========================================================================= */ /* UVH_BAU_DATA_CONFIG */ /* ========================================================================= */ -#define UVH_LB_BAU_MISC_CONTROL 0x320170UL -#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15 -#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16 -#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL -/* 1011 timebase 7 (168millisec) * 3 ticks -> 500ms */ #define UVH_BAU_DATA_CONFIG 0x61680UL -#define UVH_BAU_DATA_CONFIG_32 0x0438 +#define UVH_BAU_DATA_CONFIG_32 0x438 -#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0 -#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_BAU_DATA_CONFIG_DM_SHFT 8 -#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11 -#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12 -#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_BAU_DATA_CONFIG_P_SHFT 13 -#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_BAU_DATA_CONFIG_T_SHFT 15 -#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_BAU_DATA_CONFIG_M_SHFT 16 -#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32 -#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL +#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0 +#define UVH_BAU_DATA_CONFIG_DM_SHFT 8 +#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11 +#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12 +#define UVH_BAU_DATA_CONFIG_P_SHFT 13 +#define UVH_BAU_DATA_CONFIG_T_SHFT 15 +#define UVH_BAU_DATA_CONFIG_M_SHFT 16 +#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32 +#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL union uvh_bau_data_config_u { - unsigned long v; - struct uvh_bau_data_config_s { - unsigned long vector_ : 8; /* RW */ - unsigned long dm : 3; /* RW */ - unsigned long destmode : 1; /* RW */ - unsigned long status : 1; /* RO */ - unsigned long p : 1; /* RO */ - unsigned long rsvd_14 : 1; /* */ - unsigned long t : 1; /* RO */ - unsigned long m : 1; /* RW */ - unsigned long rsvd_17_31: 15; /* */ - unsigned long apic_id : 32; /* RW */ - } s; + unsigned long v; + struct uvh_bau_data_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s; }; /* ========================================================================= */ /* UVH_EVENT_OCCURRED0 */ /* ========================================================================= */ #define UVH_EVENT_OCCURRED0 0x70000UL -#define UVH_EVENT_OCCURRED0_32 0x005e8 +#define UVH_EVENT_OCCURRED0_32 0x5e8 + +#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0 +#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 +#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL +#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL + +#define UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT 1 +#define UV1H_EVENT_OCCURRED0_GR1_HCERR_SHFT 2 +#define UV1H_EVENT_OCCURRED0_LH_HCERR_SHFT 3 +#define UV1H_EVENT_OCCURRED0_RH_HCERR_SHFT 4 +#define UV1H_EVENT_OCCURRED0_XN_HCERR_SHFT 5 +#define UV1H_EVENT_OCCURRED0_SI_HCERR_SHFT 6 +#define UV1H_EVENT_OCCURRED0_LB_AOERR0_SHFT 7 +#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8 +#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9 +#define UV1H_EVENT_OCCURRED0_LH_AOERR0_SHFT 10 +#define UV1H_EVENT_OCCURRED0_XN_AOERR0_SHFT 12 +#define UV1H_EVENT_OCCURRED0_SI_AOERR0_SHFT 13 +#define UV1H_EVENT_OCCURRED0_LB_AOERR1_SHFT 14 +#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15 +#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16 +#define UV1H_EVENT_OCCURRED0_LH_AOERR1_SHFT 17 +#define UV1H_EVENT_OCCURRED0_RH_AOERR1_SHFT 18 +#define UV1H_EVENT_OCCURRED0_XN_AOERR1_SHFT 19 +#define UV1H_EVENT_OCCURRED0_SI_AOERR1_SHFT 20 +#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21 +#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37 +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38 +#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39 +#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40 +#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41 +#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42 +#define UV1H_EVENT_OCCURRED0_LTC_INT_SHFT 43 +#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44 +#define UV1H_EVENT_OCCURRED0_IPI_INT_SHFT 45 +#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46 +#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47 +#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48 +#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49 +#define UV1H_EVENT_OCCURRED0_PROFILE_INT_SHFT 50 +#define UV1H_EVENT_OCCURRED0_RTC0_SHFT 51 +#define UV1H_EVENT_OCCURRED0_RTC1_SHFT 52 +#define UV1H_EVENT_OCCURRED0_RTC2_SHFT 53 +#define UV1H_EVENT_OCCURRED0_RTC3_SHFT 54 +#define UV1H_EVENT_OCCURRED0_BAU_DATA_SHFT 55 +#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56 +#define UV1H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL +#define UV1H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL +#define UV1H_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL +#define UV1H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL +#define UV1H_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL +#define UV1H_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL +#define UV1H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL +#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL +#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL +#define UV1H_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL +#define UV1H_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL +#define UV1H_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL +#define UV1H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL +#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL +#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL +#define UV1H_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL +#define UV1H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL +#define UV1H_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL +#define UV1H_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL +#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL +#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL +#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL +#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL +#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL +#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL +#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL +#define UV1H_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL +#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL +#define UV1H_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL +#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL +#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL +#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL +#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL +#define UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL +#define UV1H_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL +#define UV1H_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL +#define UV1H_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL +#define UV1H_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL +#define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL +#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL + +#define UVXH_EVENT_OCCURRED0_QP_HCERR_SHFT 1 +#define UVXH_EVENT_OCCURRED0_RH_HCERR_SHFT 2 +#define UVXH_EVENT_OCCURRED0_LH0_HCERR_SHFT 3 +#define UVXH_EVENT_OCCURRED0_LH1_HCERR_SHFT 4 +#define UVXH_EVENT_OCCURRED0_GR0_HCERR_SHFT 5 +#define UVXH_EVENT_OCCURRED0_GR1_HCERR_SHFT 6 +#define UVXH_EVENT_OCCURRED0_NI0_HCERR_SHFT 7 +#define UVXH_EVENT_OCCURRED0_NI1_HCERR_SHFT 8 +#define UVXH_EVENT_OCCURRED0_LB_AOERR0_SHFT 9 +#define UVXH_EVENT_OCCURRED0_QP_AOERR0_SHFT 10 +#define UVXH_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12 +#define UVXH_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13 +#define UVXH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14 +#define UVXH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15 +#define UVXH_EVENT_OCCURRED0_XB_AOERR0_SHFT 16 +#define UVXH_EVENT_OCCURRED0_RT_AOERR0_SHFT 17 +#define UVXH_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18 +#define UVXH_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19 +#define UVXH_EVENT_OCCURRED0_LB_AOERR1_SHFT 20 +#define UVXH_EVENT_OCCURRED0_QP_AOERR1_SHFT 21 +#define UVXH_EVENT_OCCURRED0_RH_AOERR1_SHFT 22 +#define UVXH_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23 +#define UVXH_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24 +#define UVXH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25 +#define UVXH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26 +#define UVXH_EVENT_OCCURRED0_XB_AOERR1_SHFT 27 +#define UVXH_EVENT_OCCURRED0_RT_AOERR1_SHFT 28 +#define UVXH_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29 +#define UVXH_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30 +#define UVXH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46 +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47 +#define UVXH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48 +#define UVXH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49 +#define UVXH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50 +#define UVXH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51 +#define UVXH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52 +#define UVXH_EVENT_OCCURRED0_IPI_INT_SHFT 53 +#define UVXH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54 +#define UVXH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55 +#define UVXH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56 +#define UVXH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57 +#define UVXH_EVENT_OCCURRED0_PROFILE_INT_SHFT 58 +#define UVXH_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL +#define UVXH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL +#define UVXH_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL +#define UVXH_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL +#define UVXH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL +#define UVXH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL +#define UVXH_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL +#define UVXH_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL +#define UVXH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL +#define UVXH_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL +#define UVXH_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL +#define UVXH_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL +#define UVXH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL +#define UVXH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL +#define UVXH_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL +#define UVXH_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL +#define UVXH_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL +#define UVXH_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL +#define UVXH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL +#define UVXH_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL +#define UVXH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL +#define UVXH_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL +#define UVXH_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL +#define UVXH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL +#define UVXH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL +#define UVXH_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL +#define UVXH_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL +#define UVXH_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL +#define UVXH_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL +#define UVXH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL +#define UVXH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL +#define UVXH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL +#define UVXH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL +#define UVXH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL +#define UVXH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL +#define UVXH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL +#define UVXH_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL +#define UVXH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL +#define UVXH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL +#define UVXH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL +#define UVXH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL +#define UVXH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL -#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0 -#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL -#define UVH_EVENT_OCCURRED0_GR0_HCERR_SHFT 1 -#define UVH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL -#define UVH_EVENT_OCCURRED0_GR1_HCERR_SHFT 2 -#define UVH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL -#define UVH_EVENT_OCCURRED0_LH_HCERR_SHFT 3 -#define UVH_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL -#define UVH_EVENT_OCCURRED0_RH_HCERR_SHFT 4 -#define UVH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL -#define UVH_EVENT_OCCURRED0_XN_HCERR_SHFT 5 -#define UVH_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL -#define UVH_EVENT_OCCURRED0_SI_HCERR_SHFT 6 -#define UVH_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL -#define UVH_EVENT_OCCURRED0_LB_AOERR0_SHFT 7 -#define UVH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL -#define UVH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8 -#define UVH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL -#define UVH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9 -#define UVH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL -#define UVH_EVENT_OCCURRED0_LH_AOERR0_SHFT 10 -#define UVH_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL -#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 -#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL -#define UVH_EVENT_OCCURRED0_XN_AOERR0_SHFT 12 -#define UVH_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL -#define UVH_EVENT_OCCURRED0_SI_AOERR0_SHFT 13 -#define UVH_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL -#define UVH_EVENT_OCCURRED0_LB_AOERR1_SHFT 14 -#define UVH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL -#define UVH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15 -#define UVH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL -#define UVH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16 -#define UVH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL -#define UVH_EVENT_OCCURRED0_LH_AOERR1_SHFT 17 -#define UVH_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL -#define UVH_EVENT_OCCURRED0_RH_AOERR1_SHFT 18 -#define UVH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL -#define UVH_EVENT_OCCURRED0_XN_AOERR1_SHFT 19 -#define UVH_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL -#define UVH_EVENT_OCCURRED0_SI_AOERR1_SHFT 20 -#define UVH_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL -#define UVH_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21 -#define UVH_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL -#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22 -#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38 -#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL -#define UVH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39 -#define UVH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL -#define UVH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40 -#define UVH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL -#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41 -#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL -#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42 -#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL -#define UVH_EVENT_OCCURRED0_LTC_INT_SHFT 43 -#define UVH_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL -#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44 -#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL -#define UVH_EVENT_OCCURRED0_IPI_INT_SHFT 45 -#define UVH_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL -#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46 -#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL -#define UVH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47 -#define UVH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL -#define UVH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48 -#define UVH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL -#define UVH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49 -#define UVH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL -#define UVH_EVENT_OCCURRED0_PROFILE_INT_SHFT 50 -#define UVH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL -#define UVH_EVENT_OCCURRED0_RTC0_SHFT 51 -#define UVH_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL -#define UVH_EVENT_OCCURRED0_RTC1_SHFT 52 -#define UVH_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL -#define UVH_EVENT_OCCURRED0_RTC2_SHFT 53 -#define UVH_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL -#define UVH_EVENT_OCCURRED0_RTC3_SHFT 54 -#define UVH_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL -#define UVH_EVENT_OCCURRED0_BAU_DATA_SHFT 55 -#define UVH_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL -#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56 -#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL union uvh_event_occurred0_u { - unsigned long v; - struct uvh_event_occurred0_s { - unsigned long lb_hcerr : 1; /* RW, W1C */ - unsigned long gr0_hcerr : 1; /* RW, W1C */ - unsigned long gr1_hcerr : 1; /* RW, W1C */ - unsigned long lh_hcerr : 1; /* RW, W1C */ - unsigned long rh_hcerr : 1; /* RW, W1C */ - unsigned long xn_hcerr : 1; /* RW, W1C */ - unsigned long si_hcerr : 1; /* RW, W1C */ - unsigned long lb_aoerr0 : 1; /* RW, W1C */ - unsigned long gr0_aoerr0 : 1; /* RW, W1C */ - unsigned long gr1_aoerr0 : 1; /* RW, W1C */ - unsigned long lh_aoerr0 : 1; /* RW, W1C */ - unsigned long rh_aoerr0 : 1; /* RW, W1C */ - unsigned long xn_aoerr0 : 1; /* RW, W1C */ - unsigned long si_aoerr0 : 1; /* RW, W1C */ - unsigned long lb_aoerr1 : 1; /* RW, W1C */ - unsigned long gr0_aoerr1 : 1; /* RW, W1C */ - unsigned long gr1_aoerr1 : 1; /* RW, W1C */ - unsigned long lh_aoerr1 : 1; /* RW, W1C */ - unsigned long rh_aoerr1 : 1; /* RW, W1C */ - unsigned long xn_aoerr1 : 1; /* RW, W1C */ - unsigned long si_aoerr1 : 1; /* RW, W1C */ - unsigned long rh_vpi_int : 1; /* RW, W1C */ - unsigned long system_shutdown_int : 1; /* RW, W1C */ - unsigned long lb_irq_int_0 : 1; /* RW, W1C */ - unsigned long lb_irq_int_1 : 1; /* RW, W1C */ - unsigned long lb_irq_int_2 : 1; /* RW, W1C */ - unsigned long lb_irq_int_3 : 1; /* RW, W1C */ - unsigned long lb_irq_int_4 : 1; /* RW, W1C */ - unsigned long lb_irq_int_5 : 1; /* RW, W1C */ - unsigned long lb_irq_int_6 : 1; /* RW, W1C */ - unsigned long lb_irq_int_7 : 1; /* RW, W1C */ - unsigned long lb_irq_int_8 : 1; /* RW, W1C */ - unsigned long lb_irq_int_9 : 1; /* RW, W1C */ - unsigned long lb_irq_int_10 : 1; /* RW, W1C */ - unsigned long lb_irq_int_11 : 1; /* RW, W1C */ - unsigned long lb_irq_int_12 : 1; /* RW, W1C */ - unsigned long lb_irq_int_13 : 1; /* RW, W1C */ - unsigned long lb_irq_int_14 : 1; /* RW, W1C */ - unsigned long lb_irq_int_15 : 1; /* RW, W1C */ - unsigned long l1_nmi_int : 1; /* RW, W1C */ - unsigned long stop_clock : 1; /* RW, W1C */ - unsigned long asic_to_l1 : 1; /* RW, W1C */ - unsigned long l1_to_asic : 1; /* RW, W1C */ - unsigned long ltc_int : 1; /* RW, W1C */ - unsigned long la_seq_trigger : 1; /* RW, W1C */ - unsigned long ipi_int : 1; /* RW, W1C */ - unsigned long extio_int0 : 1; /* RW, W1C */ - unsigned long extio_int1 : 1; /* RW, W1C */ - unsigned long extio_int2 : 1; /* RW, W1C */ - unsigned long extio_int3 : 1; /* RW, W1C */ - unsigned long profile_int : 1; /* RW, W1C */ - unsigned long rtc0 : 1; /* RW, W1C */ - unsigned long rtc1 : 1; /* RW, W1C */ - unsigned long rtc2 : 1; /* RW, W1C */ - unsigned long rtc3 : 1; /* RW, W1C */ - unsigned long bau_data : 1; /* RW, W1C */ - unsigned long power_management_req : 1; /* RW, W1C */ - unsigned long rsvd_57_63 : 7; /* */ - } s; + unsigned long v; + struct uvh_event_occurred0_s { + unsigned long lb_hcerr:1; /* RW, W1C */ + unsigned long rsvd_1_10:10; + unsigned long rh_aoerr0:1; /* RW, W1C */ + unsigned long rsvd_12_63:52; + } s; + struct uvxh_event_occurred0_s { + unsigned long lb_hcerr:1; /* RW */ + unsigned long qp_hcerr:1; /* RW */ + unsigned long rh_hcerr:1; /* RW */ + unsigned long lh0_hcerr:1; /* RW */ + unsigned long lh1_hcerr:1; /* RW */ + unsigned long gr0_hcerr:1; /* RW */ + unsigned long gr1_hcerr:1; /* RW */ + unsigned long ni0_hcerr:1; /* RW */ + unsigned long ni1_hcerr:1; /* RW */ + unsigned long lb_aoerr0:1; /* RW */ + unsigned long qp_aoerr0:1; /* RW */ + unsigned long rh_aoerr0:1; /* RW */ + unsigned long lh0_aoerr0:1; /* RW */ + unsigned long lh1_aoerr0:1; /* RW */ + unsigned long gr0_aoerr0:1; /* RW */ + unsigned long gr1_aoerr0:1; /* RW */ + unsigned long xb_aoerr0:1; /* RW */ + unsigned long rt_aoerr0:1; /* RW */ + unsigned long ni0_aoerr0:1; /* RW */ + unsigned long ni1_aoerr0:1; /* RW */ + unsigned long lb_aoerr1:1; /* RW */ + unsigned long qp_aoerr1:1; /* RW */ + unsigned long rh_aoerr1:1; /* RW */ + unsigned long lh0_aoerr1:1; /* RW */ + unsigned long lh1_aoerr1:1; /* RW */ + unsigned long gr0_aoerr1:1; /* RW */ + unsigned long gr1_aoerr1:1; /* RW */ + unsigned long xb_aoerr1:1; /* RW */ + unsigned long rt_aoerr1:1; /* RW */ + unsigned long ni0_aoerr1:1; /* RW */ + unsigned long ni1_aoerr1:1; /* RW */ + unsigned long system_shutdown_int:1; /* RW */ + unsigned long lb_irq_int_0:1; /* RW */ + unsigned long lb_irq_int_1:1; /* RW */ + unsigned long lb_irq_int_2:1; /* RW */ + unsigned long lb_irq_int_3:1; /* RW */ + unsigned long lb_irq_int_4:1; /* RW */ + unsigned long lb_irq_int_5:1; /* RW */ + unsigned long lb_irq_int_6:1; /* RW */ + unsigned long lb_irq_int_7:1; /* RW */ + unsigned long lb_irq_int_8:1; /* RW */ + unsigned long lb_irq_int_9:1; /* RW */ + unsigned long lb_irq_int_10:1; /* RW */ + unsigned long lb_irq_int_11:1; /* RW */ + unsigned long lb_irq_int_12:1; /* RW */ + unsigned long lb_irq_int_13:1; /* RW */ + unsigned long lb_irq_int_14:1; /* RW */ + unsigned long lb_irq_int_15:1; /* RW */ + unsigned long l1_nmi_int:1; /* RW */ + unsigned long stop_clock:1; /* RW */ + unsigned long asic_to_l1:1; /* RW */ + unsigned long l1_to_asic:1; /* RW */ + unsigned long la_seq_trigger:1; /* RW */ + unsigned long ipi_int:1; /* RW */ + unsigned long extio_int0:1; /* RW */ + unsigned long extio_int1:1; /* RW */ + unsigned long extio_int2:1; /* RW */ + unsigned long extio_int3:1; /* RW */ + unsigned long profile_int:1; /* RW */ + unsigned long rsvd_59_63:5; + } sx; }; /* ========================================================================= */ /* UVH_EVENT_OCCURRED0_ALIAS */ /* ========================================================================= */ -#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL -#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0 +#define UVH_EVENT_OCCURRED0_ALIAS 0x70008UL +#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0 + /* ========================================================================= */ /* UVH_GR0_TLB_INT0_CONFIG */ /* ========================================================================= */ #define UVH_GR0_TLB_INT0_CONFIG 0x61b00UL -#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0 -#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8 -#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11 -#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12 -#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13 -#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15 -#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16 -#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32 -#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL +#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_SHFT 0 +#define UVH_GR0_TLB_INT0_CONFIG_DM_SHFT 8 +#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_SHFT 11 +#define UVH_GR0_TLB_INT0_CONFIG_STATUS_SHFT 12 +#define UVH_GR0_TLB_INT0_CONFIG_P_SHFT 13 +#define UVH_GR0_TLB_INT0_CONFIG_T_SHFT 15 +#define UVH_GR0_TLB_INT0_CONFIG_M_SHFT 16 +#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_SHFT 32 +#define UVH_GR0_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_GR0_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_GR0_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_GR0_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_GR0_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_GR0_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_GR0_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_GR0_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL union uvh_gr0_tlb_int0_config_u { - unsigned long v; - struct uvh_gr0_tlb_int0_config_s { - unsigned long vector_ : 8; /* RW */ - unsigned long dm : 3; /* RW */ - unsigned long destmode : 1; /* RW */ - unsigned long status : 1; /* RO */ - unsigned long p : 1; /* RO */ - unsigned long rsvd_14 : 1; /* */ - unsigned long t : 1; /* RO */ - unsigned long m : 1; /* RW */ - unsigned long rsvd_17_31: 15; /* */ - unsigned long apic_id : 32; /* RW */ - } s; + unsigned long v; + struct uvh_gr0_tlb_int0_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s; }; /* ========================================================================= */ @@ -291,37 +503,403 @@ union uvh_gr0_tlb_int0_config_u { /* ========================================================================= */ #define UVH_GR0_TLB_INT1_CONFIG 0x61b40UL -#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0 -#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8 -#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11 -#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12 -#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13 -#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15 -#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16 -#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32 -#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL +#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_SHFT 0 +#define UVH_GR0_TLB_INT1_CONFIG_DM_SHFT 8 +#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_SHFT 11 +#define UVH_GR0_TLB_INT1_CONFIG_STATUS_SHFT 12 +#define UVH_GR0_TLB_INT1_CONFIG_P_SHFT 13 +#define UVH_GR0_TLB_INT1_CONFIG_T_SHFT 15 +#define UVH_GR0_TLB_INT1_CONFIG_M_SHFT 16 +#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_SHFT 32 +#define UVH_GR0_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_GR0_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_GR0_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_GR0_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_GR0_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_GR0_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_GR0_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_GR0_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL union uvh_gr0_tlb_int1_config_u { - unsigned long v; - struct uvh_gr0_tlb_int1_config_s { - unsigned long vector_ : 8; /* RW */ - unsigned long dm : 3; /* RW */ - unsigned long destmode : 1; /* RW */ - unsigned long status : 1; /* RO */ - unsigned long p : 1; /* RO */ - unsigned long rsvd_14 : 1; /* */ - unsigned long t : 1; /* RO */ - unsigned long m : 1; /* RW */ - unsigned long rsvd_17_31: 15; /* */ - unsigned long apic_id : 32; /* RW */ - } s; + unsigned long v; + struct uvh_gr0_tlb_int1_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_GR0_TLB_MMR_CONTROL */ +/* ========================================================================= */ +#define UV1H_GR0_TLB_MMR_CONTROL 0x401080UL +#define UV2H_GR0_TLB_MMR_CONTROL 0xc01080UL +#define UV3H_GR0_TLB_MMR_CONTROL 0xc01080UL +#define UVH_GR0_TLB_MMR_CONTROL \ + (is_uv1_hub() ? UV1H_GR0_TLB_MMR_CONTROL : \ + (is_uv2_hub() ? UV2H_GR0_TLB_MMR_CONTROL : \ + UV3H_GR0_TLB_MMR_CONTROL)) + +#define UVH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UVH_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UVH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UVH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UVH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UVH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UVH_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL + +#define UV1H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UV1H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48 +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52 +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_SHFT 54 +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_SHFT 56 +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_SHFT 60 +#define UV1H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UV1H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UV1H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_MASK 0x0040000000000000UL +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL +#define UV1H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL + +#define UVXH_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UVXH_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UVXH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UVXH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UVXH_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UVXH_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UVXH_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define UVXH_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UVXH_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UVXH_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UVXH_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UVXH_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UVXH_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UVXH_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL + +#define UV2H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48 +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52 +#define UV2H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UV2H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UV2H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL +#define UV2H_GR0_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL + +#define UV3H_GR0_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UV3H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV3H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV3H_GR0_TLB_MMR_CONTROL_ECC_SEL_SHFT 21 +#define UV3H_GR0_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV3H_GR0_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV3H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define UV3H_GR0_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UV3H_GR0_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UV3H_GR0_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV3H_GR0_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV3H_GR0_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL +#define UV3H_GR0_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV3H_GR0_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV3H_GR0_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL + +union uvh_gr0_tlb_mmr_control_u { + unsigned long v; + struct uvh_gr0_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long rsvd_21_29:9; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long rsvd_32_48:17; + unsigned long rsvd_49_51:3; + unsigned long rsvd_52_63:12; + } s; + struct uv1h_gr0_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long rsvd_21_29:9; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long rsvd_32_47:16; + unsigned long mmr_inj_con:1; /* RW */ + unsigned long rsvd_49_51:3; + unsigned long mmr_inj_tlbram:1; /* RW */ + unsigned long rsvd_53:1; + unsigned long mmr_inj_tlbpgsize:1; /* RW */ + unsigned long rsvd_55:1; + unsigned long mmr_inj_tlbrreg:1; /* RW */ + unsigned long rsvd_57_59:3; + unsigned long mmr_inj_tlblruv:1; /* RW */ + unsigned long rsvd_61_63:3; + } s1; + struct uvxh_gr0_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long rsvd_21_29:9; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long rsvd_48:1; + unsigned long rsvd_49_51:3; + unsigned long rsvd_52:1; + unsigned long rsvd_53_63:11; + } sx; + struct uv2h_gr0_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long rsvd_21_29:9; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long mmr_inj_con:1; /* RW */ + unsigned long rsvd_49_51:3; + unsigned long mmr_inj_tlbram:1; /* RW */ + unsigned long rsvd_53_63:11; + } s2; + struct uv3h_gr0_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long ecc_sel:1; /* RW */ + unsigned long rsvd_22_29:8; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long undef_48:1; /* Undefined */ + unsigned long rsvd_49_51:3; + unsigned long undef_52:1; /* Undefined */ + unsigned long rsvd_53_63:11; + } s3; +}; + +/* ========================================================================= */ +/* UVH_GR0_TLB_MMR_READ_DATA_HI */ +/* ========================================================================= */ +#define UV1H_GR0_TLB_MMR_READ_DATA_HI 0x4010a0UL +#define UV2H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL +#define UV3H_GR0_TLB_MMR_READ_DATA_HI 0xc010a0UL +#define UVH_GR0_TLB_MMR_READ_DATA_HI \ + (is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_HI : \ + (is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_HI : \ + UV3H_GR0_TLB_MMR_READ_DATA_HI)) + +#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UVH_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UVH_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UVH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UVH_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL + +#define UV1H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV1H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UV1H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UV1H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UV1H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UV1H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UV1H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UV1H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL + +#define UVXH_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UVXH_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UVXH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UVXH_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UVXH_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UVXH_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UVXH_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UVXH_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL + +#define UV2H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV2H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UV2H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UV2H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UV2H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UV2H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UV2H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UV2H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL + +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 45 +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55 +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0000200000000000UL +#define UV3H_GR0_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL + +union uvh_gr0_tlb_mmr_read_data_hi_u { + unsigned long v; + struct uvh_gr0_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long rsvd_45_63:19; + } s; + struct uv1h_gr0_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long rsvd_45_63:19; + } s1; + struct uvxh_gr0_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long rsvd_45_63:19; + } sx; + struct uv2h_gr0_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long rsvd_45_63:19; + } s2; + struct uv3h_gr0_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long aa_ext:1; /* RO */ + unsigned long undef_46_54:9; /* Undefined */ + unsigned long way_ecc:9; /* RO */ + } s3; +}; + +/* ========================================================================= */ +/* UVH_GR0_TLB_MMR_READ_DATA_LO */ +/* ========================================================================= */ +#define UV1H_GR0_TLB_MMR_READ_DATA_LO 0x4010a8UL +#define UV2H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL +#define UV3H_GR0_TLB_MMR_READ_DATA_LO 0xc010a8UL +#define UVH_GR0_TLB_MMR_READ_DATA_LO \ + (is_uv1_hub() ? UV1H_GR0_TLB_MMR_READ_DATA_LO : \ + (is_uv2_hub() ? UV2H_GR0_TLB_MMR_READ_DATA_LO : \ + UV3H_GR0_TLB_MMR_READ_DATA_LO)) + +#define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UVH_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UVH_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UVH_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV1H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV1H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV1H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UVXH_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UVXH_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UVXH_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV2H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV2H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV2H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV3H_GR0_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV3H_GR0_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV3H_GR0_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +union uvh_gr0_tlb_mmr_read_data_lo_u { + unsigned long v; + struct uvh_gr0_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s; + struct uv1h_gr0_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s1; + struct uvxh_gr0_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } sx; + struct uv2h_gr0_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s2; + struct uv3h_gr0_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s3; }; /* ========================================================================= */ @@ -329,37 +907,37 @@ union uvh_gr0_tlb_int1_config_u { /* ========================================================================= */ #define UVH_GR1_TLB_INT0_CONFIG 0x61f00UL -#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0 -#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8 -#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11 -#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12 -#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13 -#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15 -#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16 -#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32 -#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL +#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_SHFT 0 +#define UVH_GR1_TLB_INT0_CONFIG_DM_SHFT 8 +#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_SHFT 11 +#define UVH_GR1_TLB_INT0_CONFIG_STATUS_SHFT 12 +#define UVH_GR1_TLB_INT0_CONFIG_P_SHFT 13 +#define UVH_GR1_TLB_INT0_CONFIG_T_SHFT 15 +#define UVH_GR1_TLB_INT0_CONFIG_M_SHFT 16 +#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_SHFT 32 +#define UVH_GR1_TLB_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_GR1_TLB_INT0_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_GR1_TLB_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_GR1_TLB_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_GR1_TLB_INT0_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_GR1_TLB_INT0_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_GR1_TLB_INT0_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_GR1_TLB_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL union uvh_gr1_tlb_int0_config_u { - unsigned long v; - struct uvh_gr1_tlb_int0_config_s { - unsigned long vector_ : 8; /* RW */ - unsigned long dm : 3; /* RW */ - unsigned long destmode : 1; /* RW */ - unsigned long status : 1; /* RO */ - unsigned long p : 1; /* RO */ - unsigned long rsvd_14 : 1; /* */ - unsigned long t : 1; /* RO */ - unsigned long m : 1; /* RW */ - unsigned long rsvd_17_31: 15; /* */ - unsigned long apic_id : 32; /* RW */ - } s; + unsigned long v; + struct uvh_gr1_tlb_int0_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s; }; /* ========================================================================= */ @@ -367,37 +945,403 @@ union uvh_gr1_tlb_int0_config_u { /* ========================================================================= */ #define UVH_GR1_TLB_INT1_CONFIG 0x61f40UL -#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0 -#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8 -#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11 -#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12 -#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13 -#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15 -#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16 -#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32 -#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL +#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_SHFT 0 +#define UVH_GR1_TLB_INT1_CONFIG_DM_SHFT 8 +#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_SHFT 11 +#define UVH_GR1_TLB_INT1_CONFIG_STATUS_SHFT 12 +#define UVH_GR1_TLB_INT1_CONFIG_P_SHFT 13 +#define UVH_GR1_TLB_INT1_CONFIG_T_SHFT 15 +#define UVH_GR1_TLB_INT1_CONFIG_M_SHFT 16 +#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_SHFT 32 +#define UVH_GR1_TLB_INT1_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_GR1_TLB_INT1_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_GR1_TLB_INT1_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_GR1_TLB_INT1_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_GR1_TLB_INT1_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_GR1_TLB_INT1_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_GR1_TLB_INT1_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_GR1_TLB_INT1_CONFIG_APIC_ID_MASK 0xffffffff00000000UL union uvh_gr1_tlb_int1_config_u { - unsigned long v; - struct uvh_gr1_tlb_int1_config_s { - unsigned long vector_ : 8; /* RW */ - unsigned long dm : 3; /* RW */ - unsigned long destmode : 1; /* RW */ - unsigned long status : 1; /* RO */ - unsigned long p : 1; /* RO */ - unsigned long rsvd_14 : 1; /* */ - unsigned long t : 1; /* RO */ - unsigned long m : 1; /* RW */ - unsigned long rsvd_17_31: 15; /* */ - unsigned long apic_id : 32; /* RW */ - } s; + unsigned long v; + struct uvh_gr1_tlb_int1_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_GR1_TLB_MMR_CONTROL */ +/* ========================================================================= */ +#define UV1H_GR1_TLB_MMR_CONTROL 0x801080UL +#define UV2H_GR1_TLB_MMR_CONTROL 0x1001080UL +#define UV3H_GR1_TLB_MMR_CONTROL 0x1001080UL +#define UVH_GR1_TLB_MMR_CONTROL \ + (is_uv1_hub() ? UV1H_GR1_TLB_MMR_CONTROL : \ + (is_uv2_hub() ? UV2H_GR1_TLB_MMR_CONTROL : \ + UV3H_GR1_TLB_MMR_CONTROL)) + +#define UVH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UVH_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UVH_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UVH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UVH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UVH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UVH_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL + +#define UV1H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV1H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UV1H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48 +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52 +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_SHFT 54 +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_SHFT 56 +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_SHFT 60 +#define UV1H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UV1H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UV1H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBPGSIZE_MASK 0x0040000000000000UL +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRREG_MASK 0x0100000000000000UL +#define UV1H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBLRUV_MASK 0x1000000000000000UL + +#define UVXH_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UVXH_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UVXH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UVXH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UVXH_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UVXH_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UVXH_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define UVXH_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UVXH_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UVXH_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UVXH_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UVXH_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UVXH_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UVXH_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL + +#define UV2H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_SHFT 48 +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_SHFT 52 +#define UV2H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UV2H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UV2H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_CON_MASK 0x0001000000000000UL +#define UV2H_GR1_TLB_MMR_CONTROL_MMR_INJ_TLBRAM_MASK 0x0010000000000000UL + +#define UV3H_GR1_TLB_MMR_CONTROL_INDEX_SHFT 0 +#define UV3H_GR1_TLB_MMR_CONTROL_MEM_SEL_SHFT 12 +#define UV3H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_SHFT 16 +#define UV3H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_SHFT 20 +#define UV3H_GR1_TLB_MMR_CONTROL_ECC_SEL_SHFT 21 +#define UV3H_GR1_TLB_MMR_CONTROL_MMR_WRITE_SHFT 30 +#define UV3H_GR1_TLB_MMR_CONTROL_MMR_READ_SHFT 31 +#define UV3H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_SHFT 32 +#define UV3H_GR1_TLB_MMR_CONTROL_INDEX_MASK 0x0000000000000fffUL +#define UV3H_GR1_TLB_MMR_CONTROL_MEM_SEL_MASK 0x0000000000003000UL +#define UV3H_GR1_TLB_MMR_CONTROL_AUTO_VALID_EN_MASK 0x0000000000010000UL +#define UV3H_GR1_TLB_MMR_CONTROL_MMR_HASH_INDEX_EN_MASK 0x0000000000100000UL +#define UV3H_GR1_TLB_MMR_CONTROL_ECC_SEL_MASK 0x0000000000200000UL +#define UV3H_GR1_TLB_MMR_CONTROL_MMR_WRITE_MASK 0x0000000040000000UL +#define UV3H_GR1_TLB_MMR_CONTROL_MMR_READ_MASK 0x0000000080000000UL +#define UV3H_GR1_TLB_MMR_CONTROL_MMR_OP_DONE_MASK 0x0000000100000000UL + +union uvh_gr1_tlb_mmr_control_u { + unsigned long v; + struct uvh_gr1_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long rsvd_21_29:9; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long rsvd_32_48:17; + unsigned long rsvd_49_51:3; + unsigned long rsvd_52_63:12; + } s; + struct uv1h_gr1_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long rsvd_21_29:9; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long rsvd_32_47:16; + unsigned long mmr_inj_con:1; /* RW */ + unsigned long rsvd_49_51:3; + unsigned long mmr_inj_tlbram:1; /* RW */ + unsigned long rsvd_53:1; + unsigned long mmr_inj_tlbpgsize:1; /* RW */ + unsigned long rsvd_55:1; + unsigned long mmr_inj_tlbrreg:1; /* RW */ + unsigned long rsvd_57_59:3; + unsigned long mmr_inj_tlblruv:1; /* RW */ + unsigned long rsvd_61_63:3; + } s1; + struct uvxh_gr1_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long rsvd_21_29:9; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long rsvd_48:1; + unsigned long rsvd_49_51:3; + unsigned long rsvd_52:1; + unsigned long rsvd_53_63:11; + } sx; + struct uv2h_gr1_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long rsvd_21_29:9; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long mmr_inj_con:1; /* RW */ + unsigned long rsvd_49_51:3; + unsigned long mmr_inj_tlbram:1; /* RW */ + unsigned long rsvd_53_63:11; + } s2; + struct uv3h_gr1_tlb_mmr_control_s { + unsigned long index:12; /* RW */ + unsigned long mem_sel:2; /* RW */ + unsigned long rsvd_14_15:2; + unsigned long auto_valid_en:1; /* RW */ + unsigned long rsvd_17_19:3; + unsigned long mmr_hash_index_en:1; /* RW */ + unsigned long ecc_sel:1; /* RW */ + unsigned long rsvd_22_29:8; + unsigned long mmr_write:1; /* WP */ + unsigned long mmr_read:1; /* WP */ + unsigned long mmr_op_done:1; /* RW */ + unsigned long rsvd_33_47:15; + unsigned long undef_48:1; /* Undefined */ + unsigned long rsvd_49_51:3; + unsigned long undef_52:1; /* Undefined */ + unsigned long rsvd_53_63:11; + } s3; +}; + +/* ========================================================================= */ +/* UVH_GR1_TLB_MMR_READ_DATA_HI */ +/* ========================================================================= */ +#define UV1H_GR1_TLB_MMR_READ_DATA_HI 0x8010a0UL +#define UV2H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL +#define UV3H_GR1_TLB_MMR_READ_DATA_HI 0x10010a0UL +#define UVH_GR1_TLB_MMR_READ_DATA_HI \ + (is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_HI : \ + (is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_HI : \ + UV3H_GR1_TLB_MMR_READ_DATA_HI)) + +#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UVH_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UVH_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UVH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UVH_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL + +#define UV1H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV1H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UV1H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UV1H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UV1H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UV1H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UV1H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UV1H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL + +#define UVXH_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UVXH_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UVXH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UVXH_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UVXH_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UVXH_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UVXH_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UVXH_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL + +#define UV2H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV2H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UV2H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UV2H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UV2H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UV2H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UV2H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UV2H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL + +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_PFN_SHFT 0 +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_GAA_SHFT 41 +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_SHFT 43 +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_LARGER_SHFT 44 +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_SHFT 45 +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_SHFT 55 +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_PFN_MASK 0x000001ffffffffffUL +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_GAA_MASK 0x0000060000000000UL +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_DIRTY_MASK 0x0000080000000000UL +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_LARGER_MASK 0x0000100000000000UL +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_AA_EXT_MASK 0x0000200000000000UL +#define UV3H_GR1_TLB_MMR_READ_DATA_HI_WAY_ECC_MASK 0xff80000000000000UL + +union uvh_gr1_tlb_mmr_read_data_hi_u { + unsigned long v; + struct uvh_gr1_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long rsvd_45_63:19; + } s; + struct uv1h_gr1_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long rsvd_45_63:19; + } s1; + struct uvxh_gr1_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long rsvd_45_63:19; + } sx; + struct uv2h_gr1_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long rsvd_45_63:19; + } s2; + struct uv3h_gr1_tlb_mmr_read_data_hi_s { + unsigned long pfn:41; /* RO */ + unsigned long gaa:2; /* RO */ + unsigned long dirty:1; /* RO */ + unsigned long larger:1; /* RO */ + unsigned long aa_ext:1; /* RO */ + unsigned long undef_46_54:9; /* Undefined */ + unsigned long way_ecc:9; /* RO */ + } s3; +}; + +/* ========================================================================= */ +/* UVH_GR1_TLB_MMR_READ_DATA_LO */ +/* ========================================================================= */ +#define UV1H_GR1_TLB_MMR_READ_DATA_LO 0x8010a8UL +#define UV2H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL +#define UV3H_GR1_TLB_MMR_READ_DATA_LO 0x10010a8UL +#define UVH_GR1_TLB_MMR_READ_DATA_LO \ + (is_uv1_hub() ? UV1H_GR1_TLB_MMR_READ_DATA_LO : \ + (is_uv2_hub() ? UV2H_GR1_TLB_MMR_READ_DATA_LO : \ + UV3H_GR1_TLB_MMR_READ_DATA_LO)) + +#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UVH_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UVH_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UVH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV1H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV1H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV1H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UVXH_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UVXH_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UVXH_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV2H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV2H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV2H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VPN_SHFT 0 +#define UV3H_GR1_TLB_MMR_READ_DATA_LO_ASID_SHFT 39 +#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VALID_SHFT 63 +#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VPN_MASK 0x0000007fffffffffUL +#define UV3H_GR1_TLB_MMR_READ_DATA_LO_ASID_MASK 0x7fffff8000000000UL +#define UV3H_GR1_TLB_MMR_READ_DATA_LO_VALID_MASK 0x8000000000000000UL + +union uvh_gr1_tlb_mmr_read_data_lo_u { + unsigned long v; + struct uvh_gr1_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s; + struct uv1h_gr1_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s1; + struct uvxh_gr1_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } sx; + struct uv2h_gr1_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s2; + struct uv3h_gr1_tlb_mmr_read_data_lo_s { + unsigned long vpn:39; /* RO */ + unsigned long asid:24; /* RO */ + unsigned long valid:1; /* RO */ + } s3; }; /* ========================================================================= */ @@ -405,15 +1349,15 @@ union uvh_gr1_tlb_int1_config_u { /* ========================================================================= */ #define UVH_INT_CMPB 0x22080UL -#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0 -#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL +#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0 +#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL union uvh_int_cmpb_u { - unsigned long v; - struct uvh_int_cmpb_s { - unsigned long real_time_cmpb : 56; /* RW */ - unsigned long rsvd_56_63 : 8; /* */ - } s; + unsigned long v; + struct uvh_int_cmpb_s { + unsigned long real_time_cmpb:56; /* RW */ + unsigned long rsvd_56_63:8; + } s; }; /* ========================================================================= */ @@ -421,15 +1365,18 @@ union uvh_int_cmpb_u { /* ========================================================================= */ #define UVH_INT_CMPC 0x22100UL -#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0 -#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL +#define UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT 0 +#define UV1H_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL + +#define UVXH_INT_CMPC_REAL_TIME_CMP_2_SHFT 0 +#define UVXH_INT_CMPC_REAL_TIME_CMP_2_MASK 0x00ffffffffffffffUL union uvh_int_cmpc_u { - unsigned long v; - struct uvh_int_cmpc_s { - unsigned long real_time_cmpc : 56; /* RW */ - unsigned long rsvd_56_63 : 8; /* */ - } s; + unsigned long v; + struct uvh_int_cmpc_s { + unsigned long real_time_cmpc:56; /* RW */ + unsigned long rsvd_56_63:8; + } s; }; /* ========================================================================= */ @@ -437,611 +1384,742 @@ union uvh_int_cmpc_u { /* ========================================================================= */ #define UVH_INT_CMPD 0x22180UL -#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0 -#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL +#define UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT 0 +#define UV1H_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL + +#define UVXH_INT_CMPD_REAL_TIME_CMP_3_SHFT 0 +#define UVXH_INT_CMPD_REAL_TIME_CMP_3_MASK 0x00ffffffffffffffUL union uvh_int_cmpd_u { - unsigned long v; - struct uvh_int_cmpd_s { - unsigned long real_time_cmpd : 56; /* RW */ - unsigned long rsvd_56_63 : 8; /* */ - } s; + unsigned long v; + struct uvh_int_cmpd_s { + unsigned long real_time_cmpd:56; /* RW */ + unsigned long rsvd_56_63:8; + } s; }; /* ========================================================================= */ /* UVH_IPI_INT */ /* ========================================================================= */ #define UVH_IPI_INT 0x60500UL -#define UVH_IPI_INT_32 0x0348 +#define UVH_IPI_INT_32 0x348 -#define UVH_IPI_INT_VECTOR_SHFT 0 -#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL -#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8 -#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL -#define UVH_IPI_INT_DESTMODE_SHFT 11 -#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL -#define UVH_IPI_INT_APIC_ID_SHFT 16 -#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL -#define UVH_IPI_INT_SEND_SHFT 63 -#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL +#define UVH_IPI_INT_VECTOR_SHFT 0 +#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8 +#define UVH_IPI_INT_DESTMODE_SHFT 11 +#define UVH_IPI_INT_APIC_ID_SHFT 16 +#define UVH_IPI_INT_SEND_SHFT 63 +#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL +#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL +#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL +#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL +#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL union uvh_ipi_int_u { - unsigned long v; - struct uvh_ipi_int_s { - unsigned long vector_ : 8; /* RW */ - unsigned long delivery_mode : 3; /* RW */ - unsigned long destmode : 1; /* RW */ - unsigned long rsvd_12_15 : 4; /* */ - unsigned long apic_id : 32; /* RW */ - unsigned long rsvd_48_62 : 15; /* */ - unsigned long send : 1; /* WP */ - } s; + unsigned long v; + struct uvh_ipi_int_s { + unsigned long vector_:8; /* RW */ + unsigned long delivery_mode:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long rsvd_12_15:4; + unsigned long apic_id:32; /* RW */ + unsigned long rsvd_48_62:15; + unsigned long send:1; /* WP */ + } s; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x009c0 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0 #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL union uvh_lb_bau_intd_payload_queue_first_u { - unsigned long v; - struct uvh_lb_bau_intd_payload_queue_first_s { - unsigned long rsvd_0_3: 4; /* */ - unsigned long address : 39; /* RW */ - unsigned long rsvd_43_48: 6; /* */ - unsigned long node_id : 14; /* RW */ - unsigned long rsvd_63 : 1; /* */ - } s; + unsigned long v; + struct uvh_lb_bau_intd_payload_queue_first_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_48:6; + unsigned long node_id:14; /* RW */ + unsigned long rsvd_63:1; + } s; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x009c8 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL union uvh_lb_bau_intd_payload_queue_last_u { - unsigned long v; - struct uvh_lb_bau_intd_payload_queue_last_s { - unsigned long rsvd_0_3: 4; /* */ - unsigned long address : 39; /* RW */ - unsigned long rsvd_43_63: 21; /* */ - } s; + unsigned long v; + struct uvh_lb_bau_intd_payload_queue_last_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_63:21; + } s; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x009d0 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 -#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 +#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL union uvh_lb_bau_intd_payload_queue_tail_u { - unsigned long v; - struct uvh_lb_bau_intd_payload_queue_tail_s { - unsigned long rsvd_0_3: 4; /* */ - unsigned long address : 39; /* RW */ - unsigned long rsvd_43_63: 21; /* */ - } s; + unsigned long v; + struct uvh_lb_bau_intd_payload_queue_tail_s { + unsigned long rsvd_0_3:4; + unsigned long address:39; /* RW */ + unsigned long rsvd_43_63:21; + } s; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */ /* ========================================================================= */ #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0a68 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68 #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14 -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL #define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL + union uvh_lb_bau_intd_software_acknowledge_u { - unsigned long v; - struct uvh_lb_bau_intd_software_acknowledge_s { - unsigned long pending_0 : 1; /* RW, W1C */ - unsigned long pending_1 : 1; /* RW, W1C */ - unsigned long pending_2 : 1; /* RW, W1C */ - unsigned long pending_3 : 1; /* RW, W1C */ - unsigned long pending_4 : 1; /* RW, W1C */ - unsigned long pending_5 : 1; /* RW, W1C */ - unsigned long pending_6 : 1; /* RW, W1C */ - unsigned long pending_7 : 1; /* RW, W1C */ - unsigned long timeout_0 : 1; /* RW, W1C */ - unsigned long timeout_1 : 1; /* RW, W1C */ - unsigned long timeout_2 : 1; /* RW, W1C */ - unsigned long timeout_3 : 1; /* RW, W1C */ - unsigned long timeout_4 : 1; /* RW, W1C */ - unsigned long timeout_5 : 1; /* RW, W1C */ - unsigned long timeout_6 : 1; /* RW, W1C */ - unsigned long timeout_7 : 1; /* RW, W1C */ - unsigned long rsvd_16_63: 48; /* */ - } s; + unsigned long v; + struct uvh_lb_bau_intd_software_acknowledge_s { + unsigned long pending_0:1; /* RW, W1C */ + unsigned long pending_1:1; /* RW, W1C */ + unsigned long pending_2:1; /* RW, W1C */ + unsigned long pending_3:1; /* RW, W1C */ + unsigned long pending_4:1; /* RW, W1C */ + unsigned long pending_5:1; /* RW, W1C */ + unsigned long pending_6:1; /* RW, W1C */ + unsigned long pending_7:1; /* RW, W1C */ + unsigned long timeout_0:1; /* RW, W1C */ + unsigned long timeout_1:1; /* RW, W1C */ + unsigned long timeout_2:1; /* RW, W1C */ + unsigned long timeout_3:1; /* RW, W1C */ + unsigned long timeout_4:1; /* RW, W1C */ + unsigned long timeout_5:1; /* RW, W1C */ + unsigned long timeout_6:1; /* RW, W1C */ + unsigned long timeout_7:1; /* RW, W1C */ + unsigned long rsvd_16_63:48; + } s; }; /* ========================================================================= */ /* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */ /* ========================================================================= */ -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL -#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70 +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x320088UL +#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70 + + +/* ========================================================================= */ +/* UVH_LB_BAU_MISC_CONTROL */ +/* ========================================================================= */ +#define UVH_LB_BAU_MISC_CONTROL 0x320170UL +#define UV1H_LB_BAU_MISC_CONTROL 0x320170UL +#define UV2H_LB_BAU_MISC_CONTROL 0x320170UL +#define UV3H_LB_BAU_MISC_CONTROL 0x320170UL +#define UVH_LB_BAU_MISC_CONTROL_32 0xa10 +#define UV1H_LB_BAU_MISC_CONTROL_32 0x320170UL +#define UV2H_LB_BAU_MISC_CONTROL_32 0x320170UL +#define UV3H_LB_BAU_MISC_CONTROL_32 0x320170UL + +#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 +#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 +#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 +#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 +#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 +#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 +#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 +#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 +#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 +#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 +#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 +#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 +#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 +#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 +#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 +#define UVH_LB_BAU_MISC_CONTROL_FUN_SHFT 48 +#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL +#define UVH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL +#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL +#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL +#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL +#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL +#define UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL +#define UVH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL +#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL +#define UVH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL +#define UVH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL +#define UVH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL +#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL +#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL +#define UVH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL + +#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 +#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 +#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 +#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 +#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 +#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 +#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 +#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 +#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 +#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 +#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 +#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 +#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 +#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 +#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 +#define UV1H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 +#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL +#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL +#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL +#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL +#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL +#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL +#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL +#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL +#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL +#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL +#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL +#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL +#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL +#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL +#define UV1H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL + +#define UVXH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 +#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 +#define UVXH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 +#define UVXH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 +#define UVXH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 +#define UVXH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 +#define UVXH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 +#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 +#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 +#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 +#define UVXH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 +#define UVXH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29 +#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30 +#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31 +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32 +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33 +#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34 +#define UVXH_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35 +#define UVXH_LB_BAU_MISC_CONTROL_FUN_SHFT 48 +#define UVXH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL +#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL +#define UVXH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL +#define UVXH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UVXH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL +#define UVXH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL +#define UVXH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL +#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL +#define UVXH_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL +#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL +#define UVXH_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL +#define UVXH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL +#define UVXH_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL +#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL +#define UVXH_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL +#define UVXH_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL +#define UVXH_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL +#define UVXH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL + +#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 +#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 +#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 +#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 +#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 +#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 +#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 +#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 +#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 +#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 +#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 +#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29 +#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30 +#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31 +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32 +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33 +#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34 +#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35 +#define UV2H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 +#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL +#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL +#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL +#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL +#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL +#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL +#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL +#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL +#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL +#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL +#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL +#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL +#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL +#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL +#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL +#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL +#define UV2H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL + +#define UV3H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 +#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8 +#define UV3H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9 +#define UV3H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 +#define UV3H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11 +#define UV3H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 +#define UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16 +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20 +#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21 +#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22 +#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23 +#define UV3H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24 +#define UV3H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27 +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29 +#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30 +#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31 +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32 +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33 +#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34 +#define UV3H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35 +#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_SHFT 36 +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_SHFT 37 +#define UV3H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_SHFT 38 +#define UV3H_LB_BAU_MISC_CONTROL_FUN_SHFT 48 +#define UV3H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL +#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL +#define UV3H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL +#define UV3H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL +#define UV3H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL +#define UV3H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL +#define UV3H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL +#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL +#define UV3H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL +#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL +#define UV3H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL +#define UV3H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL +#define UV3H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL +#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL +#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL +#define UV3H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL +#define UV3H_LB_BAU_MISC_CONTROL_SUPPRESS_QUIESCE_MSGS_TO_QPI_MASK 0x0000001000000000UL +#define UV3H_LB_BAU_MISC_CONTROL_ENABLE_INTD_PREFETCH_HINT_MASK 0x0000002000000000UL +#define UV3H_LB_BAU_MISC_CONTROL_THREAD_KILL_TIMEBASE_MASK 0x00003fc000000000UL +#define UV3H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL + +union uvh_lb_bau_misc_control_u { + unsigned long v; + struct uvh_lb_bau_misc_control_s { + unsigned long rejection_delay:8; /* RW */ + unsigned long apic_mode:1; /* RW */ + unsigned long force_broadcast:1; /* RW */ + unsigned long force_lock_nop:1; /* RW */ + unsigned long qpi_agent_presence_vector:3; /* RW */ + unsigned long descriptor_fetch_mode:1; /* RW */ + unsigned long enable_intd_soft_ack_mode:1; /* RW */ + unsigned long intd_soft_ack_timeout_period:4; /* RW */ + unsigned long enable_dual_mapping_mode:1; /* RW */ + unsigned long vga_io_port_decode_enable:1; /* RW */ + unsigned long vga_io_port_16_bit_decode:1; /* RW */ + unsigned long suppress_dest_registration:1; /* RW */ + unsigned long programmed_initial_priority:3; /* RW */ + unsigned long use_incoming_priority:1; /* RW */ + unsigned long enable_programmed_initial_priority:1;/* RW */ + unsigned long rsvd_29_47:19; + unsigned long fun:16; /* RW */ + } s; + struct uv1h_lb_bau_misc_control_s { + unsigned long rejection_delay:8; /* RW */ + unsigned long apic_mode:1; /* RW */ + unsigned long force_broadcast:1; /* RW */ + unsigned long force_lock_nop:1; /* RW */ + unsigned long qpi_agent_presence_vector:3; /* RW */ + unsigned long descriptor_fetch_mode:1; /* RW */ + unsigned long enable_intd_soft_ack_mode:1; /* RW */ + unsigned long intd_soft_ack_timeout_period:4; /* RW */ + unsigned long enable_dual_mapping_mode:1; /* RW */ + unsigned long vga_io_port_decode_enable:1; /* RW */ + unsigned long vga_io_port_16_bit_decode:1; /* RW */ + unsigned long suppress_dest_registration:1; /* RW */ + unsigned long programmed_initial_priority:3; /* RW */ + unsigned long use_incoming_priority:1; /* RW */ + unsigned long enable_programmed_initial_priority:1;/* RW */ + unsigned long rsvd_29_47:19; + unsigned long fun:16; /* RW */ + } s1; + struct uvxh_lb_bau_misc_control_s { + unsigned long rejection_delay:8; /* RW */ + unsigned long apic_mode:1; /* RW */ + unsigned long force_broadcast:1; /* RW */ + unsigned long force_lock_nop:1; /* RW */ + unsigned long qpi_agent_presence_vector:3; /* RW */ + unsigned long descriptor_fetch_mode:1; /* RW */ + unsigned long enable_intd_soft_ack_mode:1; /* RW */ + unsigned long intd_soft_ack_timeout_period:4; /* RW */ + unsigned long enable_dual_mapping_mode:1; /* RW */ + unsigned long vga_io_port_decode_enable:1; /* RW */ + unsigned long vga_io_port_16_bit_decode:1; /* RW */ + unsigned long suppress_dest_registration:1; /* RW */ + unsigned long programmed_initial_priority:3; /* RW */ + unsigned long use_incoming_priority:1; /* RW */ + unsigned long enable_programmed_initial_priority:1;/* RW */ + unsigned long enable_automatic_apic_mode_selection:1;/* RW */ + unsigned long apic_mode_status:1; /* RO */ + unsigned long suppress_interrupts_to_self:1; /* RW */ + unsigned long enable_lock_based_system_flush:1;/* RW */ + unsigned long enable_extended_sb_status:1; /* RW */ + unsigned long suppress_int_prio_udt_to_self:1;/* RW */ + unsigned long use_legacy_descriptor_formats:1;/* RW */ + unsigned long rsvd_36_47:12; + unsigned long fun:16; /* RW */ + } sx; + struct uv2h_lb_bau_misc_control_s { + unsigned long rejection_delay:8; /* RW */ + unsigned long apic_mode:1; /* RW */ + unsigned long force_broadcast:1; /* RW */ + unsigned long force_lock_nop:1; /* RW */ + unsigned long qpi_agent_presence_vector:3; /* RW */ + unsigned long descriptor_fetch_mode:1; /* RW */ + unsigned long enable_intd_soft_ack_mode:1; /* RW */ + unsigned long intd_soft_ack_timeout_period:4; /* RW */ + unsigned long enable_dual_mapping_mode:1; /* RW */ + unsigned long vga_io_port_decode_enable:1; /* RW */ + unsigned long vga_io_port_16_bit_decode:1; /* RW */ + unsigned long suppress_dest_registration:1; /* RW */ + unsigned long programmed_initial_priority:3; /* RW */ + unsigned long use_incoming_priority:1; /* RW */ + unsigned long enable_programmed_initial_priority:1;/* RW */ + unsigned long enable_automatic_apic_mode_selection:1;/* RW */ + unsigned long apic_mode_status:1; /* RO */ + unsigned long suppress_interrupts_to_self:1; /* RW */ + unsigned long enable_lock_based_system_flush:1;/* RW */ + unsigned long enable_extended_sb_status:1; /* RW */ + unsigned long suppress_int_prio_udt_to_self:1;/* RW */ + unsigned long use_legacy_descriptor_formats:1;/* RW */ + unsigned long rsvd_36_47:12; + unsigned long fun:16; /* RW */ + } s2; + struct uv3h_lb_bau_misc_control_s { + unsigned long rejection_delay:8; /* RW */ + unsigned long apic_mode:1; /* RW */ + unsigned long force_broadcast:1; /* RW */ + unsigned long force_lock_nop:1; /* RW */ + unsigned long qpi_agent_presence_vector:3; /* RW */ + unsigned long descriptor_fetch_mode:1; /* RW */ + unsigned long enable_intd_soft_ack_mode:1; /* RW */ + unsigned long intd_soft_ack_timeout_period:4; /* RW */ + unsigned long enable_dual_mapping_mode:1; /* RW */ + unsigned long vga_io_port_decode_enable:1; /* RW */ + unsigned long vga_io_port_16_bit_decode:1; /* RW */ + unsigned long suppress_dest_registration:1; /* RW */ + unsigned long programmed_initial_priority:3; /* RW */ + unsigned long use_incoming_priority:1; /* RW */ + unsigned long enable_programmed_initial_priority:1;/* RW */ + unsigned long enable_automatic_apic_mode_selection:1;/* RW */ + unsigned long apic_mode_status:1; /* RO */ + unsigned long suppress_interrupts_to_self:1; /* RW */ + unsigned long enable_lock_based_system_flush:1;/* RW */ + unsigned long enable_extended_sb_status:1; /* RW */ + unsigned long suppress_int_prio_udt_to_self:1;/* RW */ + unsigned long use_legacy_descriptor_formats:1;/* RW */ + unsigned long suppress_quiesce_msgs_to_qpi:1; /* RW */ + unsigned long enable_intd_prefetch_hint:1; /* RW */ + unsigned long thread_kill_timebase:8; /* RW */ + unsigned long rsvd_46_47:2; + unsigned long fun:16; /* RW */ + } s3; +}; /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_CONTROL */ /* ========================================================================= */ #define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x009a8 +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8 -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0 -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62 -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63 -#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0 +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62 +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63 +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL +#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL union uvh_lb_bau_sb_activation_control_u { - unsigned long v; - struct uvh_lb_bau_sb_activation_control_s { - unsigned long index : 6; /* RW */ - unsigned long rsvd_6_61: 56; /* */ - unsigned long push : 1; /* WP */ - unsigned long init : 1; /* WP */ - } s; + unsigned long v; + struct uvh_lb_bau_sb_activation_control_s { + unsigned long index:6; /* RW */ + unsigned long rsvd_6_61:56; + unsigned long push:1; /* WP */ + unsigned long init:1; /* WP */ + } s; }; /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */ /* ========================================================================= */ #define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x009b0 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0 -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0 -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL union uvh_lb_bau_sb_activation_status_0_u { - unsigned long v; - struct uvh_lb_bau_sb_activation_status_0_s { - unsigned long status : 64; /* RW */ - } s; + unsigned long v; + struct uvh_lb_bau_sb_activation_status_0_s { + unsigned long status:64; /* RW */ + } s; }; /* ========================================================================= */ /* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */ /* ========================================================================= */ #define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x009b8 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8 -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0 -#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0 +#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL union uvh_lb_bau_sb_activation_status_1_u { - unsigned long v; - struct uvh_lb_bau_sb_activation_status_1_s { - unsigned long status : 64; /* RW */ - } s; + unsigned long v; + struct uvh_lb_bau_sb_activation_status_1_s { + unsigned long status:64; /* RW */ + } s; }; /* ========================================================================= */ /* UVH_LB_BAU_SB_DESCRIPTOR_BASE */ /* ========================================================================= */ #define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x009a0 +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0 -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12 -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 -#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12 +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49 +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL +#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL union uvh_lb_bau_sb_descriptor_base_u { - unsigned long v; - struct uvh_lb_bau_sb_descriptor_base_s { - unsigned long rsvd_0_11 : 12; /* */ - unsigned long page_address : 31; /* RW */ - unsigned long rsvd_43_48 : 6; /* */ - unsigned long node_id : 14; /* RW */ - unsigned long rsvd_63 : 1; /* */ - } s; -}; - -/* ========================================================================= */ -/* UVH_LB_MCAST_AOERR0_RPT_ENABLE */ -/* ========================================================================= */ -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE 0x50b20UL - -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_SHFT 0 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_MASK 0x0000000000000001UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_SHFT 1 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_MASK 0x0000000000000002UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_SHFT 2 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_MASK 0x0000000000000004UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_SHFT 3 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_MASK 0x0000000000000008UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_SHFT 4 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_MASK 0x0000000000000010UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_SHFT 5 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_MASK 0x0000000000000020UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_SHFT 6 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_MASK 0x0000000000000040UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_SHFT 7 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_MASK 0x0000000000000080UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_SHFT 8 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_MASK 0x0000000000000100UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_SHFT 9 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_MASK 0x0000000000000200UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_SHFT 10 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_MASK 0x0000000000000400UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_SHFT 11 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_MASK 0x0000000000000800UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_SHFT 12 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_MASK 0x0000000000001000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_SHFT 13 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_MASK 0x0000000000002000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_SHFT 14 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_MASK 0x0000000000004000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_SHFT 15 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_MASK 0x0000000000008000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_SHFT 16 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_MASK 0x0000000000010000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_SHFT 17 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_MASK 0x0000000000020000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_SHFT 18 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_MASK 0x0000000000040000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_SHFT 19 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_MASK 0x0000000000080000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_SHFT 20 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_MASK 0x0000000000100000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_SHFT 21 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_MASK 0x0000000000200000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_SHFT 22 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_MASK 0x0000000000400000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_SHFT 23 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_MASK 0x0000000000800000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_SHFT 24 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_MASK 0x0000000001000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_SHFT 25 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_MASK 0x0000000002000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_SHFT 26 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_MASK 0x0000000004000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_SHFT 27 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_MASK 0x0000000008000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_SHFT 28 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_MASK 0x0000000010000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_SHFT 29 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_MASK 0x0000000020000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_SHFT 30 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_MASK 0x0000000040000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_SHFT 31 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_MASK 0x0000000080000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_SHFT 32 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_MASK 0x0000000100000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_SHFT 33 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_MASK 0x0000000200000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_SHFT 34 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_MASK 0x0000000400000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_SHFT 35 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_MASK 0x0000000800000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_SHFT 36 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_MASK 0x0000001000000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_SHFT 37 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_MASK 0x0000002000000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_SHFT 38 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_MASK 0x0000004000000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_SHFT 39 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_MASK 0x0000008000000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_SHFT 40 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_MASK 0x0000010000000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_SHFT 41 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_MASK 0x0000020000000000UL -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_SHFT 42 -#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_MASK 0x0000040000000000UL - -union uvh_lb_mcast_aoerr0_rpt_enable_u { - unsigned long v; - struct uvh_lb_mcast_aoerr0_rpt_enable_s { - unsigned long mcast_obese_msg : 1; /* RW */ - unsigned long mcast_data_sb_err : 1; /* RW */ - unsigned long mcast_nack_buff_parity : 1; /* RW */ - unsigned long mcast_timeout : 1; /* RW */ - unsigned long mcast_inactive_reply : 1; /* RW */ - unsigned long mcast_upgrade_error : 1; /* RW */ - unsigned long mcast_reg_count_underflow : 1; /* RW */ - unsigned long mcast_rep_obese_msg : 1; /* RW */ - unsigned long ucache_req_runt_msg : 1; /* RW */ - unsigned long ucache_req_obese_msg : 1; /* RW */ - unsigned long ucache_req_data_sb_err : 1; /* RW */ - unsigned long ucache_rep_runt_msg : 1; /* RW */ - unsigned long ucache_rep_obese_msg : 1; /* RW */ - unsigned long ucache_rep_data_sb_err : 1; /* RW */ - unsigned long ucache_rep_command_err : 1; /* RW */ - unsigned long ucache_pend_timeout : 1; /* RW */ - unsigned long macc_req_runt_msg : 1; /* RW */ - unsigned long macc_req_obese_msg : 1; /* RW */ - unsigned long macc_req_data_sb_err : 1; /* RW */ - unsigned long macc_rep_runt_msg : 1; /* RW */ - unsigned long macc_rep_obese_msg : 1; /* RW */ - unsigned long macc_rep_data_sb_err : 1; /* RW */ - unsigned long macc_amo_timeout : 1; /* RW */ - unsigned long macc_put_timeout : 1; /* RW */ - unsigned long macc_spurious_event : 1; /* RW */ - unsigned long ioh_destination_table_parity : 1; /* RW */ - unsigned long get_had_error_reply : 1; /* RW */ - unsigned long get_timeout : 1; /* RW */ - unsigned long lock_manager_had_error_reply : 1; /* RW */ - unsigned long put_had_error_reply : 1; /* RW */ - unsigned long put_timeout : 1; /* RW */ - unsigned long sb_activation_overrun : 1; /* RW */ - unsigned long completed_gb_activation_had_error_reply : 1; /* RW */ - unsigned long completed_gb_activation_timeout : 1; /* RW */ - unsigned long descriptor_buffer_0_parity : 1; /* RW */ - unsigned long descriptor_buffer_1_parity : 1; /* RW */ - unsigned long socket_destination_table_parity : 1; /* RW */ - unsigned long bau_reply_payload_corruption : 1; /* RW */ - unsigned long io_port_destination_table_parity : 1; /* RW */ - unsigned long intd_soft_ack_timeout : 1; /* RW */ - unsigned long int_rep_obese_msg : 1; /* RW */ - unsigned long int_rep_command_err : 1; /* RW */ - unsigned long int_timeout : 1; /* RW */ - unsigned long rsvd_43_63 : 21; /* */ - } s; -}; - -/* ========================================================================= */ -/* UVH_LOCAL_INT0_CONFIG */ -/* ========================================================================= */ -#define UVH_LOCAL_INT0_CONFIG 0x61000UL - -#define UVH_LOCAL_INT0_CONFIG_VECTOR_SHFT 0 -#define UVH_LOCAL_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_LOCAL_INT0_CONFIG_DM_SHFT 8 -#define UVH_LOCAL_INT0_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_LOCAL_INT0_CONFIG_DESTMODE_SHFT 11 -#define UVH_LOCAL_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_LOCAL_INT0_CONFIG_STATUS_SHFT 12 -#define UVH_LOCAL_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_LOCAL_INT0_CONFIG_P_SHFT 13 -#define UVH_LOCAL_INT0_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_LOCAL_INT0_CONFIG_T_SHFT 15 -#define UVH_LOCAL_INT0_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_LOCAL_INT0_CONFIG_M_SHFT 16 -#define UVH_LOCAL_INT0_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_LOCAL_INT0_CONFIG_APIC_ID_SHFT 32 -#define UVH_LOCAL_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL - -union uvh_local_int0_config_u { - unsigned long v; - struct uvh_local_int0_config_s { - unsigned long vector_ : 8; /* RW */ - unsigned long dm : 3; /* RW */ - unsigned long destmode : 1; /* RW */ - unsigned long status : 1; /* RO */ - unsigned long p : 1; /* RO */ - unsigned long rsvd_14 : 1; /* */ - unsigned long t : 1; /* RO */ - unsigned long m : 1; /* RW */ - unsigned long rsvd_17_31: 15; /* */ - unsigned long apic_id : 32; /* RW */ - } s; -}; - -/* ========================================================================= */ -/* UVH_LOCAL_INT0_ENABLE */ -/* ========================================================================= */ -#define UVH_LOCAL_INT0_ENABLE 0x65000UL - -#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_SHFT 0 -#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_MASK 0x0000000000000001UL -#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_SHFT 1 -#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_MASK 0x0000000000000002UL -#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_SHFT 2 -#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_MASK 0x0000000000000004UL -#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_SHFT 3 -#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_MASK 0x0000000000000008UL -#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_SHFT 4 -#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_MASK 0x0000000000000010UL -#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_SHFT 5 -#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_MASK 0x0000000000000020UL -#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_SHFT 6 -#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_MASK 0x0000000000000040UL -#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_SHFT 7 -#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_MASK 0x0000000000000080UL -#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_SHFT 8 -#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_MASK 0x0000000000000100UL -#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_SHFT 9 -#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_MASK 0x0000000000000200UL -#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_SHFT 10 -#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_MASK 0x0000000000000400UL -#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_SHFT 11 -#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_MASK 0x0000000000000800UL -#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_SHFT 12 -#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_MASK 0x0000000000001000UL -#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_SHFT 13 -#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_MASK 0x0000000000002000UL -#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_SHFT 14 -#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_MASK 0x0000000000004000UL -#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_SHFT 15 -#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_MASK 0x0000000000008000UL -#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_SHFT 16 -#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_MASK 0x0000000000010000UL -#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_SHFT 17 -#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_MASK 0x0000000000020000UL -#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_SHFT 18 -#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_MASK 0x0000000000040000UL -#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_SHFT 19 -#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_MASK 0x0000000000080000UL -#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_SHFT 20 -#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_MASK 0x0000000000100000UL -#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_SHFT 21 -#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_MASK 0x0000000000200000UL -#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_SHFT 22 -#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_SHFT 23 -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_MASK 0x0000000000800000UL -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_SHFT 24 -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_MASK 0x0000000001000000UL -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_SHFT 25 -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_MASK 0x0000000002000000UL -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_SHFT 26 -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_MASK 0x0000000004000000UL -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_SHFT 27 -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_MASK 0x0000000008000000UL -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_SHFT 28 -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_MASK 0x0000000010000000UL -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_SHFT 29 -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_MASK 0x0000000020000000UL -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_SHFT 30 -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_MASK 0x0000000040000000UL -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_SHFT 31 -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_MASK 0x0000000080000000UL -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_SHFT 32 -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_MASK 0x0000000100000000UL -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_SHFT 33 -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_MASK 0x0000000200000000UL -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_SHFT 34 -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_MASK 0x0000000400000000UL -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_SHFT 35 -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_MASK 0x0000000800000000UL -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_SHFT 36 -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_MASK 0x0000001000000000UL -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_SHFT 37 -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_MASK 0x0000002000000000UL -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_SHFT 38 -#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_MASK 0x0000004000000000UL -#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_SHFT 39 -#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_MASK 0x0000008000000000UL -#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_SHFT 40 -#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_MASK 0x0000010000000000UL -#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_SHFT 41 -#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_MASK 0x0000020000000000UL -#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_SHFT 42 -#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_MASK 0x0000040000000000UL -#define UVH_LOCAL_INT0_ENABLE_LTC_INT_SHFT 43 -#define UVH_LOCAL_INT0_ENABLE_LTC_INT_MASK 0x0000080000000000UL -#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_SHFT 44 -#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL - -union uvh_local_int0_enable_u { - unsigned long v; - struct uvh_local_int0_enable_s { - unsigned long lb_hcerr : 1; /* RW */ - unsigned long gr0_hcerr : 1; /* RW */ - unsigned long gr1_hcerr : 1; /* RW */ - unsigned long lh_hcerr : 1; /* RW */ - unsigned long rh_hcerr : 1; /* RW */ - unsigned long xn_hcerr : 1; /* RW */ - unsigned long si_hcerr : 1; /* RW */ - unsigned long lb_aoerr0 : 1; /* RW */ - unsigned long gr0_aoerr0 : 1; /* RW */ - unsigned long gr1_aoerr0 : 1; /* RW */ - unsigned long lh_aoerr0 : 1; /* RW */ - unsigned long rh_aoerr0 : 1; /* RW */ - unsigned long xn_aoerr0 : 1; /* RW */ - unsigned long si_aoerr0 : 1; /* RW */ - unsigned long lb_aoerr1 : 1; /* RW */ - unsigned long gr0_aoerr1 : 1; /* RW */ - unsigned long gr1_aoerr1 : 1; /* RW */ - unsigned long lh_aoerr1 : 1; /* RW */ - unsigned long rh_aoerr1 : 1; /* RW */ - unsigned long xn_aoerr1 : 1; /* RW */ - unsigned long si_aoerr1 : 1; /* RW */ - unsigned long rh_vpi_int : 1; /* RW */ - unsigned long system_shutdown_int : 1; /* RW */ - unsigned long lb_irq_int_0 : 1; /* RW */ - unsigned long lb_irq_int_1 : 1; /* RW */ - unsigned long lb_irq_int_2 : 1; /* RW */ - unsigned long lb_irq_int_3 : 1; /* RW */ - unsigned long lb_irq_int_4 : 1; /* RW */ - unsigned long lb_irq_int_5 : 1; /* RW */ - unsigned long lb_irq_int_6 : 1; /* RW */ - unsigned long lb_irq_int_7 : 1; /* RW */ - unsigned long lb_irq_int_8 : 1; /* RW */ - unsigned long lb_irq_int_9 : 1; /* RW */ - unsigned long lb_irq_int_10 : 1; /* RW */ - unsigned long lb_irq_int_11 : 1; /* RW */ - unsigned long lb_irq_int_12 : 1; /* RW */ - unsigned long lb_irq_int_13 : 1; /* RW */ - unsigned long lb_irq_int_14 : 1; /* RW */ - unsigned long lb_irq_int_15 : 1; /* RW */ - unsigned long l1_nmi_int : 1; /* RW */ - unsigned long stop_clock : 1; /* RW */ - unsigned long asic_to_l1 : 1; /* RW */ - unsigned long l1_to_asic : 1; /* RW */ - unsigned long ltc_int : 1; /* RW */ - unsigned long la_seq_trigger : 1; /* RW */ - unsigned long rsvd_45_63 : 19; /* */ - } s; + unsigned long v; + struct uvh_lb_bau_sb_descriptor_base_s { + unsigned long rsvd_0_11:12; + unsigned long page_address:31; /* RW */ + unsigned long rsvd_43_48:6; + unsigned long node_id:14; /* RW */ + unsigned long rsvd_63:1; + } s; }; /* ========================================================================= */ /* UVH_NODE_ID */ /* ========================================================================= */ #define UVH_NODE_ID 0x0UL - -#define UVH_NODE_ID_FORCE1_SHFT 0 -#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL -#define UVH_NODE_ID_MANUFACTURER_SHFT 1 -#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL -#define UVH_NODE_ID_PART_NUMBER_SHFT 12 -#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL -#define UVH_NODE_ID_REVISION_SHFT 28 -#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL -#define UVH_NODE_ID_NODE_ID_SHFT 32 -#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL -#define UVH_NODE_ID_NODES_PER_BIT_SHFT 48 -#define UVH_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL -#define UVH_NODE_ID_NI_PORT_SHFT 56 -#define UVH_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL +#define UV1H_NODE_ID 0x0UL +#define UV2H_NODE_ID 0x0UL +#define UV3H_NODE_ID 0x0UL + +#define UVH_NODE_ID_FORCE1_SHFT 0 +#define UVH_NODE_ID_MANUFACTURER_SHFT 1 +#define UVH_NODE_ID_PART_NUMBER_SHFT 12 +#define UVH_NODE_ID_REVISION_SHFT 28 +#define UVH_NODE_ID_NODE_ID_SHFT 32 +#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL +#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL +#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL +#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL +#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL + +#define UV1H_NODE_ID_FORCE1_SHFT 0 +#define UV1H_NODE_ID_MANUFACTURER_SHFT 1 +#define UV1H_NODE_ID_PART_NUMBER_SHFT 12 +#define UV1H_NODE_ID_REVISION_SHFT 28 +#define UV1H_NODE_ID_NODE_ID_SHFT 32 +#define UV1H_NODE_ID_NODES_PER_BIT_SHFT 48 +#define UV1H_NODE_ID_NI_PORT_SHFT 56 +#define UV1H_NODE_ID_FORCE1_MASK 0x0000000000000001UL +#define UV1H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL +#define UV1H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL +#define UV1H_NODE_ID_REVISION_MASK 0x00000000f0000000UL +#define UV1H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL +#define UV1H_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL +#define UV1H_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL + +#define UVXH_NODE_ID_FORCE1_SHFT 0 +#define UVXH_NODE_ID_MANUFACTURER_SHFT 1 +#define UVXH_NODE_ID_PART_NUMBER_SHFT 12 +#define UVXH_NODE_ID_REVISION_SHFT 28 +#define UVXH_NODE_ID_NODE_ID_SHFT 32 +#define UVXH_NODE_ID_NODES_PER_BIT_SHFT 50 +#define UVXH_NODE_ID_NI_PORT_SHFT 57 +#define UVXH_NODE_ID_FORCE1_MASK 0x0000000000000001UL +#define UVXH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL +#define UVXH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL +#define UVXH_NODE_ID_REVISION_MASK 0x00000000f0000000UL +#define UVXH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL +#define UVXH_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL +#define UVXH_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL + +#define UV2H_NODE_ID_FORCE1_SHFT 0 +#define UV2H_NODE_ID_MANUFACTURER_SHFT 1 +#define UV2H_NODE_ID_PART_NUMBER_SHFT 12 +#define UV2H_NODE_ID_REVISION_SHFT 28 +#define UV2H_NODE_ID_NODE_ID_SHFT 32 +#define UV2H_NODE_ID_NODES_PER_BIT_SHFT 50 +#define UV2H_NODE_ID_NI_PORT_SHFT 57 +#define UV2H_NODE_ID_FORCE1_MASK 0x0000000000000001UL +#define UV2H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL +#define UV2H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL +#define UV2H_NODE_ID_REVISION_MASK 0x00000000f0000000UL +#define UV2H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL +#define UV2H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL +#define UV2H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL + +#define UV3H_NODE_ID_FORCE1_SHFT 0 +#define UV3H_NODE_ID_MANUFACTURER_SHFT 1 +#define UV3H_NODE_ID_PART_NUMBER_SHFT 12 +#define UV3H_NODE_ID_REVISION_SHFT 28 +#define UV3H_NODE_ID_NODE_ID_SHFT 32 +#define UV3H_NODE_ID_ROUTER_SELECT_SHFT 48 +#define UV3H_NODE_ID_RESERVED_2_SHFT 49 +#define UV3H_NODE_ID_NODES_PER_BIT_SHFT 50 +#define UV3H_NODE_ID_NI_PORT_SHFT 57 +#define UV3H_NODE_ID_FORCE1_MASK 0x0000000000000001UL +#define UV3H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL +#define UV3H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL +#define UV3H_NODE_ID_REVISION_MASK 0x00000000f0000000UL +#define UV3H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL +#define UV3H_NODE_ID_ROUTER_SELECT_MASK 0x0001000000000000UL +#define UV3H_NODE_ID_RESERVED_2_MASK 0x0002000000000000UL +#define UV3H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL +#define UV3H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL union uvh_node_id_u { - unsigned long v; - struct uvh_node_id_s { - unsigned long force1 : 1; /* RO */ - unsigned long manufacturer : 11; /* RO */ - unsigned long part_number : 16; /* RO */ - unsigned long revision : 4; /* RO */ - unsigned long node_id : 15; /* RW */ - unsigned long rsvd_47 : 1; /* */ - unsigned long nodes_per_bit : 7; /* RW */ - unsigned long rsvd_55 : 1; /* */ - unsigned long ni_port : 4; /* RO */ - unsigned long rsvd_60_63 : 4; /* */ - } s; + unsigned long v; + struct uvh_node_id_s { + unsigned long force1:1; /* RO */ + unsigned long manufacturer:11; /* RO */ + unsigned long part_number:16; /* RO */ + unsigned long revision:4; /* RO */ + unsigned long node_id:15; /* RW */ + unsigned long rsvd_47_63:17; + } s; + struct uv1h_node_id_s { + unsigned long force1:1; /* RO */ + unsigned long manufacturer:11; /* RO */ + unsigned long part_number:16; /* RO */ + unsigned long revision:4; /* RO */ + unsigned long node_id:15; /* RW */ + unsigned long rsvd_47:1; + unsigned long nodes_per_bit:7; /* RW */ + unsigned long rsvd_55:1; + unsigned long ni_port:4; /* RO */ + unsigned long rsvd_60_63:4; + } s1; + struct uvxh_node_id_s { + unsigned long force1:1; /* RO */ + unsigned long manufacturer:11; /* RO */ + unsigned long part_number:16; /* RO */ + unsigned long revision:4; /* RO */ + unsigned long node_id:15; /* RW */ + unsigned long rsvd_47_49:3; + unsigned long nodes_per_bit:7; /* RO */ + unsigned long ni_port:5; /* RO */ + unsigned long rsvd_62_63:2; + } sx; + struct uv2h_node_id_s { + unsigned long force1:1; /* RO */ + unsigned long manufacturer:11; /* RO */ + unsigned long part_number:16; /* RO */ + unsigned long revision:4; /* RO */ + unsigned long node_id:15; /* RW */ + unsigned long rsvd_47_49:3; + unsigned long nodes_per_bit:7; /* RO */ + unsigned long ni_port:5; /* RO */ + unsigned long rsvd_62_63:2; + } s2; + struct uv3h_node_id_s { + unsigned long force1:1; /* RO */ + unsigned long manufacturer:11; /* RO */ + unsigned long part_number:16; /* RO */ + unsigned long revision:4; /* RO */ + unsigned long node_id:15; /* RW */ + unsigned long rsvd_47:1; + unsigned long router_select:1; /* RO */ + unsigned long rsvd_49:1; + unsigned long nodes_per_bit:7; /* RO */ + unsigned long ni_port:5; /* RO */ + unsigned long rsvd_62_63:2; + } s3; }; /* ========================================================================= */ @@ -1050,14 +2128,86 @@ union uvh_node_id_u { #define UVH_NODE_PRESENT_TABLE 0x1400UL #define UVH_NODE_PRESENT_TABLE_DEPTH 16 -#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0 -#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL +#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0 +#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL union uvh_node_present_table_u { - unsigned long v; - struct uvh_node_present_table_s { - unsigned long nodes : 64; /* RW */ - } s; + unsigned long v; + struct uvh_node_present_table_s { + unsigned long nodes:64; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL + +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL + +union uvh_rh_gam_alias210_overlay_config_0_mmr_u { + unsigned long v; + struct uvh_rh_gam_alias210_overlay_config_0_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL + +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL + +union uvh_rh_gam_alias210_overlay_config_1_mmr_u { + unsigned long v; + struct uvh_rh_gam_alias210_overlay_config_1_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL + +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL +#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL + +union uvh_rh_gam_alias210_overlay_config_2_mmr_u { + unsigned long v; + struct uvh_rh_gam_alias210_overlay_config_2_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long base:8; /* RW */ + unsigned long rsvd_32_47:16; + unsigned long m_alias:5; /* RW */ + unsigned long rsvd_53_62:10; + unsigned long enable:1; /* RW */ + } s; }; /* ========================================================================= */ @@ -1069,12 +2219,12 @@ union uvh_node_present_table_u { #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL union uvh_rh_gam_alias210_redirect_config_0_mmr_u { - unsigned long v; - struct uvh_rh_gam_alias210_redirect_config_0_mmr_s { - unsigned long rsvd_0_23 : 24; /* */ - unsigned long dest_base : 22; /* RW */ - unsigned long rsvd_46_63: 18; /* */ - } s; + unsigned long v; + struct uvh_rh_gam_alias210_redirect_config_0_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s; }; /* ========================================================================= */ @@ -1086,12 +2236,12 @@ union uvh_rh_gam_alias210_redirect_confi #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL union uvh_rh_gam_alias210_redirect_config_1_mmr_u { - unsigned long v; - struct uvh_rh_gam_alias210_redirect_config_1_mmr_s { - unsigned long rsvd_0_23 : 24; /* */ - unsigned long dest_base : 22; /* RW */ - unsigned long rsvd_46_63: 18; /* */ - } s; + unsigned long v; + struct uvh_rh_gam_alias210_redirect_config_1_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s; }; /* ========================================================================= */ @@ -1103,109 +2253,286 @@ union uvh_rh_gam_alias210_redirect_confi #define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL union uvh_rh_gam_alias210_redirect_config_2_mmr_u { - unsigned long v; - struct uvh_rh_gam_alias210_redirect_config_2_mmr_s { - unsigned long rsvd_0_23 : 24; /* */ - unsigned long dest_base : 22; /* RW */ - unsigned long rsvd_46_63: 18; /* */ - } s; -}; - -/* ========================================================================= */ -/* UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR */ -/* ========================================================================= */ -#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR 0x1600020UL - -#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT 26 -#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL -#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL - -union uvh_rh_gam_cfg_overlay_config_mmr_u { - unsigned long v; - struct uvh_rh_gam_cfg_overlay_config_mmr_s { - unsigned long rsvd_0_25: 26; /* */ - unsigned long base : 20; /* RW */ - unsigned long rsvd_46_62: 17; /* */ - unsigned long enable : 1; /* RW */ - } s; + unsigned long v; + struct uvh_rh_gam_alias210_redirect_config_2_mmr_s { + unsigned long rsvd_0_23:24; + unsigned long dest_base:22; /* RW */ + unsigned long rsvd_46_63:18; + } s; +}; + +/* ========================================================================= */ +/* UVH_RH_GAM_CONFIG_MMR */ +/* ========================================================================= */ +#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL +#define UV1H_RH_GAM_CONFIG_MMR 0x1600000UL +#define UV2H_RH_GAM_CONFIG_MMR 0x1600000UL +#define UV3H_RH_GAM_CONFIG_MMR 0x1600000UL + +#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 +#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 +#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL +#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL + +#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 +#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 +#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12 +#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL +#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL +#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL + +#define UVXH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 +#define UVXH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 +#define UVXH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL +#define UVXH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL + +#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 +#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 +#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL +#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL + +#define UV3H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0 +#define UV3H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 +#define UV3H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL +#define UV3H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL + +union uvh_rh_gam_config_mmr_u { + unsigned long v; + struct uvh_rh_gam_config_mmr_s { + unsigned long m_skt:6; /* RW */ + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_63:54; + } s; + struct uv1h_rh_gam_config_mmr_s { + unsigned long m_skt:6; /* RW */ + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_11:2; + unsigned long mmiol_cfg:1; /* RW */ + unsigned long rsvd_13_63:51; + } s1; + struct uvxh_rh_gam_config_mmr_s { + unsigned long m_skt:6; /* RW */ + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_63:54; + } sx; + struct uv2h_rh_gam_config_mmr_s { + unsigned long m_skt:6; /* RW */ + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_63:54; + } s2; + struct uv3h_rh_gam_config_mmr_s { + unsigned long m_skt:6; /* RW */ + unsigned long n_skt:4; /* RW */ + unsigned long rsvd_10_63:54; + } s3; }; /* ========================================================================= */ /* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ #define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL - -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48 -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL + +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL +#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48 +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL +#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 +#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 +#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL +#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL +#define UVXH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL +#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_MODE_SHFT 62 +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_MODE_MASK 0x4000000000000000UL +#define UV3H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL union uvh_rh_gam_gru_overlay_config_mmr_u { - unsigned long v; - struct uvh_rh_gam_gru_overlay_config_mmr_s { - unsigned long rsvd_0_27: 28; /* */ - unsigned long base : 18; /* RW */ - unsigned long rsvd_46_47: 2; /* */ - unsigned long gr4 : 1; /* RW */ - unsigned long rsvd_49_51: 3; /* */ - unsigned long n_gru : 4; /* RW */ - unsigned long rsvd_56_62: 7; /* */ - unsigned long enable : 1; /* RW */ - } s; + unsigned long v; + struct uvh_rh_gam_gru_overlay_config_mmr_s { + unsigned long rsvd_0_27:28; + unsigned long base:18; /* RW */ + unsigned long rsvd_46_51:6; + unsigned long n_gru:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s; + struct uv1h_rh_gam_gru_overlay_config_mmr_s { + unsigned long rsvd_0_27:28; + unsigned long base:18; /* RW */ + unsigned long rsvd_46_47:2; + unsigned long gr4:1; /* RW */ + unsigned long rsvd_49_51:3; + unsigned long n_gru:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s1; + struct uvxh_rh_gam_gru_overlay_config_mmr_s { + unsigned long rsvd_0_27:28; + unsigned long base:18; /* RW */ + unsigned long rsvd_46_51:6; + unsigned long n_gru:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } sx; + struct uv2h_rh_gam_gru_overlay_config_mmr_s { + unsigned long rsvd_0_27:28; + unsigned long base:18; /* RW */ + unsigned long rsvd_46_51:6; + unsigned long n_gru:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s2; + struct uv3h_rh_gam_gru_overlay_config_mmr_s { + unsigned long rsvd_0_27:28; + unsigned long base:18; /* RW */ + unsigned long rsvd_46_51:6; + unsigned long n_gru:4; /* RW */ + unsigned long rsvd_56_61:6; + unsigned long mode:1; /* RW */ + unsigned long enable:1; /* RW */ + } s3; }; /* ========================================================================= */ /* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL +#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27 +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff8000000UL +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL +#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL union uvh_rh_gam_mmioh_overlay_config_mmr_u { - unsigned long v; - struct uvh_rh_gam_mmioh_overlay_config_mmr_s { - unsigned long rsvd_0_29: 30; /* */ - unsigned long base : 16; /* RW */ - unsigned long m_io : 6; /* RW */ - unsigned long n_io : 4; /* RW */ - unsigned long rsvd_56_62: 7; /* */ - unsigned long enable : 1; /* RW */ - } s; + unsigned long v; + struct uv1h_rh_gam_mmioh_overlay_config_mmr_s { + unsigned long rsvd_0_29:30; + unsigned long base:16; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s1; + struct uv2h_rh_gam_mmioh_overlay_config_mmr_s { + unsigned long rsvd_0_26:27; + unsigned long base:19; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; /* RW */ + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s2; }; /* ========================================================================= */ /* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */ /* ========================================================================= */ #define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL - -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46 -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 -#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL +#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL +#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL + +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46 +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL +#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UVXH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL + +#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 +#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 +#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV3H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL union uvh_rh_gam_mmr_overlay_config_mmr_u { - unsigned long v; - struct uvh_rh_gam_mmr_overlay_config_mmr_s { - unsigned long rsvd_0_25: 26; /* */ - unsigned long base : 20; /* RW */ - unsigned long dual_hub : 1; /* RW */ - unsigned long rsvd_47_62: 16; /* */ - unsigned long enable : 1; /* RW */ - } s; + unsigned long v; + struct uvh_rh_gam_mmr_overlay_config_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long rsvd_46_62:17; + unsigned long enable:1; /* RW */ + } s; + struct uv1h_rh_gam_mmr_overlay_config_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long dual_hub:1; /* RW */ + unsigned long rsvd_47_62:16; + unsigned long enable:1; /* RW */ + } s1; + struct uvxh_rh_gam_mmr_overlay_config_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long rsvd_46_62:17; + unsigned long enable:1; /* RW */ + } sx; + struct uv2h_rh_gam_mmr_overlay_config_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long rsvd_46_62:17; + unsigned long enable:1; /* RW */ + } s2; + struct uv3h_rh_gam_mmr_overlay_config_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long rsvd_46_62:17; + unsigned long enable:1; /* RW */ + } s3; }; /* ========================================================================= */ @@ -1213,15 +2540,15 @@ union uvh_rh_gam_mmr_overlay_config_mmr_ /* ========================================================================= */ #define UVH_RTC 0x340000UL -#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0 -#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL +#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0 +#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL union uvh_rtc_u { - unsigned long v; - struct uvh_rtc_s { - unsigned long real_time_clock : 56; /* RW */ - unsigned long rsvd_56_63 : 8; /* */ - } s; + unsigned long v; + struct uvh_rtc_s { + unsigned long real_time_clock:56; /* RW */ + unsigned long rsvd_56_63:8; + } s; }; /* ========================================================================= */ @@ -1229,224 +2556,341 @@ union uvh_rtc_u { /* ========================================================================= */ #define UVH_RTC1_INT_CONFIG 0x615c0UL -#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0 -#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_RTC1_INT_CONFIG_DM_SHFT 8 -#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11 -#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12 -#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_RTC1_INT_CONFIG_P_SHFT 13 -#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_RTC1_INT_CONFIG_T_SHFT 15 -#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_RTC1_INT_CONFIG_M_SHFT 16 -#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32 -#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL +#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0 +#define UVH_RTC1_INT_CONFIG_DM_SHFT 8 +#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11 +#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12 +#define UVH_RTC1_INT_CONFIG_P_SHFT 13 +#define UVH_RTC1_INT_CONFIG_T_SHFT 15 +#define UVH_RTC1_INT_CONFIG_M_SHFT 16 +#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32 +#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL +#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL +#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL +#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL +#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL +#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL +#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL +#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL union uvh_rtc1_int_config_u { - unsigned long v; - struct uvh_rtc1_int_config_s { - unsigned long vector_ : 8; /* RW */ - unsigned long dm : 3; /* RW */ - unsigned long destmode : 1; /* RW */ - unsigned long status : 1; /* RO */ - unsigned long p : 1; /* RO */ - unsigned long rsvd_14 : 1; /* */ - unsigned long t : 1; /* RO */ - unsigned long m : 1; /* RW */ - unsigned long rsvd_17_31: 15; /* */ - unsigned long apic_id : 32; /* RW */ - } s; -}; - -/* ========================================================================= */ -/* UVH_RTC2_INT_CONFIG */ -/* ========================================================================= */ -#define UVH_RTC2_INT_CONFIG 0x61600UL - -#define UVH_RTC2_INT_CONFIG_VECTOR_SHFT 0 -#define UVH_RTC2_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_RTC2_INT_CONFIG_DM_SHFT 8 -#define UVH_RTC2_INT_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_RTC2_INT_CONFIG_DESTMODE_SHFT 11 -#define UVH_RTC2_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_RTC2_INT_CONFIG_STATUS_SHFT 12 -#define UVH_RTC2_INT_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_RTC2_INT_CONFIG_P_SHFT 13 -#define UVH_RTC2_INT_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_RTC2_INT_CONFIG_T_SHFT 15 -#define UVH_RTC2_INT_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_RTC2_INT_CONFIG_M_SHFT 16 -#define UVH_RTC2_INT_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_RTC2_INT_CONFIG_APIC_ID_SHFT 32 -#define UVH_RTC2_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL - -union uvh_rtc2_int_config_u { - unsigned long v; - struct uvh_rtc2_int_config_s { - unsigned long vector_ : 8; /* RW */ - unsigned long dm : 3; /* RW */ - unsigned long destmode : 1; /* RW */ - unsigned long status : 1; /* RO */ - unsigned long p : 1; /* RO */ - unsigned long rsvd_14 : 1; /* */ - unsigned long t : 1; /* RO */ - unsigned long m : 1; /* RW */ - unsigned long rsvd_17_31: 15; /* */ - unsigned long apic_id : 32; /* RW */ - } s; -}; - -/* ========================================================================= */ -/* UVH_RTC3_INT_CONFIG */ -/* ========================================================================= */ -#define UVH_RTC3_INT_CONFIG 0x61640UL - -#define UVH_RTC3_INT_CONFIG_VECTOR_SHFT 0 -#define UVH_RTC3_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL -#define UVH_RTC3_INT_CONFIG_DM_SHFT 8 -#define UVH_RTC3_INT_CONFIG_DM_MASK 0x0000000000000700UL -#define UVH_RTC3_INT_CONFIG_DESTMODE_SHFT 11 -#define UVH_RTC3_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL -#define UVH_RTC3_INT_CONFIG_STATUS_SHFT 12 -#define UVH_RTC3_INT_CONFIG_STATUS_MASK 0x0000000000001000UL -#define UVH_RTC3_INT_CONFIG_P_SHFT 13 -#define UVH_RTC3_INT_CONFIG_P_MASK 0x0000000000002000UL -#define UVH_RTC3_INT_CONFIG_T_SHFT 15 -#define UVH_RTC3_INT_CONFIG_T_MASK 0x0000000000008000UL -#define UVH_RTC3_INT_CONFIG_M_SHFT 16 -#define UVH_RTC3_INT_CONFIG_M_MASK 0x0000000000010000UL -#define UVH_RTC3_INT_CONFIG_APIC_ID_SHFT 32 -#define UVH_RTC3_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL - -union uvh_rtc3_int_config_u { - unsigned long v; - struct uvh_rtc3_int_config_s { - unsigned long vector_ : 8; /* RW */ - unsigned long dm : 3; /* RW */ - unsigned long destmode : 1; /* RW */ - unsigned long status : 1; /* RO */ - unsigned long p : 1; /* RO */ - unsigned long rsvd_14 : 1; /* */ - unsigned long t : 1; /* RO */ - unsigned long m : 1; /* RW */ - unsigned long rsvd_17_31: 15; /* */ - unsigned long apic_id : 32; /* RW */ - } s; -}; - -/* ========================================================================= */ -/* UVH_RTC_INC_RATIO */ -/* ========================================================================= */ -#define UVH_RTC_INC_RATIO 0x350000UL - -#define UVH_RTC_INC_RATIO_FRACTION_SHFT 0 -#define UVH_RTC_INC_RATIO_FRACTION_MASK 0x00000000000fffffUL -#define UVH_RTC_INC_RATIO_RATIO_SHFT 20 -#define UVH_RTC_INC_RATIO_RATIO_MASK 0x0000000000700000UL - -union uvh_rtc_inc_ratio_u { - unsigned long v; - struct uvh_rtc_inc_ratio_s { - unsigned long fraction : 20; /* RW */ - unsigned long ratio : 3; /* RW */ - unsigned long rsvd_23_63: 41; /* */ - } s; -}; - -/* ========================================================================= */ -/* UVH_SI_ADDR_MAP_CONFIG */ -/* ========================================================================= */ -#define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL - -#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_SHFT 0 -#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL -#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_SHFT 8 -#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_MASK 0x0000000000000f00UL - -union uvh_si_addr_map_config_u { - unsigned long v; - struct uvh_si_addr_map_config_s { - unsigned long m_skt : 6; /* RW */ - unsigned long rsvd_6_7: 2; /* */ - unsigned long n_skt : 4; /* RW */ - unsigned long rsvd_12_63: 52; /* */ - } s; -}; - -/* ========================================================================= */ -/* UVH_SI_ALIAS0_OVERLAY_CONFIG */ -/* ========================================================================= */ -#define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL - -#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24 -#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL -#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48 -#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL -#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63 -#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL - -union uvh_si_alias0_overlay_config_u { - unsigned long v; - struct uvh_si_alias0_overlay_config_s { - unsigned long rsvd_0_23: 24; /* */ - unsigned long base : 8; /* RW */ - unsigned long rsvd_32_47: 16; /* */ - unsigned long m_alias : 5; /* RW */ - unsigned long rsvd_53_62: 10; /* */ - unsigned long enable : 1; /* RW */ - } s; -}; - -/* ========================================================================= */ -/* UVH_SI_ALIAS1_OVERLAY_CONFIG */ -/* ========================================================================= */ -#define UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL - -#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24 -#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL -#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48 -#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL -#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63 -#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL - -union uvh_si_alias1_overlay_config_u { - unsigned long v; - struct uvh_si_alias1_overlay_config_s { - unsigned long rsvd_0_23: 24; /* */ - unsigned long base : 8; /* RW */ - unsigned long rsvd_32_47: 16; /* */ - unsigned long m_alias : 5; /* RW */ - unsigned long rsvd_53_62: 10; /* */ - unsigned long enable : 1; /* RW */ - } s; -}; - -/* ========================================================================= */ -/* UVH_SI_ALIAS2_OVERLAY_CONFIG */ -/* ========================================================================= */ -#define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL - -#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24 -#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL -#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48 -#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL -#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63 -#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL - -union uvh_si_alias2_overlay_config_u { - unsigned long v; - struct uvh_si_alias2_overlay_config_s { - unsigned long rsvd_0_23: 24; /* */ - unsigned long base : 8; /* RW */ - unsigned long rsvd_32_47: 16; /* */ - unsigned long m_alias : 5; /* RW */ - unsigned long rsvd_53_62: 10; /* */ - unsigned long enable : 1; /* RW */ - } s; + unsigned long v; + struct uvh_rtc1_int_config_s { + unsigned long vector_:8; /* RW */ + unsigned long dm:3; /* RW */ + unsigned long destmode:1; /* RW */ + unsigned long status:1; /* RO */ + unsigned long p:1; /* RO */ + unsigned long rsvd_14:1; + unsigned long t:1; /* RO */ + unsigned long m:1; /* RW */ + unsigned long rsvd_17_31:15; + unsigned long apic_id:32; /* RW */ + } s; +}; + +/* ========================================================================= */ +/* UVH_SCRATCH5 */ +/* ========================================================================= */ +#define UVH_SCRATCH5 0x2d0200UL +#define UVH_SCRATCH5_32 0x778 + +#define UVH_SCRATCH5_SCRATCH5_SHFT 0 +#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL + +union uvh_scratch5_u { + unsigned long v; + struct uvh_scratch5_s { + unsigned long scratch5:64; /* RW, W1CS */ + } s; +}; + +/* ========================================================================= */ +/* UVXH_EVENT_OCCURRED2 */ +/* ========================================================================= */ +#define UVXH_EVENT_OCCURRED2 0x70100UL +#define UVXH_EVENT_OCCURRED2_32 0xb68 + +#define UVXH_EVENT_OCCURRED2_RTC_0_SHFT 0 +#define UVXH_EVENT_OCCURRED2_RTC_1_SHFT 1 +#define UVXH_EVENT_OCCURRED2_RTC_2_SHFT 2 +#define UVXH_EVENT_OCCURRED2_RTC_3_SHFT 3 +#define UVXH_EVENT_OCCURRED2_RTC_4_SHFT 4 +#define UVXH_EVENT_OCCURRED2_RTC_5_SHFT 5 +#define UVXH_EVENT_OCCURRED2_RTC_6_SHFT 6 +#define UVXH_EVENT_OCCURRED2_RTC_7_SHFT 7 +#define UVXH_EVENT_OCCURRED2_RTC_8_SHFT 8 +#define UVXH_EVENT_OCCURRED2_RTC_9_SHFT 9 +#define UVXH_EVENT_OCCURRED2_RTC_10_SHFT 10 +#define UVXH_EVENT_OCCURRED2_RTC_11_SHFT 11 +#define UVXH_EVENT_OCCURRED2_RTC_12_SHFT 12 +#define UVXH_EVENT_OCCURRED2_RTC_13_SHFT 13 +#define UVXH_EVENT_OCCURRED2_RTC_14_SHFT 14 +#define UVXH_EVENT_OCCURRED2_RTC_15_SHFT 15 +#define UVXH_EVENT_OCCURRED2_RTC_16_SHFT 16 +#define UVXH_EVENT_OCCURRED2_RTC_17_SHFT 17 +#define UVXH_EVENT_OCCURRED2_RTC_18_SHFT 18 +#define UVXH_EVENT_OCCURRED2_RTC_19_SHFT 19 +#define UVXH_EVENT_OCCURRED2_RTC_20_SHFT 20 +#define UVXH_EVENT_OCCURRED2_RTC_21_SHFT 21 +#define UVXH_EVENT_OCCURRED2_RTC_22_SHFT 22 +#define UVXH_EVENT_OCCURRED2_RTC_23_SHFT 23 +#define UVXH_EVENT_OCCURRED2_RTC_24_SHFT 24 +#define UVXH_EVENT_OCCURRED2_RTC_25_SHFT 25 +#define UVXH_EVENT_OCCURRED2_RTC_26_SHFT 26 +#define UVXH_EVENT_OCCURRED2_RTC_27_SHFT 27 +#define UVXH_EVENT_OCCURRED2_RTC_28_SHFT 28 +#define UVXH_EVENT_OCCURRED2_RTC_29_SHFT 29 +#define UVXH_EVENT_OCCURRED2_RTC_30_SHFT 30 +#define UVXH_EVENT_OCCURRED2_RTC_31_SHFT 31 +#define UVXH_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL +#define UVXH_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL +#define UVXH_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL +#define UVXH_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL +#define UVXH_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL +#define UVXH_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL +#define UVXH_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL +#define UVXH_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL +#define UVXH_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL +#define UVXH_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL +#define UVXH_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL +#define UVXH_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL +#define UVXH_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL +#define UVXH_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL +#define UVXH_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL +#define UVXH_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL +#define UVXH_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL +#define UVXH_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL +#define UVXH_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL +#define UVXH_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL +#define UVXH_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL +#define UVXH_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL +#define UVXH_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL +#define UVXH_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL +#define UVXH_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL +#define UVXH_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL +#define UVXH_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL +#define UVXH_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL +#define UVXH_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL +#define UVXH_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL +#define UVXH_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL +#define UVXH_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL + +union uvxh_event_occurred2_u { + unsigned long v; + struct uvxh_event_occurred2_s { + unsigned long rtc_0:1; /* RW */ + unsigned long rtc_1:1; /* RW */ + unsigned long rtc_2:1; /* RW */ + unsigned long rtc_3:1; /* RW */ + unsigned long rtc_4:1; /* RW */ + unsigned long rtc_5:1; /* RW */ + unsigned long rtc_6:1; /* RW */ + unsigned long rtc_7:1; /* RW */ + unsigned long rtc_8:1; /* RW */ + unsigned long rtc_9:1; /* RW */ + unsigned long rtc_10:1; /* RW */ + unsigned long rtc_11:1; /* RW */ + unsigned long rtc_12:1; /* RW */ + unsigned long rtc_13:1; /* RW */ + unsigned long rtc_14:1; /* RW */ + unsigned long rtc_15:1; /* RW */ + unsigned long rtc_16:1; /* RW */ + unsigned long rtc_17:1; /* RW */ + unsigned long rtc_18:1; /* RW */ + unsigned long rtc_19:1; /* RW */ + unsigned long rtc_20:1; /* RW */ + unsigned long rtc_21:1; /* RW */ + unsigned long rtc_22:1; /* RW */ + unsigned long rtc_23:1; /* RW */ + unsigned long rtc_24:1; /* RW */ + unsigned long rtc_25:1; /* RW */ + unsigned long rtc_26:1; /* RW */ + unsigned long rtc_27:1; /* RW */ + unsigned long rtc_28:1; /* RW */ + unsigned long rtc_29:1; /* RW */ + unsigned long rtc_30:1; /* RW */ + unsigned long rtc_31:1; /* RW */ + unsigned long rsvd_32_63:32; + } sx; +}; + +/* ========================================================================= */ +/* UVXH_EVENT_OCCURRED2_ALIAS */ +/* ========================================================================= */ +#define UVXH_EVENT_OCCURRED2_ALIAS 0x70108UL +#define UVXH_EVENT_OCCURRED2_ALIAS_32 0xb70 + + +/* ========================================================================= */ +/* UVXH_LB_BAU_SB_ACTIVATION_STATUS_2 */ +/* ========================================================================= */ +#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL +#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0 +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x320130UL +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x320130UL + +#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 +#define UVXH_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL + +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 +#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL + +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0 +#define UV3H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL + +union uvxh_lb_bau_sb_activation_status_2_u { + unsigned long v; + struct uvxh_lb_bau_sb_activation_status_2_s { + unsigned long aux_error:64; /* RW */ + } sx; + struct uv2h_lb_bau_sb_activation_status_2_s { + unsigned long aux_error:64; /* RW */ + } s2; + struct uv3h_lb_bau_sb_activation_status_2_s { + unsigned long aux_error:64; /* RW */ + } s3; +}; + +/* ========================================================================= */ +/* UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK */ +/* ========================================================================= */ +#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL +#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x9f0 + +#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0 +#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL + +union uv1h_lb_target_physical_apic_id_mask_u { + unsigned long v; + struct uv1h_lb_target_physical_apic_id_mask_s { + unsigned long bit_enables:32; /* RW */ + unsigned long rsvd_32_63:32; + } s1; +}; + +/* ========================================================================= */ +/* UV3H_GR0_GAM_GR_CONFIG */ +/* ========================================================================= */ +#define UV3H_GR0_GAM_GR_CONFIG 0xc00028UL + +#define UV3H_GR0_GAM_GR_CONFIG_M_SKT_SHFT 0 +#define UV3H_GR0_GAM_GR_CONFIG_SUBSPACE_SHFT 10 +#define UV3H_GR0_GAM_GR_CONFIG_M_SKT_MASK 0x000000000000003fUL +#define UV3H_GR0_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL + +union uv3h_gr0_gam_gr_config_u { + unsigned long v; + struct uv3h_gr0_gam_gr_config_s { + unsigned long m_skt:6; /* RW */ + unsigned long undef_6_9:4; /* Undefined */ + unsigned long subspace:1; /* RW */ + unsigned long reserved:53; + } s3; +}; + +/* ========================================================================= */ +/* UV3H_GR1_GAM_GR_CONFIG */ +/* ========================================================================= */ +#define UV3H_GR1_GAM_GR_CONFIG 0x1000028UL + +#define UV3H_GR1_GAM_GR_CONFIG_M_SKT_SHFT 0 +#define UV3H_GR1_GAM_GR_CONFIG_SUBSPACE_SHFT 10 +#define UV3H_GR1_GAM_GR_CONFIG_M_SKT_MASK 0x000000000000003fUL +#define UV3H_GR1_GAM_GR_CONFIG_SUBSPACE_MASK 0x0000000000000400UL + +union uv3h_gr1_gam_gr_config_u { + unsigned long v; + struct uv3h_gr1_gam_gr_config_s { + unsigned long m_skt:6; /* RW */ + unsigned long undef_6_9:4; /* Undefined */ + unsigned long subspace:1; /* RW */ + unsigned long reserved:53; + } s3; +}; + +/* ========================================================================= */ +/* UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR */ +/* ========================================================================= */ +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR 0x1603000UL + +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT 26 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_SHFT 46 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_SHFT 63 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_M_IO_MASK 0x000fc00000000000UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_ENABLE_MASK 0x8000000000000000UL + +union uv3h_rh_gam_mmioh_overlay_config0_mmr_u { + unsigned long v; + struct uv3h_rh_gam_mmioh_overlay_config0_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s3; +}; + +/* ========================================================================= */ +/* UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR */ +/* ========================================================================= */ +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR 0x1604000UL + +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_SHFT 26 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_SHFT 46 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_SHFT 63 +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_BASE_MASK 0x00003ffffc000000UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_M_IO_MASK 0x000fc00000000000UL +#define UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR_ENABLE_MASK 0x8000000000000000UL + +union uv3h_rh_gam_mmioh_overlay_config1_mmr_u { + unsigned long v; + struct uv3h_rh_gam_mmioh_overlay_config1_mmr_s { + unsigned long rsvd_0_25:26; + unsigned long base:20; /* RW */ + unsigned long m_io:6; /* RW */ + unsigned long n_io:4; + unsigned long rsvd_56_62:7; + unsigned long enable:1; /* RW */ + } s3; +}; + +/* ========================================================================= */ +/* UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR */ +/* ========================================================================= */ +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR 0x1603800UL +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH 128 + +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_SHFT 0 +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_NASID_MASK 0x0000000000007fffUL + +union uv3h_rh_gam_mmioh_redirect_config0_mmr_u { + unsigned long v; + struct uv3h_rh_gam_mmioh_redirect_config0_mmr_s { + unsigned long nasid:15; /* RW */ + unsigned long rsvd_15_63:49; + } s3; +}; + +/* ========================================================================= */ +/* UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR */ +/* ========================================================================= */ +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR 0x1604800UL +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_DEPTH 128 + +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_SHFT 0 +#define UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR_NASID_MASK 0x0000000000007fffUL + +union uv3h_rh_gam_mmioh_redirect_config1_mmr_u { + unsigned long v; + struct uv3h_rh_gam_mmioh_redirect_config1_mmr_s { + unsigned long nasid:15; /* RW */ + unsigned long rsvd_15_63:49; + } s3; }; diff -uprN linux-2.6.32/arch/x86/include/asm/vdso.h linux-2.6.32.ovz/arch/x86/include/asm/vdso.h --- linux-2.6.32/arch/x86/include/asm/vdso.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/vdso.h 2015-06-23 04:16:16.000000000 -0700 @@ -18,6 +18,7 @@ extern const char VDSO64_PRELINK[]; #if defined CONFIG_X86_32 || defined CONFIG_COMPAT extern const char VDSO32_PRELINK[]; +extern const char VDSO32_SYSENTER_RETURN[]; /* * Given a pointer to the vDSO image, find the pointer to VDSO32_name * as that symbol is defined in the vDSO sources or linker script. diff -uprN linux-2.6.32/arch/x86/include/asm/vga.h linux-2.6.32.ovz/arch/x86/include/asm/vga.h --- linux-2.6.32/arch/x86/include/asm/vga.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/vga.h 2015-03-10 13:23:31.000000000 -0700 @@ -17,4 +17,10 @@ #define vga_readb(x) (*(x)) #define vga_writeb(x, y) (*(y) = (x)) +#ifdef CONFIG_FB_EFI +#define __ARCH_HAS_VGA_DEFAULT_DEVICE +extern struct pci_dev *vga_default_device(void); +extern void vga_set_default_device(struct pci_dev *pdev); +#endif + #endif /* _ASM_X86_VGA_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/vgtod.h linux-2.6.32.ovz/arch/x86/include/asm/vgtod.h --- linux-2.6.32/arch/x86/include/asm/vgtod.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/vgtod.h 2015-06-23 04:16:16.000000000 -0700 @@ -12,6 +12,7 @@ struct vsyscall_gtod_data { u32 wall_time_nsec; int sysctl_enabled; + int gettime_monotonic_enabled; struct timezone sys_tz; struct { /* extract of a clocksource struct */ cycle_t (*vread)(void); diff -uprN linux-2.6.32/arch/x86/include/asm/vmware.h linux-2.6.32.ovz/arch/x86/include/asm/vmware.h --- linux-2.6.32/arch/x86/include/asm/vmware.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/vmware.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,27 +0,0 @@ -/* - * Copyright (C) 2008, VMware, Inc. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - */ -#ifndef ASM_X86__VMWARE_H -#define ASM_X86__VMWARE_H - -extern void vmware_platform_setup(void); -extern int vmware_platform(void); -extern void vmware_set_feature_bits(struct cpuinfo_x86 *c); - -#endif diff -uprN linux-2.6.32/arch/x86/include/asm/vmx.h linux-2.6.32.ovz/arch/x86/include/asm/vmx.h --- linux-2.6.32/arch/x86/include/asm/vmx.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/vmx.h 2015-03-10 13:24:15.000000000 -0700 @@ -25,6 +25,94 @@ * */ +#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 + +#define EXIT_REASON_EXCEPTION_NMI 0 +#define EXIT_REASON_EXTERNAL_INTERRUPT 1 +#define EXIT_REASON_TRIPLE_FAULT 2 + +#define EXIT_REASON_PENDING_INTERRUPT 7 +#define EXIT_REASON_NMI_WINDOW 8 +#define EXIT_REASON_TASK_SWITCH 9 +#define EXIT_REASON_CPUID 10 +#define EXIT_REASON_HLT 12 +#define EXIT_REASON_INVD 13 +#define EXIT_REASON_INVLPG 14 +#define EXIT_REASON_RDPMC 15 +#define EXIT_REASON_RDTSC 16 +#define EXIT_REASON_VMCALL 18 +#define EXIT_REASON_VMCLEAR 19 +#define EXIT_REASON_VMLAUNCH 20 +#define EXIT_REASON_VMPTRLD 21 +#define EXIT_REASON_VMPTRST 22 +#define EXIT_REASON_VMREAD 23 +#define EXIT_REASON_VMRESUME 24 +#define EXIT_REASON_VMWRITE 25 +#define EXIT_REASON_VMOFF 26 +#define EXIT_REASON_VMON 27 +#define EXIT_REASON_CR_ACCESS 28 +#define EXIT_REASON_DR_ACCESS 29 +#define EXIT_REASON_IO_INSTRUCTION 30 +#define EXIT_REASON_MSR_READ 31 +#define EXIT_REASON_MSR_WRITE 32 +#define EXIT_REASON_INVALID_STATE 33 +#define EXIT_REASON_MWAIT_INSTRUCTION 36 +#define EXIT_REASON_MONITOR_INSTRUCTION 39 +#define EXIT_REASON_PAUSE_INSTRUCTION 40 +#define EXIT_REASON_MCE_DURING_VMENTRY 41 +#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 +#define EXIT_REASON_APIC_ACCESS 44 +#define EXIT_REASON_EPT_VIOLATION 48 +#define EXIT_REASON_EPT_MISCONFIG 49 +#define EXIT_REASON_INVEPT 50 +#define EXIT_REASON_INVVPID 53 +#define EXIT_REASON_WBINVD 54 +#define EXIT_REASON_XSETBV 55 +#define EXIT_REASON_INVPCID 58 + +#define VMX_EXIT_REASONS \ + { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ + { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ + { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ + { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ + { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ + { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ + { EXIT_REASON_CPUID, "CPUID" }, \ + { EXIT_REASON_HLT, "HLT" }, \ + { EXIT_REASON_INVLPG, "INVLPG" }, \ + { EXIT_REASON_RDPMC, "RDPMC" }, \ + { EXIT_REASON_RDTSC, "RDTSC" }, \ + { EXIT_REASON_VMCALL, "VMCALL" }, \ + { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \ + { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \ + { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \ + { EXIT_REASON_VMPTRST, "VMPTRST" }, \ + { EXIT_REASON_VMREAD, "VMREAD" }, \ + { EXIT_REASON_VMRESUME, "VMRESUME" }, \ + { EXIT_REASON_VMWRITE, "VMWRITE" }, \ + { EXIT_REASON_VMOFF, "VMOFF" }, \ + { EXIT_REASON_VMON, "VMON" }, \ + { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \ + { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \ + { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ + { EXIT_REASON_MSR_READ, "MSR_READ" }, \ + { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ + { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ + { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ + { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \ + { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ + { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ + { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ + { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ + { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ + { EXIT_REASON_WBINVD, "WBINVD" }, \ + { EXIT_REASON_INVEPT, "INVEPT" }, \ + { EXIT_REASON_INVVPID, "INVVPID" } + +#ifdef __KERNEL__ + +#include + /* * Definitions of Primary Processor-Based VM-Execution Controls. */ @@ -56,21 +144,31 @@ #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 +#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 +#define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 #define PIN_BASED_VIRTUAL_NMIS 0x00000020 +#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002 #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 +#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000 #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 #define VM_EXIT_SAVE_IA32_PAT 0x00040000 #define VM_EXIT_LOAD_IA32_PAT 0x00080000 +#define VM_EXIT_SAVE_IA32_EFER 0x00100000 +#define VM_EXIT_LOAD_IA32_EFER 0x00200000 +#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000 +#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002 #define VM_ENTRY_IA32E_MODE 0x00000200 #define VM_ENTRY_SMM 0x00000400 #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 +#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000 #define VM_ENTRY_LOAD_IA32_PAT 0x00004000 +#define VM_ENTRY_LOAD_IA32_EFER 0x00008000 /* VMCS Encodings */ enum vmcs_field { @@ -118,6 +216,8 @@ enum vmcs_field { GUEST_IA32_DEBUGCTL_HIGH = 0x00002803, GUEST_IA32_PAT = 0x00002804, GUEST_IA32_PAT_HIGH = 0x00002805, + GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808, + GUEST_IA32_PERF_GLOBAL_CTRL_HIGH= 0x00002809, GUEST_PDPTR0 = 0x0000280a, GUEST_PDPTR0_HIGH = 0x0000280b, GUEST_PDPTR1 = 0x0000280c, @@ -128,6 +228,8 @@ enum vmcs_field { GUEST_PDPTR3_HIGH = 0x00002811, HOST_IA32_PAT = 0x00002c00, HOST_IA32_PAT_HIGH = 0x00002c01, + HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04, + HOST_IA32_PERF_GLOBAL_CTRL_HIGH = 0x00002c05, PIN_BASED_VM_EXEC_CONTROL = 0x00004000, CPU_BASED_VM_EXEC_CONTROL = 0x00004002, EXCEPTION_BITMAP = 0x00004004, @@ -144,6 +246,8 @@ enum vmcs_field { VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, TPR_THRESHOLD = 0x0000401c, SECONDARY_VM_EXEC_CONTROL = 0x0000401e, + PLE_GAP = 0x00004020, + PLE_WINDOW = 0x00004022, VM_INSTRUCTION_ERROR = 0x00004400, VM_EXIT_REASON = 0x00004402, VM_EXIT_INTR_INFO = 0x00004404, @@ -218,43 +322,6 @@ enum vmcs_field { HOST_RIP = 0x00006c16, }; -#define VMX_EXIT_REASONS_FAILED_VMENTRY 0x80000000 - -#define EXIT_REASON_EXCEPTION_NMI 0 -#define EXIT_REASON_EXTERNAL_INTERRUPT 1 -#define EXIT_REASON_TRIPLE_FAULT 2 - -#define EXIT_REASON_PENDING_INTERRUPT 7 -#define EXIT_REASON_NMI_WINDOW 8 -#define EXIT_REASON_TASK_SWITCH 9 -#define EXIT_REASON_CPUID 10 -#define EXIT_REASON_HLT 12 -#define EXIT_REASON_INVLPG 14 -#define EXIT_REASON_RDPMC 15 -#define EXIT_REASON_RDTSC 16 -#define EXIT_REASON_VMCALL 18 -#define EXIT_REASON_VMCLEAR 19 -#define EXIT_REASON_VMLAUNCH 20 -#define EXIT_REASON_VMPTRLD 21 -#define EXIT_REASON_VMPTRST 22 -#define EXIT_REASON_VMREAD 23 -#define EXIT_REASON_VMRESUME 24 -#define EXIT_REASON_VMWRITE 25 -#define EXIT_REASON_VMOFF 26 -#define EXIT_REASON_VMON 27 -#define EXIT_REASON_CR_ACCESS 28 -#define EXIT_REASON_DR_ACCESS 29 -#define EXIT_REASON_IO_INSTRUCTION 30 -#define EXIT_REASON_MSR_READ 31 -#define EXIT_REASON_MSR_WRITE 32 -#define EXIT_REASON_MWAIT_INSTRUCTION 36 -#define EXIT_REASON_MCE_DURING_VMENTRY 41 -#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 -#define EXIT_REASON_APIC_ACCESS 44 -#define EXIT_REASON_EPT_VIOLATION 48 -#define EXIT_REASON_EPT_MISCONFIG 49 -#define EXIT_REASON_WBINVD 54 - /* * Interruption-information format */ @@ -282,6 +349,12 @@ enum vmcs_field { #define GUEST_INTR_STATE_SMI 0x00000004 #define GUEST_INTR_STATE_NMI 0x00000008 +/* GUEST_ACTIVITY_STATE flags */ +#define GUEST_ACTIVITY_ACTIVE 0 +#define GUEST_ACTIVITY_HLT 1 +#define GUEST_ACTIVITY_SHUTDOWN 2 +#define GUEST_ACTIVITY_WAIT_SIPI 3 + /* * Exit Qualifications for MOV for Control Register Access */ @@ -358,6 +431,8 @@ enum vmcs_field { #define VMX_EPTP_UC_BIT (1ull << 8) #define VMX_EPTP_WB_BIT (1ull << 14) #define VMX_EPT_2MB_PAGE_BIT (1ull << 16) +#define VMX_EPT_1GB_PAGE_BIT (1ull << 17) +#define VMX_EPT_AD_BIT (1ull << 21) #define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) #define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) @@ -366,11 +441,14 @@ enum vmcs_field { #define VMX_EPT_MAX_GAW 0x4 #define VMX_EPT_MT_EPTE_SHIFT 3 #define VMX_EPT_GAW_EPTP_SHIFT 3 +#define VMX_EPT_AD_ENABLE_BIT (1ull << 6) #define VMX_EPT_DEFAULT_MT 0x6ull #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull #define VMX_EPT_IGMT_BIT (1ull << 6) +#define VMX_EPT_ACCESS_BIT (1ull << 8) +#define VMX_EPT_DIRTY_BIT (1ull << 9) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul @@ -387,6 +465,12 @@ enum vmcs_field { #define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08" #define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08" +struct vmx_msr_entry { + u32 index; + u32 reserved; + u64 value; +} __aligned(16); +#endif #endif diff -uprN linux-2.6.32/arch/x86/include/asm/x86_init.h linux-2.6.32.ovz/arch/x86/include/asm/x86_init.h --- linux-2.6.32/arch/x86/include/asm/x86_init.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/x86_init.h 2015-03-10 13:23:09.000000000 -0700 @@ -106,9 +106,11 @@ struct x86_init_ops { /** * struct x86_cpuinit_ops - platform specific cpu hotplug setups * @setup_percpu_clockev: set up the per cpu clock event device + * @early_percpu_clock_init: early init of the per cpu clock event device */ struct x86_cpuinit_ops { void (*setup_percpu_clockev)(void); + void (*early_percpu_clock_init)(void); }; /** @@ -116,11 +118,20 @@ struct x86_cpuinit_ops { * @calibrate_tsc: calibrate TSC * @get_wallclock: get time from HW clock like RTC etc. * @set_wallclock: set time back to HW clock + * @is_untracked_pat_range exclude from PAT logic + * @nmi_init enable NMI on cpus + * @save_sched_clock_state: save state for sched_clock() on suspend + * @restore_sched_clock_state: restore state for sched_clock() on resume + * */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); unsigned long (*get_wallclock)(void); int (*set_wallclock)(unsigned long nowtime); + bool (*is_untracked_pat_range)(u64 start, u64 end); + void (*nmi_init)(void); + void (*save_sched_clock_state)(void); + void (*restore_sched_clock_state)(void); }; extern struct x86_init_ops x86_init; diff -uprN linux-2.6.32/arch/x86/include/asm/xen/hypercall.h linux-2.6.32.ovz/arch/x86/include/asm/xen/hypercall.h --- linux-2.6.32/arch/x86/include/asm/xen/hypercall.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/xen/hypercall.h 2015-03-10 13:21:26.000000000 -0700 @@ -417,6 +417,12 @@ HYPERVISOR_nmi_op(unsigned long op, unsi return _hypercall2(int, nmi_op, op, arg); } +static inline unsigned long __must_check +HYPERVISOR_hvm_op(int op, void *arg) +{ + return _hypercall2(unsigned long, hvm_op, op, arg); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { diff -uprN linux-2.6.32/arch/x86/include/asm/xen/hypervisor.h linux-2.6.32.ovz/arch/x86/include/asm/xen/hypervisor.h --- linux-2.6.32/arch/x86/include/asm/xen/hypervisor.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/xen/hypervisor.h 2015-03-10 13:21:26.000000000 -0700 @@ -33,16 +33,12 @@ #ifndef _ASM_X86_XEN_HYPERVISOR_H #define _ASM_X86_XEN_HYPERVISOR_H +#include + /* arch/i386/kernel/setup.c */ extern struct shared_info *HYPERVISOR_shared_info; extern struct start_info *xen_start_info; -enum xen_domain_type { - XEN_NATIVE, /* running on bare hardware */ - XEN_PV_DOMAIN, /* running in a PV domain */ - XEN_HVM_DOMAIN, /* running in a Xen hvm domain */ -}; - #ifdef CONFIG_XEN extern enum xen_domain_type xen_domain_type; #else diff -uprN linux-2.6.32/arch/x86/include/asm/xen/page.h linux-2.6.32.ovz/arch/x86/include/asm/xen/page.h --- linux-2.6.32/arch/x86/include/asm/xen/page.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/xen/page.h 2015-03-10 13:22:31.000000000 -0700 @@ -38,13 +38,20 @@ typedef struct xpaddr { extern unsigned long get_phys_to_machine(unsigned long pfn); extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn); - +extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); static inline unsigned long pfn_to_mfn(unsigned long pfn) { + unsigned long mfn; + if (xen_feature(XENFEAT_auto_translated_physmap)) return pfn; - return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT; + mfn = get_phys_to_machine(pfn); + + if (mfn != INVALID_P2M_ENTRY) + mfn &= ~FOREIGN_FRAME_BIT; + + return mfn; } static inline int phys_to_machine_mapping_valid(unsigned long pfn) diff -uprN linux-2.6.32/arch/x86/include/asm/xor_32.h linux-2.6.32.ovz/arch/x86/include/asm/xor_32.h --- linux-2.6.32/arch/x86/include/asm/xor_32.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/xor_32.h 2015-03-10 13:24:00.000000000 -0700 @@ -861,6 +861,9 @@ static struct xor_block_template xor_blo .do_5 = xor_sse_5, }; +/* Also try the AVX routines */ +#include "xor_avx.h" + /* Also try the generic routines. */ #include @@ -871,6 +874,7 @@ do { \ xor_speed(&xor_block_8regs_p); \ xor_speed(&xor_block_32regs); \ xor_speed(&xor_block_32regs_p); \ + AVX_XOR_SPEED; \ if (cpu_has_xmm) \ xor_speed(&xor_block_pIII_sse); \ if (cpu_has_mmx) { \ @@ -883,6 +887,6 @@ do { \ We may also be able to load into the L1 only depending on how the cpu deals with a load to a line that is being prefetched. */ #define XOR_SELECT_TEMPLATE(FASTEST) \ - (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) + AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST) #endif /* _ASM_X86_XOR_32_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/xor_64.h linux-2.6.32.ovz/arch/x86/include/asm/xor_64.h --- linux-2.6.32/arch/x86/include/asm/xor_64.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/xor_64.h 2015-03-10 13:24:00.000000000 -0700 @@ -347,15 +347,21 @@ static struct xor_block_template xor_blo .do_5 = xor_sse_5, }; + +/* Also try the AVX routines */ +#include "xor_avx.h" + #undef XOR_TRY_TEMPLATES #define XOR_TRY_TEMPLATES \ do { \ + AVX_XOR_SPEED; \ xor_speed(&xor_block_sse); \ } while (0) /* We force the use of the SSE xor block because it can write around L2. We may also be able to load into the L1 only depending on how the cpu deals with a load to a line that is being prefetched. */ -#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse) +#define XOR_SELECT_TEMPLATE(FASTEST) \ + AVX_SELECT(&xor_block_sse) #endif /* _ASM_X86_XOR_64_H */ diff -uprN linux-2.6.32/arch/x86/include/asm/xor_avx.h linux-2.6.32.ovz/arch/x86/include/asm/xor_avx.h --- linux-2.6.32/arch/x86/include/asm/xor_avx.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/xor_avx.h 2015-03-10 13:24:05.000000000 -0700 @@ -0,0 +1,214 @@ +#ifndef _ASM_X86_XOR_AVX_H +#define _ASM_X86_XOR_AVX_H + +/* + * Optimized RAID-5 checksumming functions for AVX + * + * Copyright (C) 2012 Intel Corporation + * Author: Jim Kukunas + * + * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#ifdef CONFIG_AS_AVX + +#include +#include + +#define ALIGN32 __aligned(32) + +#define YMM_SAVED_REGS 4 + +#define YMMS_SAVE \ +do { \ + preempt_disable(); \ + cr0 = read_cr0(); \ + clts(); \ + asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \ + asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \ + asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \ + asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \ +} while (0); + +#define YMMS_RESTORE \ +do { \ + asm volatile("sfence" : : : "memory"); \ + asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \ + asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \ + asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \ + asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \ + write_cr0(cr0); \ + preempt_enable(); \ +} while (0); + +#define BLOCK4(i) \ + BLOCK(32 * i, 0) \ + BLOCK(32 * (i + 1), 1) \ + BLOCK(32 * (i + 2), 2) \ + BLOCK(32 * (i + 3), 3) + +#define BLOCK16() \ + BLOCK4(0) \ + BLOCK4(4) \ + BLOCK4(8) \ + BLOCK4(12) + +static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1) +{ + unsigned long cr0, lines = bytes >> 9; + char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + + YMMS_SAVE + + while (lines--) { +#undef BLOCK +#define BLOCK(i, reg) \ +do { \ + asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p0[i / sizeof(*p0)])); \ + asm volatile("vmovdqa %%ymm" #reg ", %0" : \ + "=m" (p0[i / sizeof(*p0)])); \ +} while (0); + + BLOCK16() + + p0 = (unsigned long *)((uintptr_t)p0 + 512); + p1 = (unsigned long *)((uintptr_t)p1 + 512); + } + + YMMS_RESTORE +} + +static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1, + unsigned long *p2) +{ + unsigned long cr0, lines = bytes >> 9; + char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + + YMMS_SAVE + + while (lines--) { +#undef BLOCK +#define BLOCK(i, reg) \ +do { \ + asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p1[i / sizeof(*p1)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p0[i / sizeof(*p0)])); \ + asm volatile("vmovdqa %%ymm" #reg ", %0" : \ + "=m" (p0[i / sizeof(*p0)])); \ +} while (0); + + BLOCK16() + + p0 = (unsigned long *)((uintptr_t)p0 + 512); + p1 = (unsigned long *)((uintptr_t)p1 + 512); + p2 = (unsigned long *)((uintptr_t)p2 + 512); + } + + YMMS_RESTORE +} + +static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1, + unsigned long *p2, unsigned long *p3) +{ + unsigned long cr0, lines = bytes >> 9; + char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + + YMMS_SAVE + + while (lines--) { +#undef BLOCK +#define BLOCK(i, reg) \ +do { \ + asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p2[i / sizeof(*p2)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p1[i / sizeof(*p1)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p0[i / sizeof(*p0)])); \ + asm volatile("vmovdqa %%ymm" #reg ", %0" : \ + "=m" (p0[i / sizeof(*p0)])); \ +} while (0); + + BLOCK16(); + + p0 = (unsigned long *)((uintptr_t)p0 + 512); + p1 = (unsigned long *)((uintptr_t)p1 + 512); + p2 = (unsigned long *)((uintptr_t)p2 + 512); + p3 = (unsigned long *)((uintptr_t)p3 + 512); + } + + YMMS_RESTORE +} + +static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1, + unsigned long *p2, unsigned long *p3, unsigned long *p4) +{ + unsigned long cr0, lines = bytes >> 9; + char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; + + YMMS_SAVE + + while (lines--) { +#undef BLOCK +#define BLOCK(i, reg) \ +do { \ + asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p3[i / sizeof(*p3)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p2[i / sizeof(*p2)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p1[i / sizeof(*p1)])); \ + asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \ + "m" (p0[i / sizeof(*p0)])); \ + asm volatile("vmovdqa %%ymm" #reg ", %0" : \ + "=m" (p0[i / sizeof(*p0)])); \ +} while (0); + + BLOCK16() + + p0 = (unsigned long *)((uintptr_t)p0 + 512); + p1 = (unsigned long *)((uintptr_t)p1 + 512); + p2 = (unsigned long *)((uintptr_t)p2 + 512); + p3 = (unsigned long *)((uintptr_t)p3 + 512); + p4 = (unsigned long *)((uintptr_t)p4 + 512); + } + + YMMS_RESTORE +} + +static struct xor_block_template xor_block_avx = { + .name = "avx", + .do_2 = xor_avx_2, + .do_3 = xor_avx_3, + .do_4 = xor_avx_4, + .do_5 = xor_avx_5, +}; + +#define AVX_XOR_SPEED \ +do { \ + if (cpu_has_avx && cpu_has_osxsave) \ + xor_speed(&xor_block_avx); \ +} while (0) + +#define AVX_SELECT(FASTEST) \ + (cpu_has_avx && cpu_has_osxsave ? &xor_block_avx : FASTEST) + +#else + +#define AVX_XOR_SPEED {} + +#define AVX_SELECT(FASTEST) (FASTEST) + +#endif +#endif diff -uprN linux-2.6.32/arch/x86/include/asm/xsave.h linux-2.6.32.ovz/arch/x86/include/asm/xsave.h --- linux-2.6.32/arch/x86/include/asm/xsave.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/include/asm/xsave.h 2015-03-10 13:24:13.000000000 -0700 @@ -8,15 +8,28 @@ #define XSTATE_FP 0x1 #define XSTATE_SSE 0x2 #define XSTATE_YMM 0x4 +#define XSTATE_OPMASK 0x20 +#define XSTATE_ZMM_Hi256 0x40 +#define XSTATE_Hi16_ZMM 0x80 #define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE) #define FXSAVE_SIZE 512 +#define XSTATE_YMM_SIZE 256 +#define XSTATE_YMM_OFFSET (512 + 64) + +#define XSAVE_HDR_SIZE 64 +#define XSAVE_HDR_OFFSET FXSAVE_SIZE + +#define XSAVE_YMM_SIZE 256 +#define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) + /* * These are the features that the OS can handle currently. */ -#define XCNTXT_MASK (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) +#define XCNTXT_MASK (XSTATE_FP | XSTATE_SSE | XSTATE_YMM \ + | XSTATE_OPMASK | XSTATE_ZMM_Hi256 | XSTATE_Hi16_ZMM) #ifdef CONFIG_X86_64 #define REX_PREFIX "0x48, " @@ -27,16 +40,19 @@ extern unsigned int xstate_size; extern u64 pcntxt_mask; extern struct xsave_struct *init_xstate_buf; +extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void xsave_cntxt_init(void); extern void xsave_init(void); +extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); extern int init_fpu(struct task_struct *child); extern int check_for_xstate(struct i387_fxsave_struct __user *buf, void __user *fpstate, struct _fpx_sw_bytes *sw); -static inline int xrstor_checking(struct xsave_struct *fx) +static inline int fpu_xrstor_checking(struct fpu *fpu) { + struct xsave_struct *fx = &fpu->state->xsave; int err; asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t" @@ -108,12 +124,25 @@ static inline void xrstor_state(struct x : "memory"); } -static inline void xsave(struct task_struct *tsk) +static inline void xsave_state(struct xsave_struct *fx, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + + asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t" + : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); +} + +static inline void fpu_xsave(struct fpu *fpu) { /* This, however, we can work around by forcing the compiler to select an addressing mode that doesn't require extended registers. */ - __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27" - : : "D" (&(tsk->thread.xstate->xsave)), - "a" (-1), "d"(-1) : "memory"); + alternative_input( + ".byte " REX_PREFIX "0x0f,0xae,0x27", + ".byte " REX_PREFIX "0x0f,0xae,0x37", + X86_FEATURE_XSAVEOPT, + [fx] "D" (&(fpu->state->xsave)), "a" (-1), "d" (-1) : + "memory"); } #endif diff -uprN linux-2.6.32/arch/x86/Kconfig linux-2.6.32.ovz/arch/x86/Kconfig --- linux-2.6.32/arch/x86/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/Kconfig 2015-06-23 04:16:16.000000000 -0700 @@ -25,12 +25,14 @@ config X86 select HAVE_IDE select HAVE_OPROFILE select HAVE_PERF_EVENTS if (!M386 && !M486) + select HAVE_IRQ_WORK select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS select HAVE_KRETPROBES + select HAVE_OPTPROBES select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FUNCTION_TRACER @@ -49,7 +51,12 @@ config X86 select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA + select HAVE_PERF_EVENTS_NMI + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP select HAVE_ARCH_KMEMCHECK + select HAVE_USER_RETURN_NOTIFIER + select ARCH_HAVE_NMI_SAFE_CMPXCHG config OUTPUT_FORMAT string @@ -61,9 +68,6 @@ config ARCH_DEFCONFIG default "arch/x86/configs/i386_defconfig" if X86_32 default "arch/x86/configs/x86_64_defconfig" if X86_64 -config GENERIC_TIME - def_bool y - config GENERIC_CMOS_UPDATE def_bool y @@ -95,6 +99,9 @@ config ZONE_DMA config SBUS bool +config NEED_DMA_MAP_STATE + def_bool (X86_64 || DMAR || DMA_API_DEBUG) + config GENERIC_ISA_DMA def_bool y @@ -622,7 +629,7 @@ config GART_IOMMU bool "GART IOMMU support" if EMBEDDED default y select SWIOTLB - depends on X86_64 && PCI + depends on X86_64 && PCI && AMD_NB ---help--- Support for full DMA access of devices with 32bit memory access only on systems with more than 3GB. This is usually needed for USB, @@ -1110,6 +1117,9 @@ config X86_PAE config ARCH_PHYS_ADDR_T_64BIT def_bool X86_64 || X86_PAE +config ARCH_DMA_ADDR_T_64BIT + def_bool X86_64 || HIGHMEM64G + config DIRECT_GBPAGES bool "Enable 1GB pages for kernel pagetables" if EMBEDDED default y @@ -1119,6 +1129,17 @@ config DIRECT_GBPAGES support it. This can improve the kernel's performance a tiny bit by reducing TLB pressure. If in doubt, say "Y". +config TRACK_DIRTY_PAGES + bool "Enable dirty page tracking" + default n + depends on !KMEMCHECK + ---help--- + Turning this on allows third party modules to use a + kernel interface that can track dirty page generation + in the system. This can be used to copy/mirror live + memory to another system or node. Most users will + say n here. + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" @@ -1144,16 +1165,16 @@ config NUMA comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI" depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI) -config K8_NUMA +config AMD_NUMA def_bool y prompt "Old style AMD Opteron NUMA detection" depends on X86_64 && NUMA && PCI ---help--- - Enable K8 NUMA node topology detection. You should say Y here if - you have a multi processor AMD K8 system. This uses an old - method to read the NUMA configuration directly from the builtin - Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA - instead, which also takes priority if both are compiled in. + Enable AMD NUMA node topology detection. You should say Y here if + you have a multi processor AMD system. This uses an old method to + read the NUMA configuration directly from the builtin Northbridge + of Opteron. It is recommended to use X86_64_ACPI_NUMA instead, + which also takes priority if both are compiled in. config X86_64_ACPI_NUMA def_bool y @@ -1242,6 +1263,11 @@ config ARCH_MEMORY_PROBE def_bool X86_64 depends on MEMORY_HOTPLUG +config ILLEGAL_POINTER_VALUE + hex + default 0 if X86_32 + default 0xdead000000000000 if X86_64 + source "mm/Kconfig" config HIGHPTE @@ -1413,6 +1439,15 @@ config ARCH_USES_PG_UNCACHED def_bool y depends on X86_PAT +config ARCH_RANDOM + def_bool y + prompt "x86 architectural random number generator" if EXPERT + ---help--- + Enable the x86 architectural RDRAND instruction + (Intel Bull Mountain technology) to generate random numbers. + If supported, this is a high bandwidth, cryptographically + secure hardware random number generator. + config EFI bool "EFI runtime service support" depends on ACPI @@ -1477,6 +1512,34 @@ config KEXEC support. As of this writing the exact hardware interface is strongly in flux, so no good recommendation can be made. +config KEXEC_AUTO_RESERVE + bool "automatically reserve memory for kexec kernel" + depends on KEXEC + default y + ---help--- + Automatically reserve memory for a kexec kernel, so that you don't + need to specify numbers for the "crashkernel=X@Y" boot option, + instead you can use "crashkernel=auto". To make this work, you need + to have more than 4G memory. + + On x86_32, 128M is reserved, on x86_64 1/32 of your memory is + reserved, but it will not exceed 4G. + +config KEXEC_REUSE_CRASH + bool "try to reuse crashkernel when booting via kexec" + depends on KEXEC && PRAM + default y + ---help--- + Reuse crashkernel left from the previous kernel if booting via + kexec, so that early init bugs can be debugged. + + The option is actually enabled by writing 1 to + /proc/sys/kernel/kexec_reuse_crash before calling kexec. + + Note that crashkernel areas (specified by the crashkernel boot + option) of the new and the previous kernels must coincide for + the crashkernel to be reused. + config CRASH_DUMP bool "kernel crash dumps" depends on X86_64 || (X86_32 && HIGHMEM) @@ -1998,7 +2061,7 @@ config SCx200 config SCx200HR_TIMER tristate "NatSemi SCx200 27MHz High-Resolution Timer Support" - depends on SCx200 && GENERIC_TIME + depends on SCx200 default y ---help--- This driver provides a clocksource built upon the on-chip @@ -2026,9 +2089,9 @@ config OLPC endif # X86_32 -config K8_NB +config AMD_NB def_bool y - depends on AGP_AMD64 || (X86_64 && (GART_IOMMU || (PCI && NUMA))) + depends on CPU_SUP_AMD && PCI source "drivers/pcmcia/Kconfig" @@ -2068,6 +2131,11 @@ config SYSVIPC_COMPAT def_bool y depends on COMPAT && SYSVIPC +config KEYS_COMPAT + bool + depends on COMPAT && KEYS + default y + endmenu @@ -2075,6 +2143,8 @@ config HAVE_ATOMIC_IOMAP def_bool y depends on X86_32 +source "kernel/Kconfig.openvz" + source "net/Kconfig" source "drivers/Kconfig" @@ -2092,3 +2162,5 @@ source "crypto/Kconfig" source "arch/x86/kvm/Kconfig" source "lib/Kconfig" + +source "kernel/bc/Kconfig" diff -uprN linux-2.6.32/arch/x86/Kconfig.cpu linux-2.6.32.ovz/arch/x86/Kconfig.cpu --- linux-2.6.32/arch/x86/Kconfig.cpu 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/Kconfig.cpu 2015-03-10 13:22:35.000000000 -0700 @@ -323,7 +323,7 @@ config X86_L1_CACHE_SHIFT config X86_XADD def_bool y - depends on X86_32 && !M386 + depends on X86_64 || !M386 config X86_PPRO_FENCE bool "PentiumPro memory ordering errata workaround" @@ -342,6 +342,10 @@ config X86_F00F_BUG def_bool y depends on M586MMX || M586TSC || M586 || M486 || M386 +config X86_INVD_BUG + def_bool y + depends on M486 || M386 + config X86_WP_WORKS_OK def_bool y depends on !M386 @@ -400,7 +404,7 @@ config X86_TSC config X86_CMPXCHG64 def_bool y - depends on !M386 && !M486 + depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM # this should be set for all -march=.. options where the compiler # generates cmov. @@ -506,23 +510,3 @@ config CPU_SUP_UMC_32 CPU might render the kernel unbootable. If unsure, say N. - -config X86_DS - def_bool X86_PTRACE_BTS - depends on X86_DEBUGCTLMSR - select HAVE_HW_BRANCH_TRACER - -config X86_PTRACE_BTS - bool "Branch Trace Store" - default y - depends on X86_DEBUGCTLMSR - depends on BROKEN - ---help--- - This adds a ptrace interface to the hardware's branch trace store. - - Debuggers may use it to collect an execution trace of the debugged - application in order to answer the question 'how did I get here?'. - Debuggers may trace user mode as well as kernel mode. - - Say Y unless there is no application development on this machine - and you want to save a small amount of code size. diff -uprN linux-2.6.32/arch/x86/Kconfig.debug linux-2.6.32.ovz/arch/x86/Kconfig.debug --- linux-2.6.32/arch/x86/Kconfig.debug 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/Kconfig.debug 2015-06-23 04:16:16.000000000 -0700 @@ -56,6 +56,15 @@ config EARLY_PRINTK_DBGP with klogd/syslogd or the X server. You should normally N here, unless you want to debug such a crash. You need usb debug device. +config EARLY_PRINTK_EFI + bool "Early printk via the EFI framebuffer" + depends on EFI && EARLY_PRINTK + ---help--- + Write kernel log output directly into the EFI framebuffer. + + This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. + config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL @@ -133,6 +142,10 @@ config 4KSTACKS on the VM subsystem for higher order allocations. This option will also use IRQ stacks to compensate for the reduced stackspace. +config 16KSTACKS + bool "Use 16Kb for kernel stacks instead of 8Kb" + depends on X86_64 + config DOUBLEFAULT default y bool "Enable doublefault exception handler" if EMBEDDED @@ -174,18 +187,18 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. -config X86_DS_SELFTEST - bool "DS selftest" - default y - depends on DEBUG_KERNEL - depends on X86_DS - ---help--- - Perform Debug Store selftests at boot time. - If in doubt, say "N". - config HAVE_MMIOTRACE_SUPPORT def_bool y +config X86_DECODER_SELFTEST + bool "x86 instruction decoder selftest" + depends on DEBUG_KERNEL + ---help--- + Perform x86 instruction decoder selftests at build time. + This option is useful for checking the sanity of x86 instruction + decoder code. + If unsure, say "N". + # # IO delay types: # diff -uprN linux-2.6.32/arch/x86/kernel/acpi/boot.c linux-2.6.32.ovz/arch/x86/kernel/acpi/boot.c --- linux-2.6.32/arch/x86/kernel/acpi/boot.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/acpi/boot.c 2015-03-10 13:24:07.000000000 -0700 @@ -49,6 +49,7 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 # include +# include #endif /* X86 */ #define BAD_MADT_ENTRY(entry, end) ( \ @@ -65,6 +66,7 @@ int acpi_ht __initdata = 1; /* enable HT int acpi_lapic; int acpi_ioapic; int acpi_strict; +int acpi_disable_cmcff; u8 acpi_sci_flags __initdata; int acpi_sci_override_gsi __initdata; @@ -149,6 +151,11 @@ static void __cpuinit acpi_register_lapi { unsigned int ver = 0; + if (id >= (MAX_LOCAL_APIC-1)) { + printk(KERN_INFO PREFIX "skipped apicid that is too big\n"); + return; + } + if (!enabled) { ++disabled_cpus; return; @@ -446,8 +453,15 @@ void __init acpi_pic_sci_set_trigger(uns int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) { *irq = gsi; + +#ifdef CONFIG_X86_IO_APIC + if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) + setup_IO_APIC_irq_extra(gsi); +#endif + return 0; } +EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); /* * success: return IRQ number (>=0) @@ -473,7 +487,8 @@ int acpi_register_gsi(struct device *dev plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); } #endif - acpi_gsi_to_irq(plat_gsi, &irq); + irq = plat_gsi; + return irq; } @@ -482,6 +497,25 @@ int acpi_register_gsi(struct device *dev */ #ifdef CONFIG_ACPI_HOTPLUG_CPU +static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid) +{ +#ifdef CONFIG_ACPI_NUMA + int nid; + + nid = acpi_get_node(handle); + if (nid == -1 || !node_online(nid)) + return; +#ifdef CONFIG_X86_64 + apicid_to_node[physid] = nid; + numa_set_node(cpu, nid); +#else /* CONFIG_X86_32 */ + apicid_2_node[physid] = nid; + cpu_to_node_map[cpu] = nid; +#endif + +#endif +} + static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; @@ -518,6 +552,7 @@ static int __cpuinit _acpi_map_lsapic(ac kfree(buffer.pointer); buffer.length = ACPI_ALLOCATE_BUFFER; buffer.pointer = NULL; + lapic = NULL; if (!alloc_cpumask_var(&tmp_map, GFP_KERNEL)) goto out; @@ -526,7 +561,7 @@ static int __cpuinit _acpi_map_lsapic(ac goto free_tmp_map; cpumask_copy(tmp_map, cpu_present_mask); - acpi_register_lapic(physid, lapic->lapic_flags & ACPI_MADT_ENABLED); + acpi_register_lapic(physid, ACPI_MADT_ENABLED); /* * If mp_register_lapic successfully generates a new logical cpu @@ -540,6 +575,7 @@ static int __cpuinit _acpi_map_lsapic(ac } cpu = cpumask_first(new_map); + acpi_map_cpu2node(handle, cpu, physid); *pcpu = cpu; retval = 0; @@ -624,6 +660,7 @@ static int __init acpi_parse_hpet(struct } hpet_address = hpet_tbl->address.address; + hpet_blockid = hpet_tbl->sequence; /* * Some broken BIOSes advertise HPET at 0x0. We really do not @@ -1184,9 +1221,6 @@ static void __init acpi_process_madt(voi if (!error) { acpi_lapic = 1; -#ifdef CONFIG_X86_BIGSMP - generic_bigsmp_probe(); -#endif /* * Parse MADT IO-APIC entries */ @@ -1196,8 +1230,6 @@ static void __init acpi_process_madt(voi acpi_ioapic = 1; smp_found_config = 1; - if (apic->setup_apic_routing) - apic->setup_apic_routing(); } } if (error == -EINVAL) { @@ -1348,14 +1380,6 @@ static struct dmi_system_id __initdata a }, { .callback = force_acpi_ht, - .ident = "ASUS P2B-DS", - .matches = { - DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), - DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"), - }, - }, - { - .callback = force_acpi_ht, .ident = "ASUS CUR-DLS", .matches = { DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), @@ -1461,6 +1485,15 @@ static struct dmi_system_id __initdata a DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), }, }, + { + .callback = disable_acpi_pci, + .ident = "HP xw9300 Workstation", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP xw9300 Workstation"), + }, + }, + {} }; @@ -1653,6 +1686,10 @@ static int __init parse_acpi(char *arg) /* "acpi=noirq" disables ACPI interrupt routing */ else if (strcmp(arg, "noirq") == 0) { acpi_noirq_set(); + } + /* "acpi=nocmcff" disables FF mode for corrected errors */ + else if (strcmp(arg, "nocmcff") == 0) { + acpi_disable_cmcff = 1; } else { /* Core will printk when we return error. */ return -EINVAL; diff -uprN linux-2.6.32/arch/x86/kernel/acpi/cstate.c linux-2.6.32.ovz/arch/x86/kernel/acpi/cstate.c --- linux-2.6.32/arch/x86/kernel/acpi/cstate.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/acpi/cstate.c 2015-03-10 13:22:11.000000000 -0700 @@ -13,6 +13,7 @@ #include #include +#include /* * Initialize bm_flags based on the CPU cache properties @@ -48,7 +49,7 @@ void acpi_processor_power_init_bm_check( * P4, Core and beyond CPUs */ if (c->x86_vendor == X86_VENDOR_INTEL && - (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14))) + (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f))) flags->bm_control = 0; } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); @@ -65,16 +66,6 @@ static struct cstate_entry *cpu_cstate_e static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; -#define MWAIT_SUBSTATE_MASK (0xf) -#define MWAIT_CSTATE_MASK (0xf) -#define MWAIT_SUBSTATE_SIZE (4) - -#define CPUID_MWAIT_LEAF (5) -#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1) -#define CPUID5_ECX_INTERRUPT_BREAK (0x2) - -#define MWAIT_ECX_INTERRUPT_BREAK (0x1) - #define NATIVE_CSTATE_BEYOND_HALT (2) static long acpi_processor_ffh_cstate_probe_cpu(void *_cx) @@ -145,6 +136,15 @@ int acpi_processor_ffh_cstate_probe(unsi percpu_entry->states[cx->index].eax = cx->address; percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK; } + + /* + * For _CST FFH on Intel, if GAS.access_size bit 1 is cleared, + * then we should skip checking BM_STS for this C-state. + * ref: "Intel Processor Vendor-Specific ACPI Interface Specification" + */ + if ((c->x86_vendor == X86_VENDOR_INTEL) && !(reg->access_size & 0x2)) + cx->bm_sts_skip = 1; + return retval; } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); diff -uprN linux-2.6.32/arch/x86/kernel/alternative.c linux-2.6.32.ovz/arch/x86/kernel/alternative.c --- linux-2.6.32/arch/x86/kernel/alternative.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/alternative.c 2015-03-10 13:22:45.000000000 -0700 @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,19 @@ static int __init setup_noreplace_smp(ch } __setup("noreplace-smp", setup_noreplace_smp); +static enum {SL_DEFAULT, SL_TICKET, SL_UNFAIR} spinlock_type = SL_DEFAULT; + +static int __init setup_spinlock_type(char *str) +{ + if (!strcmp("ticket", str)) + spinlock_type = SL_TICKET; + else if (!strcmp("unfair", str)) + spinlock_type = SL_UNFAIR; + + return 1; +} +__setup("spinlock-type=", setup_spinlock_type); + #ifdef CONFIG_PARAVIRT static int __initdata_or_module noreplace_paravirt = 0; @@ -202,14 +216,39 @@ static void *text_poke_early(void *addr, Tough. Make sure you disable such features by hand. */ void __init_or_module apply_alternatives(struct alt_instr *start, - struct alt_instr *end) + struct alt_instr *end, + int fixup) { struct alt_instr *a; - char insnbuf[MAX_PATCH_LEN]; + u8 insnbuf[MAX_PATCH_LEN]; + int count = 0; - DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); + DPRINTK("%s: alt table %p -> %p fixup = %d\n", __func__, start, end, + fixup); + /* + * The scan order should be from start to end. A later scanned + * alternative code can overwrite a previous scanned alternative code. + * Some kernel functions (e.g. memcpy, memset, etc) use this order to + * patch code. + * + * So be careful if you want to change the scan order to any other + * order. + */ for (a = start; a < end; a++) { u8 *instr = a->instr; + + DPRINTK("#%d: cpuid = %d instrlen = %d replacementlen = %d\n", + count, a->cpuid, a->instrlen, a->replacementlen); + if (fixup) { + DPRINTK("#%d: cpuid = %d instrlen = %d replacementlen = %d\n", + count, 0x00ff & a->cpuid, + (0xff00 & a->cpuid) >> 8, a->instrlen); + a->replacementlen = a->instrlen; + a->instrlen = (0xff00 & a->cpuid) >> 8; + a->cpuid = 0x00ff & a->cpuid; + } + count++; + BUG_ON(a->replacementlen > a->instrlen); BUG_ON(a->instrlen > sizeof(insnbuf)); if (!boot_cpu_has(a->cpuid)) @@ -223,6 +262,8 @@ void __init_or_module apply_alternatives } #endif memcpy(insnbuf, a->replacement, a->replacementlen); + if (*insnbuf == 0xe8 && a->replacementlen == 5) + *(s32 *)(insnbuf + 1) += a->replacement - a->instr; add_nops(insnbuf + a->replacementlen, a->instrlen - a->replacementlen); text_poke_early(instr, insnbuf, a->instrlen); @@ -390,6 +431,24 @@ void alternatives_smp_switch(int smp) mutex_unlock(&smp_alt); } +/* Return 1 if the address range is reserved for smp-alternatives */ +int alternatives_text_reserved(void *start, void *end) +{ + struct smp_alt_module *mod; + u8 **ptr; + u8 *text_start = start; + u8 *text_end = end; + + list_for_each_entry(mod, &smp_alt_modules, next) { + if (mod->text > text_end || mod->text_end < text_start) + continue; + for (ptr = mod->locks; ptr < mod->locks_end; ptr++) + if (text_start <= *ptr && text_end >= *ptr) + return 1; + } + + return 0; +} #endif #ifdef CONFIG_PARAVIRT @@ -429,6 +488,14 @@ void __init alternative_instructions(voi Other CPUs are not running. */ stop_nmi(); + if (spinlock_type == SL_DEFAULT) + spinlock_type = boot_cpu_has(X86_FEATURE_HYPERVISOR) ? + SL_UNFAIR : SL_TICKET; + + if (spinlock_type == SL_UNFAIR) { + printk(KERN_INFO "alternatives: switching to unfair spinlock\n"); + setup_force_cpu_cap(X86_FEATURE_UNFAIR_SPINLOCK); + } /* * Don't stop machine check exceptions while patching. * MCEs only happen when something got corrupted and in this @@ -440,7 +507,7 @@ void __init alternative_instructions(voi * patching. */ - apply_alternatives(__alt_instructions, __alt_instructions_end); + apply_alternatives(__alt_instructions, __alt_instructions_end, 0); /* switch to patch-once-at-boottime-only mode and free the * tables in case we know the number of CPUs will never ever @@ -552,3 +619,63 @@ void *__kprobes text_poke(void *addr, co local_irq_restore(flags); return addr; } + +/* + * Cross-modifying kernel text with stop_machine(). + * This code originally comes from immediate value. + */ +static atomic_t stop_machine_first; +static int wrote_text; + +struct text_poke_params { + void *addr; + const void *opcode; + size_t len; +}; + +static int __kprobes stop_machine_text_poke(void *data) +{ + struct text_poke_params *tpp = data; + + if (atomic_dec_and_test(&stop_machine_first)) { + text_poke(tpp->addr, tpp->opcode, tpp->len); + smp_wmb(); /* Make sure other cpus see that this has run */ + wrote_text = 1; + } else { + while (!wrote_text) + cpu_relax(); + smp_mb(); /* Load wrote_text before following execution */ + } + + flush_icache_range((unsigned long)tpp->addr, + (unsigned long)tpp->addr + tpp->len); + return 0; +} + +/** + * text_poke_smp - Update instructions on a live kernel on SMP + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * Modify multi-byte instruction by using stop_machine() on SMP. This allows + * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying + * should be allowed, since stop_machine() does _not_ protect code against + * NMI and MCE. + * + * Note: Must be called under get_online_cpus() and text_mutex. + */ +void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) +{ + struct text_poke_params tpp; + + tpp.addr = addr; + tpp.opcode = opcode; + tpp.len = len; + atomic_set(&stop_machine_first, 1); + wrote_text = 0; + /* Use __stop_machine() because the caller already got online_cpus. */ + stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask); + return addr; +} + diff -uprN linux-2.6.32/arch/x86/kernel/amd_iommu.c linux-2.6.32.ovz/arch/x86/kernel/amd_iommu.c --- linux-2.6.32/arch/x86/kernel/amd_iommu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/amd_iommu.c 2015-03-10 13:24:05.000000000 -0700 @@ -28,8 +28,10 @@ #include #include #include +#include #include #include +#include #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) @@ -49,6 +51,8 @@ static struct protection_domain *pt_doma static struct iommu_ops amd_iommu_ops; +static struct dma_map_ops amd_iommu_dma_ops; + /* * general struct to manage commands send to an IOMMU */ @@ -60,7 +64,7 @@ static int dma_ops_unity_map(struct dma_ struct unity_map_entry *e); static struct dma_ops_domain *find_protection_domain(u16 devid); static u64 *alloc_pte(struct protection_domain *domain, - unsigned long address, int end_lvl, + unsigned long address, unsigned long page_size, u64 **pte_page, gfp_t gfp); static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, @@ -268,6 +272,7 @@ static int __iommu_queue_command(struct u32 tail, head; u8 *target; + WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED); tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); target = iommu->cmd_buf + tail; memcpy_toio(target, cmd, sizeof(*cmd)); @@ -540,11 +545,11 @@ static void flush_all_devices_for_iommu( static void flush_devices_by_domain(struct protection_domain *domain) { struct amd_iommu *iommu; - int i; + unsigned long i; for (i = 0; i <= amd_iommu_last_bdf; ++i) { if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || - (amd_iommu_pd_table[i] != domain)) + (domain != NULL && amd_iommu_pd_table[i] != domain)) continue; iommu = amd_iommu_rlookup_table[i]; @@ -595,31 +600,36 @@ static int iommu_map_page(struct protect unsigned long bus_addr, unsigned long phys_addr, int prot, - int map_size) + unsigned long page_size) { u64 __pte, *pte; - - bus_addr = PAGE_ALIGN(bus_addr); - phys_addr = PAGE_ALIGN(phys_addr); - - BUG_ON(!PM_ALIGNED(map_size, bus_addr)); - BUG_ON(!PM_ALIGNED(map_size, phys_addr)); + int i, count; if (!(prot & IOMMU_PROT_MASK)) return -EINVAL; - pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL); + bus_addr = PAGE_ALIGN(bus_addr); + phys_addr = PAGE_ALIGN(phys_addr); + count = PAGE_SIZE_PTE_COUNT(page_size); + pte = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL); - if (IOMMU_PTE_PRESENT(*pte)) - return -EBUSY; + for (i = 0; i < count; ++i) + if (IOMMU_PTE_PRESENT(pte[i])) + return -EBUSY; + + if (page_size > PAGE_SIZE) { + __pte = PAGE_SIZE_PTE(phys_addr, page_size); + __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC; + } else + __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC; - __pte = phys_addr | IOMMU_PTE_P; if (prot & IOMMU_PROT_IR) __pte |= IOMMU_PTE_IR; if (prot & IOMMU_PROT_IW) __pte |= IOMMU_PTE_IW; - *pte = __pte; + for (i = 0; i < count; ++i) + pte[i] = __pte; update_domain(dom); @@ -688,7 +698,7 @@ static int dma_ops_unity_map(struct dma_ for (addr = e->address_start; addr < e->address_end; addr += PAGE_SIZE) { ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot, - PM_MAP_4k); + PAGE_SIZE); if (ret) return ret; /* @@ -780,7 +790,7 @@ static int alloc_new_range(struct amd_io bool populate, gfp_t gfp) { int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; - int i; + unsigned long i, old_size; #ifdef CONFIG_IOMMU_STRESS populate = false; @@ -805,7 +815,7 @@ static int alloc_new_range(struct amd_io u64 *pte, *pte_page; for (i = 0; i < num_ptes; ++i) { - pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k, + pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE, &pte_page, gfp); if (!pte) goto out_free; @@ -816,8 +826,21 @@ static int alloc_new_range(struct amd_io } } + old_size = dma_dom->aperture_size; dma_dom->aperture_size += APERTURE_RANGE_SIZE; + /* Reserve address range used for MSI messages */ + if (old_size < MSI_ADDR_BASE_LO && + dma_dom->aperture_size > MSI_ADDR_BASE_LO) { + unsigned long spage; + int pages; + + pages = iommu_num_pages(MSI_ADDR_BASE_LO, 0x10000, PAGE_SIZE); + spage = MSI_ADDR_BASE_LO >> PAGE_SHIFT; + + dma_ops_reserve_addresses(dma_dom, spage, pages); + } + /* Intialize the exclusion range if necessary */ if (iommu->exclusion_start && iommu->exclusion_start >= dma_dom->aperture[index]->offset && @@ -842,7 +865,7 @@ static int alloc_new_range(struct amd_io if (!pte || !IOMMU_PTE_PRESENT(*pte)) continue; - dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1); + dma_ops_reserve_addresses(dma_dom, i >> PAGE_SHIFT, 1); } update_domain(&dma_dom->domain); @@ -1020,34 +1043,59 @@ static void dma_ops_reserve_addresses(st } } +#define DEFINE_FREE_PT_FN(LVL, FN) \ +static void free_pt_##LVL (unsigned long __pt) \ +{ \ + unsigned long p; \ + u64 *pt; \ + int i; \ + \ + pt = (u64 *)__pt; \ + \ + for (i = 0; i < 512; ++i) { \ + if (!IOMMU_PTE_PRESENT(pt[i])) \ + continue; \ + \ + p = (unsigned long)IOMMU_PTE_PAGE(pt[i]); \ + FN(p); \ + } \ + free_page((unsigned long)pt); \ +} + +DEFINE_FREE_PT_FN(l2, free_page) +DEFINE_FREE_PT_FN(l3, free_pt_l2) +DEFINE_FREE_PT_FN(l4, free_pt_l3) +DEFINE_FREE_PT_FN(l5, free_pt_l4) +DEFINE_FREE_PT_FN(l6, free_pt_l5) + static void free_pagetable(struct protection_domain *domain) { - int i, j; - u64 *p1, *p2, *p3; - - p1 = domain->pt_root; - - if (!p1) - return; - - for (i = 0; i < 512; ++i) { - if (!IOMMU_PTE_PRESENT(p1[i])) - continue; - - p2 = IOMMU_PTE_PAGE(p1[i]); - for (j = 0; j < 512; ++j) { - if (!IOMMU_PTE_PRESENT(p2[j])) - continue; - p3 = IOMMU_PTE_PAGE(p2[j]); - free_page((unsigned long)p3); - } + unsigned long root = (unsigned long)domain->pt_root; - free_page((unsigned long)p2); + switch (domain->mode) { + case PAGE_MODE_NONE: + break; + case PAGE_MODE_1_LEVEL: + free_page(root); + break; + case PAGE_MODE_2_LEVEL: + free_pt_l2(root); + break; + case PAGE_MODE_3_LEVEL: + free_pt_l3(root); + break; + case PAGE_MODE_4_LEVEL: + free_pt_l4(root); + break; + case PAGE_MODE_5_LEVEL: + free_pt_l5(root); + break; + case PAGE_MODE_6_LEVEL: + free_pt_l6(root); + break; + default: + BUG(); } - - free_page((unsigned long)p1); - - domain->pt_root = NULL; } /* @@ -1208,6 +1256,7 @@ static void attach_device(struct amd_iom */ static void __detach_device(struct protection_domain *domain, u16 devid) { + struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; /* lock domain */ spin_lock(&domain->lock); @@ -1230,12 +1279,16 @@ static void __detach_device(struct prote /* * If we run in passthrough mode the device must be assigned to the - * passthrough domain if it is detached from any other domain + * passthrough domain if it is detached from any other domain. + * Make sure we can deassign from the pt_domain itself. */ - if (iommu_pass_through) { - struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; + if (iommu_pass_through && domain != pt_domain) __attach_device(iommu, pt_domain, devid); - } + + iommu_queue_inv_dev_entry(iommu, devid); + if (domain->dev_cnt == 0) + iommu_flush_tlb_pde(iommu, domain->id); + iommu_completion_wait(iommu); } /* @@ -1273,10 +1326,6 @@ static int device_change_notifier(struct domain = domain_for_device(devid); - if (domain && !dma_ops_domain(domain)) - WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " - "to a non-dma-ops domain\n", dev_name(dev)); - switch (action) { case BUS_NOTIFY_UNBOUND_DRIVER: if (!domain) @@ -1286,6 +1335,12 @@ static int device_change_notifier(struct detach_device(domain, devid); break; case BUS_NOTIFY_ADD_DEVICE: + + if (iommu_pass_through) { + attach_device(iommu, pt_domain, devid); + break; + } + /* allocate a protection domain if a device is added */ dma_domain = find_protection_domain(devid); if (dma_domain) @@ -1299,6 +1354,8 @@ static int device_change_notifier(struct list_add_tail(&dma_domain->list, &iommu_pd_list); spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + dev->archdata.dma_ops = &amd_iommu_dma_ops; + break; default: goto out; @@ -1315,6 +1372,11 @@ static struct notifier_block device_nb = .notifier_call = device_change_notifier, }; +void amd_iommu_init_notifier(void) +{ + bus_register_notifier(&pci_bus_type, &device_nb); +} + /***************************************************************************** * * The next functions belong to the dma_ops mapping/unmapping code. @@ -1466,18 +1528,22 @@ static bool increase_address_space(struc static u64 *alloc_pte(struct protection_domain *domain, unsigned long address, - int end_lvl, + unsigned long page_size, u64 **pte_page, gfp_t gfp) { + int level, end_lvl; u64 *pte, *page; - int level; + + BUG_ON(!is_power_of_2(page_size)); while (address > PM_LEVEL_SIZE(domain->mode)) increase_address_space(domain, gfp); - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; + address = PAGE_SIZE_ALIGN(address, page_size); + end_lvl = PAGE_SIZE_LEVEL(page_size); while (level > end_lvl) { if (!IOMMU_PTE_PRESENT(*pte)) { @@ -1487,6 +1553,10 @@ static u64 *alloc_pte(struct protection_ *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); } + /* No level skipping support yet */ + if (PM_PTE_LEVEL(*pte) != level) + return NULL; + level -= 1; pte = IOMMU_PTE_PAGE(*pte); @@ -1515,7 +1585,7 @@ static u64* dma_ops_get_pte(struct dma_o pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; if (!pte) { - pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page, + pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page, GFP_ATOMIC); aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; } else @@ -1687,6 +1757,7 @@ static void __unmap_single(struct amd_io size_t size, int dir) { + dma_addr_t flush_addr; dma_addr_t i, start; unsigned int pages; @@ -1694,6 +1765,7 @@ static void __unmap_single(struct amd_io (dma_addr + size > dma_dom->aperture_size)) return; + flush_addr = dma_addr; pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); dma_addr &= PAGE_MASK; start = dma_addr; @@ -1708,7 +1780,7 @@ static void __unmap_single(struct amd_io dma_ops_free_addresses(dma_dom, dma_addr, pages); if (amd_iommu_unmap_flush || dma_dom->need_flush) { - iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); + iommu_flush_pages(iommu, dma_dom->domain.id, flush_addr, size); dma_dom->need_flush = false; } } @@ -2047,10 +2119,10 @@ static void prealloc_protection_domains( struct pci_dev *dev = NULL; struct dma_ops_domain *dma_dom; struct amd_iommu *iommu; - u16 devid; + u16 devid, __devid; while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - devid = calc_devid(dev->bus->number, dev->devfn); + __devid = devid = calc_devid(dev->bus->number, dev->devfn); if (devid > amd_iommu_last_bdf) continue; devid = amd_iommu_alias_table[devid]; @@ -2065,6 +2137,10 @@ static void prealloc_protection_domains( init_unity_mappings_for_device(dma_dom, devid); dma_dom->target_dev = devid; + attach_device(iommu, &dma_dom->domain, devid); + if (__devid != devid) + attach_device(iommu, &dma_dom->domain, __devid); + list_add_tail(&dma_dom->list, &iommu_pd_list); } } @@ -2079,13 +2155,36 @@ static struct dma_map_ops amd_iommu_dma_ .dma_supported = amd_iommu_dma_supported, }; +static unsigned device_dma_ops_init(void) +{ + struct pci_dev *pdev = NULL; + unsigned unhandled = 0; + + for_each_pci_dev(pdev) { + if (!check_device(&pdev->dev)) { + unhandled += 1; + continue; + } + + pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops; + } + + return unhandled; +} + /* * The function which clues the AMD IOMMU driver into dma_ops. */ + +void __init amd_iommu_init_api(void) +{ + register_iommu(&amd_iommu_ops); +} + int __init amd_iommu_init_dma_ops(void) { struct amd_iommu *iommu; - int ret; + int ret, unhandled; /* * first allocate a default protection domain for every IOMMU we @@ -2112,17 +2211,13 @@ int __init amd_iommu_init_dma_ops(void) iommu_detected = 1; force_iommu = 1; bad_dma_address = 0; -#ifdef CONFIG_GART_IOMMU - gart_iommu_aperture_disabled = 1; - gart_iommu_aperture = 0; -#endif /* Make the driver finally visible to the drivers */ - dma_ops = &amd_iommu_dma_ops; - - register_iommu(&amd_iommu_ops); - - bus_register_notifier(&pci_bus_type, &device_nb); + unhandled = device_dma_ops_init(); + if (unhandled && max_pfn > MAX_DMA32_PFN) { + /* There are unhandled devices - initialize swiotlb for them */ + swiotlb = 1; + } amd_iommu_stats_init(); @@ -2231,9 +2326,7 @@ static void amd_iommu_domain_destroy(str free_pagetable(domain); - domain_id_free(domain->id); - - kfree(domain); + protection_domain_free(domain); dom->priv = NULL; } @@ -2280,7 +2373,7 @@ static int amd_iommu_attach_device(struc devid = calc_devid(pdev->bus->number, pdev->devfn); - if (devid >= amd_iommu_last_bdf || + if (devid > amd_iommu_last_bdf || devid != amd_iommu_alias_table[devid]) return -EINVAL; @@ -2317,7 +2410,7 @@ static int amd_iommu_map_range(struct io paddr &= PAGE_MASK; for (i = 0; i < npages; ++i) { - ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k); + ret = iommu_map_page(domain, iova, paddr, prot, PAGE_SIZE); if (ret) return ret; @@ -2375,8 +2468,8 @@ static struct iommu_ops amd_iommu_ops = .domain_destroy = amd_iommu_domain_destroy, .attach_dev = amd_iommu_attach_device, .detach_dev = amd_iommu_detach_device, - .map = amd_iommu_map_range, - .unmap = amd_iommu_unmap_range, + .map_range = amd_iommu_map_range, + .unmap_range = amd_iommu_unmap_range, .iova_to_phys = amd_iommu_iova_to_phys, .domain_has_cap = amd_iommu_domain_has_cap, }; diff -uprN linux-2.6.32/arch/x86/kernel/amd_iommu_init.c linux-2.6.32.ovz/arch/x86/kernel/amd_iommu_init.c --- linux-2.6.32/arch/x86/kernel/amd_iommu_init.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/amd_iommu_init.c 2015-03-10 13:23:55.000000000 -0700 @@ -29,6 +29,7 @@ #include #include #include +#include /* * definitions for the ACPI scanning code @@ -119,6 +120,10 @@ bool amd_iommu_dump; static int __initdata amd_iommu_detected; +/* original values for gart fallback on initialization failure */ +static int __initdata __gart_iommu_aperture_disabled; +static int __initdata __gart_iommu_aperture; + u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings @@ -130,12 +135,22 @@ bool amd_iommu_isolate = true; /* if tr enabled */ #endif +static bool amd_iommu_force_nopt; /* force isolation (no-pt) mode + of iommu */ + +static bool amd_iommu_enable; /* enable AMD iommu */ + bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ /* + * Set to true if ACPI table parsing and hardware intialization went properly + */ +static bool amd_iommu_initialized; + +/* * Pointer to the device table which is shared by all AMD IOMMUs * it is indexed by the PCI device id or the HT unit id and contains * information about the domain the device belongs to as well as the @@ -186,6 +201,39 @@ static inline unsigned long tbl_size(int return 1UL << shift; } +/* Access to l1 and l2 indexed register spaces */ + +static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address) +{ + u32 val; + + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); + pci_read_config_dword(iommu->dev, 0xfc, &val); + return val; +} + +static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val) +{ + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31)); + pci_write_config_dword(iommu->dev, 0xfc, val); + pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16)); +} + +static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address) +{ + u32 val; + + pci_write_config_dword(iommu->dev, 0xf0, address); + pci_read_config_dword(iommu->dev, 0xf4, &val); + return val; +} + +static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val) +{ + pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8)); + pci_write_config_dword(iommu->dev, 0xf4, val); +} + /**************************************************************************** * * AMD IOMMU MMIO register space handling functions @@ -279,8 +327,12 @@ static u8 * __init iommu_map_mmio_space( { u8 *ret; - if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) + if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) { + pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n", + address); + pr_err("AMD-Vi: This is a BIOS bug. Please complain to your hardware vendor\n"); return NULL; + } ret = ioremap_nocache(address, MMIO_REGION_LENGTH); if (ret != NULL) @@ -429,7 +481,7 @@ static u8 * __init alloc_command_buffer( if (cmd_buf == NULL) return NULL; - iommu->cmd_buf_size = CMD_BUFFER_SIZE; + iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED; return cmd_buf; } @@ -465,12 +517,13 @@ static void iommu_enable_command_buffer( &entry, sizeof(entry)); amd_iommu_reset_cmd_buffer(iommu); + iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED); } static void __init free_command_buffer(struct amd_iommu *iommu) { free_pages((unsigned long)iommu->cmd_buf, - get_order(iommu->cmd_buf_size)); + get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED))); } /* allocates the memory where the IOMMU will log its events to */ @@ -604,6 +657,7 @@ static void __init init_iommu_from_pci(s { int cap_ptr = iommu->cap_ptr; u32 range, misc; + int i, j; pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, &iommu->cap); @@ -617,6 +671,30 @@ static void __init init_iommu_from_pci(s iommu->last_device = calc_devid(MMIO_GET_BUS(range), MMIO_GET_LD(range)); iommu->evt_msi_num = MMIO_MSI_NUM(misc); + + if (!is_rd890_iommu(iommu->dev)) + return; + + /* + * Some rd890 systems may not be fully reconfigured by the BIOS, so + * it's necessary for us to store this information so it can be + * reprogrammed on resume + */ + + pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4, + &iommu->stored_addr_lo); + pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8, + &iommu->stored_addr_hi); + + /* Low bit locks writes to configuration space */ + iommu->stored_addr_lo &= ~1; + + for (i = 0; i < 6; i++) + for (j = 0; j < 0x12; j++) + iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j); + + for (i = 0; i < 0x83; i++) + iommu->stored_l2[i] = iommu_read_l2(iommu, i); } /* @@ -628,35 +706,15 @@ static void __init init_iommu_from_acpi( { u8 *p = (u8 *)h; u8 *end = p, flags = 0; - u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; - u32 ext_flags = 0; + u16 devid = 0, devid_start = 0, devid_to = 0; + u32 dev_i, ext_flags = 0; bool alias = false; struct ivhd_entry *e; /* - * First set the recommended feature enable bits from ACPI - * into the IOMMU control registers + * First save the recommended feature enable bits from ACPI */ - h->flags & IVHD_FLAG_HT_TUN_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) : - iommu_feature_disable(iommu, CONTROL_HT_TUN_EN); - - h->flags & IVHD_FLAG_PASSPW_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_PASSPW_EN) : - iommu_feature_disable(iommu, CONTROL_PASSPW_EN); - - h->flags & IVHD_FLAG_RESPASSPW_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) : - iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN); - - h->flags & IVHD_FLAG_ISOC_EN_MASK ? - iommu_feature_enable(iommu, CONTROL_ISOC_EN) : - iommu_feature_disable(iommu, CONTROL_ISOC_EN); - - /* - * make IOMMU memory accesses cache coherent - */ - iommu_feature_enable(iommu, CONTROL_COHERENT_EN); + iommu->acpi_flags = h->flags; /* * Done. Now parse the device entries @@ -804,7 +862,7 @@ static void __init init_iommu_from_acpi( /* Initializes the device->iommu mapping for the driver */ static int __init init_iommu_devices(struct amd_iommu *iommu) { - u16 i; + u32 i; for (i = iommu->first_device; i <= iommu->last_device; ++i) set_iommu_for_device(iommu, i); @@ -847,6 +905,9 @@ static int __init init_iommu_one(struct if (!iommu->dev) return 1; + iommu->root_pdev = pci_get_bus_and_slot(iommu->dev->bus->number, + PCI_DEVFN(0, 0)); + iommu->cap_ptr = h->cap_ptr; iommu->pci_seg = h->pci_seg; iommu->mmio_phys = h->mmio_phys; @@ -913,6 +974,8 @@ static int __init init_iommu_all(struct } WARN_ON(p != end); + amd_iommu_initialized = true; + return 0; } @@ -925,12 +988,13 @@ static int __init init_iommu_all(struct * ****************************************************************************/ -static int __init iommu_setup_msi(struct amd_iommu *iommu) +static int iommu_setup_msi(struct amd_iommu *iommu) { int r; - if (pci_enable_msi(iommu->dev)) - return 1; + r = pci_enable_msi(iommu->dev); + if (r) + return r; r = request_irq(iommu->dev->irq, amd_iommu_int_handler, IRQF_SAMPLE_RANDOM, @@ -939,24 +1003,33 @@ static int __init iommu_setup_msi(struct if (r) { pci_disable_msi(iommu->dev); - return 1; + return r; } iommu->int_enabled = true; - iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); return 0; } static int iommu_init_msi(struct amd_iommu *iommu) { + int ret; + if (iommu->int_enabled) - return 0; + goto enable_faults; if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) - return iommu_setup_msi(iommu); + ret = iommu_setup_msi(iommu); + else + ret = -ENODEV; - return 1; + if (ret) + return ret; + +enable_faults: + iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); + + return 0; } /**************************************************************************** @@ -1074,7 +1147,7 @@ static int __init init_memory_definition */ static void init_device_table(void) { - u16 devid; + u32 devid; for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { set_dev_entry_bit(devid, DEV_ENTRY_VALID); @@ -1082,6 +1155,73 @@ static void init_device_table(void) } } +static void iommu_init_flags(struct amd_iommu *iommu) +{ + iommu->acpi_flags & IVHD_FLAG_HT_TUN_EN_MASK ? + iommu_feature_enable(iommu, CONTROL_HT_TUN_EN) : + iommu_feature_disable(iommu, CONTROL_HT_TUN_EN); + + iommu->acpi_flags & IVHD_FLAG_PASSPW_EN_MASK ? + iommu_feature_enable(iommu, CONTROL_PASSPW_EN) : + iommu_feature_disable(iommu, CONTROL_PASSPW_EN); + + iommu->acpi_flags & IVHD_FLAG_RESPASSPW_EN_MASK ? + iommu_feature_enable(iommu, CONTROL_RESPASSPW_EN) : + iommu_feature_disable(iommu, CONTROL_RESPASSPW_EN); + + iommu->acpi_flags & IVHD_FLAG_ISOC_EN_MASK ? + iommu_feature_enable(iommu, CONTROL_ISOC_EN) : + iommu_feature_disable(iommu, CONTROL_ISOC_EN); + + /* + * make IOMMU memory accesses cache coherent + */ + iommu_feature_enable(iommu, CONTROL_COHERENT_EN); +} + +static void iommu_apply_resume_quirks(struct amd_iommu *iommu) +{ + int i, j; + u32 ioc_feature_control; + struct pci_dev *pdev = iommu->root_pdev; + + /* RD890 BIOSes may not have completely reconfigured the iommu */ + if (!is_rd890_iommu(iommu->dev) || !pdev) + return; + + /* + * First, we need to ensure that the iommu is enabled. This is + * controlled by a register in the northbridge + */ + + /* Select Northbridge indirect register 0x75 and enable writing */ + pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7)); + pci_read_config_dword(pdev, 0x64, &ioc_feature_control); + + /* Enable the iommu */ + if (!(ioc_feature_control & 0x1)) + pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1); + + /* Restore the iommu BAR */ + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, + iommu->stored_addr_lo); + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8, + iommu->stored_addr_hi); + + /* Restore the l1 indirect regs for each of the 6 l1s */ + for (i = 0; i < 6; i++) + for (j = 0; j < 0x12; j++) + iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]); + + /* Restore the l2 indirect regs */ + for (i = 0; i < 0x83; i++) + iommu_write_l2(iommu, i, iommu->stored_l2[i]); + + /* Lock PCI setup registers */ + pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4, + iommu->stored_addr_lo | 1); +} + /* * This function finally enables all IOMMUs found in the system after * they have been initialized @@ -1092,6 +1232,7 @@ static void enable_iommus(void) for_each_iommu(iommu) { iommu_disable(iommu); + iommu_init_flags(iommu); iommu_set_device_table(iommu); iommu_enable_command_buffer(iommu); iommu_enable_event_buffer(iommu); @@ -1116,6 +1257,11 @@ static void disable_iommus(void) static int amd_iommu_resume(struct sys_device *dev) { + struct amd_iommu *iommu; + + for_each_iommu(iommu) + iommu_apply_resume_quirks(iommu); + /* re-load the hardware */ enable_iommus(); @@ -1240,9 +1386,6 @@ int __init amd_iommu_init(void) if (amd_iommu_pd_alloc_bitmap == NULL) goto free; - /* init the device table */ - init_device_table(); - /* * let all alias entries point to itself */ @@ -1263,6 +1406,9 @@ int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", init_iommu_all) != 0) goto free; + if (!amd_iommu_initialized) + goto free; + if (acpi_table_parse("IVRS", init_memory_definitions) != 0) goto free; @@ -1274,14 +1420,23 @@ int __init amd_iommu_init(void) if (ret) goto free; + enable_iommus(); + if (iommu_pass_through) ret = amd_iommu_init_passthrough(); else ret = amd_iommu_init_dma_ops(); + if (ret) - goto free; + goto disable; - enable_iommus(); + init_device_table(); + amd_iommu_flush_all_devices(); + amd_iommu_flush_all_domains(); + + amd_iommu_init_api(); + + amd_iommu_init_notifier(); if (iommu_pass_through) goto out; @@ -1300,6 +1455,8 @@ int __init amd_iommu_init(void) out: return ret; +disable: + disable_iommus(); free: free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, get_order(MAX_DOMAIN_ID/8)); @@ -1320,6 +1477,15 @@ free: free_unity_maps(); +#ifdef CONFIG_GART_IOMMU + /* + * We failed to initialize the AMD IOMMU - try fallback to GART + * if possible. + */ + gart_iommu_aperture_disabled = __gart_iommu_aperture_disabled; + gart_iommu_aperture = __gart_iommu_aperture; +#endif + goto out; } @@ -1345,13 +1511,34 @@ void __init amd_iommu_detect(void) if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) return; + if (!amd_iommu_enable) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + printk(KERN_INFO "AMD-Vi disabled by default: pass amd_iommu=on to enable\n"); + return; + } + if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { iommu_detected = 1; amd_iommu_detected = 1; #ifdef CONFIG_GART_IOMMU + __gart_iommu_aperture_disabled = gart_iommu_aperture_disabled; + __gart_iommu_aperture = gart_iommu_aperture; gart_iommu_aperture_disabled = 1; gart_iommu_aperture = 0; #endif + /* Make sure ACS will be enabled */ + pci_request_acs(); + + /* + * The AMD IOMMU driver forces the passthrough mode by default + * for performance reasons. Since we need a way to switch back + * to the isolation mode we reuse amd_iommu={isolate|share} for + * that. + */ + iommu_pass_through = amd_iommu_force_nopt ? 0 : 1; + + if (iommu_pass_through && max_pfn > MAX_DMA32_PFN) + swiotlb = 1; } } @@ -1372,12 +1559,18 @@ static int __init parse_amd_iommu_dump(c static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { - if (strncmp(str, "isolate", 7) == 0) + if (strncmp(str, "isolate", 7) == 0) { amd_iommu_isolate = true; - if (strncmp(str, "share", 5) == 0) + amd_iommu_force_nopt = true; + } + if (strncmp(str, "share", 5) == 0) { amd_iommu_isolate = false; + amd_iommu_force_nopt = true; + } if (strncmp(str, "fullflush", 9) == 0) amd_iommu_unmap_flush = true; + if (strncmp(str, "on", 2) == 0) + amd_iommu_enable = true; } return 1; diff -uprN linux-2.6.32/arch/x86/kernel/amd_nb.c linux-2.6.32.ovz/arch/x86/kernel/amd_nb.c --- linux-2.6.32/arch/x86/kernel/amd_nb.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/amd_nb.c 2015-03-10 13:24:08.000000000 -0700 @@ -0,0 +1,287 @@ +/* + * Shared support code for AMD K8 northbridges and derivates. + * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. + */ +#include +#include +#include +#include +#include +#include +#include + +static u32 *flush_words; + +const struct pci_device_id amd_nb_misc_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, + {} +}; +EXPORT_SYMBOL(amd_nb_misc_ids); + +static struct pci_device_id amd_nb_link_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, + {} +}; + +const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { + { 0x00, 0x18, 0x20 }, + { 0xff, 0x00, 0x20 }, + { 0xfe, 0x00, 0x20 }, + { } +}; + +struct amd_northbridge_info amd_northbridges; +EXPORT_SYMBOL(amd_northbridges); + +static struct pci_dev *next_northbridge(struct pci_dev *dev, + const struct pci_device_id *ids) +{ + do { + dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); + if (!dev) + break; + } while (!pci_match_id(ids, dev)); + return dev; +} + +int amd_cache_northbridges(void) +{ + u16 i = 0; + struct amd_northbridge *nb; + struct pci_dev *misc, *link; + + if (amd_nb_num()) + return 0; + + misc = NULL; + while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL) + i++; + + if (i == 0) + return 0; + + nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL); + if (!nb) + return -ENOMEM; + + amd_northbridges.nb = nb; + amd_northbridges.num = i; + + link = misc = NULL; + for (i = 0; i != amd_nb_num(); i++) { + node_to_amd_nb(i)->misc = misc = + next_northbridge(misc, amd_nb_misc_ids); + node_to_amd_nb(i)->link = link = + next_northbridge(link, amd_nb_link_ids); + } + + if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 || + boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_GART; + + /* + * Some CPU families support L3 Cache Index Disable. There are some + * limitations because of E382 and E388 on family 0x10. + */ + if (boot_cpu_data.x86 == 0x10 && + boot_cpu_data.x86_model >= 0x8 && + (boot_cpu_data.x86_model > 0x9 || + boot_cpu_data.x86_mask >= 0x1)) + amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; + + if (boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE; + + /* L3 cache partitioning is supported on family 0x15 */ + if (boot_cpu_data.x86 == 0x15) + amd_northbridges.flags |= AMD_NB_L3_PARTITIONING; + + return 0; +} +EXPORT_SYMBOL_GPL(amd_cache_northbridges); + +/* + * Ignores subdevice/subvendor but as far as I can figure out + * they're useless anyways + */ +bool __init early_is_amd_nb(u32 device) +{ + const struct pci_device_id *id; + u32 vendor = device & 0xffff; + + device >>= 16; + for (id = amd_nb_misc_ids; id->vendor; id++) + if (vendor == id->vendor && device == id->device) + return true; + return false; +} + +struct resource *amd_get_mmconfig_range(struct resource *res) +{ + u32 address; + u64 base, msr; + unsigned segn_busn_bits; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return NULL; + + /* assume all cpus from fam10h have mmconfig */ + if (boot_cpu_data.x86 < 0x10) + return NULL; + + address = MSR_FAM10H_MMIO_CONF_BASE; + rdmsrl(address, msr); + + /* mmconfig is not enabled */ + if (!(msr & FAM10H_MMIO_CONF_ENABLE)) + return NULL; + + base = msr & (FAM10H_MMIO_CONF_BASE_MASK<> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & + FAM10H_MMIO_CONF_BUSRANGE_MASK; + + res->flags = IORESOURCE_MEM; + res->start = base; + res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1; + return res; +} + +int amd_get_subcaches(int cpu) +{ + struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; + unsigned int mask; + int cuid = 0; + + if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return 0; + + pci_read_config_dword(link, 0x1d4, &mask); + +#ifdef CONFIG_SMP + cuid = cpu_data(cpu).compute_unit_id; +#endif + return (mask >> (4 * cuid)) & 0xf; +} + +int amd_set_subcaches(int cpu, int mask) +{ + static unsigned int reset, ban; + struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); + unsigned int reg; + int cuid = 0; + + if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) + return -EINVAL; + + /* if necessary, collect reset state of L3 partitioning and BAN mode */ + if (reset == 0) { + pci_read_config_dword(nb->link, 0x1d4, &reset); + pci_read_config_dword(nb->misc, 0x1b8, &ban); + ban &= 0x180000; + } + + /* deactivate BAN mode if any subcaches are to be disabled */ + if (mask != 0xf) { + pci_read_config_dword(nb->misc, 0x1b8, ®); + pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); + } + +#ifdef CONFIG_SMP + cuid = cpu_data(cpu).compute_unit_id; +#endif + mask <<= 4 * cuid; + mask |= (0xf ^ (1 << cuid)) << 26; + + pci_write_config_dword(nb->link, 0x1d4, mask); + + /* reset BAN mode if L3 partitioning returned to reset state */ + pci_read_config_dword(nb->link, 0x1d4, ®); + if (reg == reset) { + pci_read_config_dword(nb->misc, 0x1b8, ®); + reg &= ~0x180000; + pci_write_config_dword(nb->misc, 0x1b8, reg | ban); + } + + return 0; +} + +static int amd_cache_gart(void) +{ + u16 i; + + if (!amd_nb_has_feature(AMD_NB_GART)) + return 0; + + flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); + if (!flush_words) { + amd_northbridges.flags &= ~AMD_NB_GART; + return -ENOMEM; + } + + for (i = 0; i != amd_nb_num(); i++) + pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, + &flush_words[i]); + + return 0; +} + +void amd_flush_garts(void) +{ + int flushed, i; + unsigned long flags; + static DEFINE_SPINLOCK(gart_lock); + + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + /* Avoid races between AGP and IOMMU. In theory it's not needed + but I'm not sure if the hardware won't lose flush requests + when another is pending. This whole thing is so expensive anyways + that it doesn't matter to serialize more. -AK */ + spin_lock_irqsave(&gart_lock, flags); + flushed = 0; + for (i = 0; i < amd_nb_num(); i++) { + pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c, + flush_words[i] | 1); + flushed++; + } + for (i = 0; i < amd_nb_num(); i++) { + u32 w; + /* Make sure the hardware actually executed the flush*/ + for (;;) { + pci_read_config_dword(node_to_amd_nb(i)->misc, + 0x9c, &w); + if (!(w & 1)) + break; + cpu_relax(); + } + } + spin_unlock_irqrestore(&gart_lock, flags); + if (!flushed) + printk("nothing to flush?\n"); +} +EXPORT_SYMBOL_GPL(amd_flush_garts); + +static __init int init_amd_nbs(void) +{ + int err = 0; + + err = amd_cache_northbridges(); + + if (err < 0) + printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n"); + + if (amd_cache_gart() < 0) + printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, " + "GART support disabled.\n"); + + return err; +} + +/* This has to go after the PCI subsystem */ +fs_initcall(init_amd_nbs); diff -uprN linux-2.6.32/arch/x86/kernel/aperture_64.c linux-2.6.32.ovz/arch/x86/kernel/aperture_64.c --- linux-2.6.32/arch/x86/kernel/aperture_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/aperture_64.c 2015-03-10 13:24:08.000000000 -0700 @@ -27,7 +27,7 @@ #include #include #include -#include +#include int gart_iommu_aperture; int gart_iommu_aperture_disabled __initdata; @@ -38,18 +38,6 @@ int fallback_aper_force __initdata; int fix_aperture __initdata = 1; -struct bus_dev_range { - int bus; - int dev_base; - int dev_limit; -}; - -static struct bus_dev_range bus_dev_ranges[] __initdata = { - { 0x00, 0x18, 0x20}, - { 0xff, 0x00, 0x20}, - { 0xfe, 0x00, 0x20} -}; - static struct resource gart_resource = { .name = "GART", .flags = IORESOURCE_MEM, @@ -205,7 +193,7 @@ static u32 __init read_agp(int bus, int * Do an PCI bus scan by hand because we're running before the PCI * subsystem. * - * All K8 AGP bridges are AGPv3 compliant, so we can do this scan + * All AMD AGP bridges are AGPv3 compliant, so we can do this scan * generically. It's probably overkill to always scan all slots because * the AGP bridges should be always an own bus on the HT hierarchy, * but do it here for future safety. @@ -290,16 +278,16 @@ void __init early_gart_iommu_check(void) /* This is mostly duplicate of iommu_hole_init */ fix = 0; - for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { + for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) { int bus; int dev_base, dev_limit; - bus = bus_dev_ranges[i].bus; - dev_base = bus_dev_ranges[i].dev_base; - dev_limit = bus_dev_ranges[i].dev_limit; + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); @@ -345,16 +333,16 @@ void __init early_gart_iommu_check(void) return; /* different nodes have different setting, disable them all at first*/ - for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { + for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { int bus; int dev_base, dev_limit; - bus = bus_dev_ranges[i].bus; - dev_base = bus_dev_ranges[i].dev_base; - dev_limit = bus_dev_ranges[i].dev_limit; + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); @@ -386,22 +374,35 @@ void __init gart_iommu_hole_init(void) fix = 0; node = 0; - for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { + for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { int bus; int dev_base, dev_limit; + u32 ctl; - bus = bus_dev_ranges[i].bus; - dev_base = bus_dev_ranges[i].dev_base; - dev_limit = bus_dev_ranges[i].dev_limit; + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; iommu_detected = 1; gart_iommu_aperture = 1; - aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; + ctl = read_pci_config(bus, slot, 3, + AMD64_GARTAPERTURECTL); + + /* + * Before we do anything else disable the GART. It may + * still be enabled if we boot into a crash-kernel here. + * Reconfiguring the GART while it is enabled could have + * unknown side-effects. + */ + ctl &= ~GARTEN; + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); + + aper_order = (ctl >> 1) & 7; aper_size = (32 * 1024 * 1024) << aper_order; aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; aper_base <<= 25; @@ -488,21 +489,23 @@ out: } /* Fix up the north bridges */ - for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { - int bus; - int dev_base, dev_limit; + for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) { + int bus, dev_base, dev_limit; - bus = bus_dev_ranges[i].bus; - dev_base = bus_dev_ranges[i].dev_base; - dev_limit = bus_dev_ranges[i].dev_limit; + /* + * Don't enable translation yet but enable GART IO and CPU + * accesses and set DISTLBWALKPRB since GART table memory is UC. + */ + u32 ctl = aper_order << 1; + + bus = amd_nb_bus_dev_ranges[i].bus; + dev_base = amd_nb_bus_dev_ranges[i].dev_base; + dev_limit = amd_nb_bus_dev_ranges[i].dev_limit; for (slot = dev_base; slot < dev_limit; slot++) { - if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) + if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00))) continue; - /* Don't enable translation yet. That is done later. - Assume this BIOS didn't initialise the GART so - just overwrite all previous bits */ - write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1); + write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); } } diff -uprN linux-2.6.32/arch/x86/kernel/apic/apic.c linux-2.6.32.ovz/arch/x86/kernel/apic/apic.c --- linux-2.6.32/arch/x86/kernel/apic/apic.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/apic/apic.c 2015-03-10 13:24:13.000000000 -0700 @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -50,7 +51,9 @@ #include #include #include -#include +#include +#include +#include unsigned int num_processors; @@ -58,15 +61,10 @@ unsigned disabled_cpus __cpuinitdata; /* Processor that is doing the boot up */ unsigned int boot_cpu_physical_apicid = -1U; +EXPORT_SYMBOL_GPL(boot_cpu_physical_apicid); /* * The highest APIC ID seen during enumeration. - * - * On AMD, this determines the messaging protocol we can use: if all APIC IDs - * are in the 0 ... 7 range, then we can use logical addressing which - * has some performance advantages (better broadcasting). - * - * If there's an APIC ID above 8, we use physical addressing. */ unsigned int max_physical_apicid; @@ -76,6 +74,13 @@ unsigned int max_physical_apicid; physid_mask_t phys_cpu_present_map; /* + * Processor to be disabled specified by kernel parameter + * disable_cpu_apicid=, mostly used for the kdump 2nd kernel to + * avoid undefined behaviour caused by sending INIT from AP to BSP. + */ +static unsigned int disabled_cpu_apicid __read_mostly = BAD_APICID; + +/* * Map cpu index to physical APIC ID */ DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID); @@ -181,7 +186,7 @@ static struct resource lapic_resource = .flags = IORESOURCE_MEM | IORESOURCE_BUSY, }; -static unsigned int calibration_result; +unsigned int lapic_timer_frequency = 0; static int lapic_next_event(unsigned long delta, struct clock_event_device *evt); @@ -246,7 +251,7 @@ static int modern_apic(void) */ static void native_apic_write_dummy(u32 reg, u32 v) { - WARN_ON_ONCE((cpu_has_apic || !disable_apic)); + WARN_ON_ONCE(cpu_has_apic && !disable_apic); } static u32 native_apic_read_dummy(u32 reg) @@ -263,6 +268,7 @@ void apic_disable(void) { apic->read = native_apic_read_dummy; apic->write = native_apic_write_dummy; + apic->eoi_write = native_apic_write_dummy; } void native_apic_wait_icr_idle(void) @@ -289,8 +295,12 @@ u32 native_safe_apic_wait_icr_idle(void) void native_apic_icr_write(u32 low, u32 id) { + unsigned long flags; + + local_irq_save(flags); apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); apic_write(APIC_ICR, low); + local_irq_restore(flags); } u64 native_apic_icr_read(void) @@ -390,38 +400,88 @@ static void __setup_APIC_LVTT(unsigned i } /* - * Setup extended LVT, AMD specific (K8, family 10h) + * Setup extended LVT, AMD specific * - * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and - * MCE interrupts are supported. Thus MCE offset must be set to 0. + * Software should use the LVT offsets the BIOS provides. The offsets + * are determined by the subsystems using it like those for MCE + * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts + * are supported. Beginning with family 10h at least 4 offsets are + * available. * - * If mask=1, the LVT entry does not generate interrupts while mask=0 - * enables the vector. See also the BKDGs. + * Since the offsets must be consistent for all cores, we keep track + * of the LVT offsets in software and reserve the offset for the same + * vector also to be used on other cores. An offset is freed by + * setting the entry to APIC_EILVT_MASKED. + * + * If the BIOS is right, there should be no conflicts. Otherwise a + * "[Firmware Bug]: ..." error message is generated. However, if + * software does not properly determines the offsets, it is not + * necessarily a BIOS bug. */ -#define APIC_EILVT_LVTOFF_MCE 0 -#define APIC_EILVT_LVTOFF_IBS 1 +static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX]; -static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) +static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new) { - unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0); - unsigned int v = (mask << 16) | (msg_type << 8) | vector; - - apic_write(reg, v); + return (old & APIC_EILVT_MASKED) + || (new == APIC_EILVT_MASKED) + || ((new & ~APIC_EILVT_MASKED) == old); } -u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) +static unsigned int reserve_eilvt_offset(int offset, unsigned int new) { - setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_MCE; + unsigned int rsvd; /* 0: uninitialized */ + + if (offset >= APIC_EILVT_NR_MAX) + return ~0; + + rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED; + do { + if (rsvd && + !eilvt_entry_is_changeable(rsvd, new)) + /* may not change if vectors are different */ + return rsvd; + rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new); + } while (rsvd != new); + + return new; } -u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) +/* + * If mask=1, the LVT entry does not generate interrupts while mask=0 + * enables the vector. See also the BKDGs. + */ + +int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask) { - setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_IBS; + unsigned long reg = APIC_EILVTn(offset); + unsigned int new, old, reserved; + + new = (mask << 16) | (msg_type << 8) | vector; + old = apic_read(reg); + reserved = reserve_eilvt_offset(offset, new); + + if (reserved != new) { + pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " + "vector 0x%x, but the register is already in use for " + "vector 0x%x on another cpu\n", + smp_processor_id(), reg, offset, new, reserved); + return -EINVAL; + } + + if (!eilvt_entry_is_changeable(old, new)) { + pr_err(FW_BUG "cpu %d, try to use APIC%lX (LVT offset %d) for " + "vector 0x%x, but the register is already in use for " + "vector 0x%x on this cpu\n", + smp_processor_id(), reg, offset, new, old); + return -EBUSY; + } + + apic_write(reg, new); + + return 0; } -EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); +EXPORT_SYMBOL_GPL(setup_APIC_eilvt); /* * Program the next event, relative to now @@ -451,7 +511,7 @@ static void lapic_timer_setup(enum clock switch (mode) { case CLOCK_EVT_MODE_PERIODIC: case CLOCK_EVT_MODE_ONESHOT: - __setup_APIC_LVTT(calibration_result, + __setup_APIC_LVTT(lapic_timer_frequency, mode != CLOCK_EVT_MODE_PERIODIC, 1); break; case CLOCK_EVT_MODE_UNUSED: @@ -602,7 +662,7 @@ calibrate_by_pmtimer(long deltapm, long res = (((u64)(*deltatsc)) * pm_100ms); do_div(res, deltapm); apic_printk(APIC_VERBOSE, "TSC delta adjusted to " - "PM-Timer: %lu (%ld) \n", + "PM-Timer: %lu (%ld)\n", (unsigned long)res, *deltatsc); *deltatsc = (long)res; } @@ -618,6 +678,25 @@ static int __init calibrate_APIC_clock(v long delta, deltatsc; int pm_referenced = 0; + /** + * check if lapic timer has already been calibrated by platform + * specific routine, such as tsc calibration code. if so, we just fill + * in the clockevent structure and return. + */ + + if (lapic_timer_frequency) { + apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", + lapic_timer_frequency); + lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, + TICK_NSEC, lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + return 0; + } + local_irq_disable(); /* Replace the global interrupt handler */ @@ -655,16 +734,16 @@ static int __init calibrate_APIC_clock(v lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, lapic_clockevent.shift); lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); lapic_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &lapic_clockevent); - calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", - calibration_result); + lapic_timer_frequency); if (cpu_has_tsc) { apic_printk(APIC_VERBOSE, "..... CPU clock speed is " @@ -675,13 +754,13 @@ static int __init calibrate_APIC_clock(v apic_printk(APIC_VERBOSE, "..... host bus clock speed is " "%u.%04u MHz.\n", - calibration_result / (1000000 / HZ), - calibration_result % (1000000 / HZ)); + lapic_timer_frequency / (1000000 / HZ), + lapic_timer_frequency % (1000000 / HZ)); /* * Do a sanity check on the APIC calibration result */ - if (calibration_result < (1000000 / HZ)) { + if (lapic_timer_frequency < (1000000 / HZ)) { local_irq_enable(); pr_warning("APIC frequency too slow, disabling apic timer\n"); return -1; @@ -834,17 +913,35 @@ void __irq_entry smp_apic_timer_interrup /* * NOTE! We'd better ACK the irq immediately, * because timer handling can be slow. + * + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. */ - ack_APIC_irq(); + entering_ack_irq(); + local_apic_timer_interrupt(); + irq_exit(); + + set_irq_regs(old_regs); +} + +void __irq_entry smp_trace_apic_timer_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + * * update_process_times() expects us to have done irq_enter(). * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ - exit_idle(); - irq_enter(); + entering_ack_irq(); + trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); - irq_exit(); + trace_local_timer_exit(LOCAL_TIMER_VECTOR); + exiting_irq(); set_irq_regs(old_regs); } @@ -941,7 +1038,7 @@ void disable_local_APIC(void) unsigned int value; /* APIC hasn't been mapped yet */ - if (!apic_phys) + if (!x2apic_mode && !apic_phys) return; clear_local_APIC(); @@ -1172,8 +1269,13 @@ static void __cpuinit lapic_setup_esr(vo */ void __cpuinit setup_local_APIC(void) { - unsigned int value; - int i, j; + unsigned int value, queued; + int i, j, acked = 0; + unsigned long long tsc = 0, ntsc; + long long max_loops = cpu_khz; + + if (cpu_has_tsc) + rdtscll(tsc); if (disable_apic) { arch_disable_smp_support(); @@ -1225,13 +1327,32 @@ void __cpuinit setup_local_APIC(void) * the interrupt. Hence a vector might get locked. It was noticed * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. */ - for (i = APIC_ISR_NR - 1; i >= 0; i--) { - value = apic_read(APIC_ISR + i*0x10); - for (j = 31; j >= 0; j--) { - if (value & (1<= 0; i--) + queued |= apic_read(APIC_IRR + i*0x10); + + for (i = APIC_ISR_NR - 1; i >= 0; i--) { + value = apic_read(APIC_ISR + i*0x10); + for (j = 31; j >= 0; j--) { + if (value & (1< 256) { + printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n", + acked); + break; + } + if (cpu_has_tsc) { + rdtscll(ntsc); + max_loops = (cpu_khz << 10) - (ntsc - tsc); + } else + max_loops--; + } while (queued && max_loops > 0); + WARN_ON(max_loops <= 0); /* * Now that we are all set up, enable the APIC @@ -1334,8 +1455,17 @@ void __cpuinit end_local_APIC_setup(void } #endif + nmi_watchdog_default(); setup_apic_nmi_watchdog(NULL); apic_pm_activate(); + + /* + * Now that local APIC setup is completed for BP, configure the fault + * handling for interrupt remapping. + */ + if (!smp_processor_id() && intr_remapping_enabled) + enable_drhd_fault_handling(); + } #ifdef CONFIG_X86_X2APIC @@ -1356,7 +1486,7 @@ void enable_x2apic(void) rdmsr(MSR_IA32_APICBASE, msr, msr2); if (!(msr & X2APIC_ENABLE)) { - pr_info("Enabling x2apic\n"); + printk_once(KERN_INFO "Enabling x2apic\n"); wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); } } @@ -1367,31 +1497,25 @@ int __init enable_IR(void) #ifdef CONFIG_INTR_REMAP if (!intr_remapping_supported()) { pr_debug("intr-remapping not supported\n"); - return 0; + return -1; } if (!x2apic_preenabled && skip_ioapic_setup) { pr_info("Skipped enabling intr-remap because of skipping " "io-apic setup\n"); - return 0; + return -1; } - if (enable_intr_remapping(x2apic_supported())) - return 0; - - pr_info("Enabled Interrupt-remapping\n"); - - return 1; - + return enable_intr_remapping(); #endif - return 0; + return -1; } void __init enable_IR_x2apic(void) { unsigned long flags; struct IO_APIC_route_entry **ioapic_entries = NULL; - int ret, x2apic_enabled = 0; + int ret = -1, x2apic_enabled = 0; int dmar_table_init_ret = 0; #ifdef CONFIG_INTR_REMAP @@ -1414,19 +1538,20 @@ void __init enable_IR_x2apic(void) } local_irq_save(flags); - mask_8259A(); + legacy_pic->mask_all(); mask_IO_APIC_setup(ioapic_entries); if (dmar_table_init_ret) - ret = 0; + ret = -1; else ret = enable_IR(); - if (!ret) { + if (ret < 0) { /* IR is required if there is APIC ID > 255 even when running * under KVM */ - if (max_physical_apicid > 255 || !kvm_para_available()) + if (max_physical_apicid > 255 || + !hypervisor_x2apic_available()) goto nox2apic; /* * without IR all CPUs can be addressed by IOAPIC/MSI @@ -1435,6 +1560,9 @@ void __init enable_IR_x2apic(void) x2apic_force_phys(); } + if (ret == IRQ_REMAP_XAPIC_MODE) + goto nox2apic; + x2apic_enabled = 1; if (x2apic_supported() && !x2apic_mode) { @@ -1444,22 +1572,24 @@ void __init enable_IR_x2apic(void) } nox2apic: - if (!ret) /* IR enabling failed */ + if (ret < 0) /* IR enabling failed */ restore_IO_APIC_setup(ioapic_entries); - unmask_8259A(); + legacy_pic->restore_mask(); local_irq_restore(flags); out: if (ioapic_entries) free_ioapic_entries(ioapic_entries); - if (x2apic_enabled) + if (x2apic_enabled || !x2apic_supported()) return; if (x2apic_preenabled) panic("x2apic: enabled by BIOS but kernel init failed."); - else if (cpu_has_x2apic) - pr_info("Not enabling x2apic, Intr-remapping init failed.\n"); + else if (ret == IRQ_REMAP_XAPIC_MODE) + pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); + else if (ret < 0) + pr_info("x2apic not enabled, IRQ remapping init failed\n"); } #ifdef CONFIG_X86_64 @@ -1605,7 +1735,7 @@ void __init init_apic_mappings(void) * acpi lapic path already maps that address in * acpi_register_lapic_address() */ - if (!acpi_lapic) + if (!acpi_lapic && !smp_found_config) set_fixmap_nocache(FIX_APIC_BASE, apic_phys); apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", @@ -1635,7 +1765,7 @@ void __init init_apic_mappings(void) * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. */ -int apic_version[MAX_APICS]; +int apic_version[MAX_LOCAL_APIC]; int __init APIC_init_uniprocessor(void) { @@ -1664,10 +1794,7 @@ int __init APIC_init_uniprocessor(void) } #endif - enable_IR_x2apic(); -#ifdef CONFIG_X86_64 default_setup_apic_routing(); -#endif verify_local_APIC(); connect_bsp_APIC(); @@ -1724,12 +1851,10 @@ int __init APIC_init_uniprocessor(void) /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -void smp_spurious_interrupt(struct pt_regs *regs) +static inline void __smp_spurious_interrupt(void) { u32 v; - exit_idle(); - irq_enter(); /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... @@ -1744,39 +1869,78 @@ void smp_spurious_interrupt(struct pt_re /* see sw-dev-man vol 3, chapter 7.4.13.5 */ pr_info("spurious APIC interrupt on CPU#%d, " "should never happen.\n", smp_processor_id()); - irq_exit(); +} + +void smp_spurious_interrupt(struct pt_regs *regs) +{ + entering_irq(); + __smp_spurious_interrupt(); + exiting_irq(); +} + +void smp_trace_spurious_interrupt(struct pt_regs *regs) +{ + entering_irq(); + trace_spurious_apic_entry(SPURIOUS_APIC_VECTOR); + __smp_spurious_interrupt(); + trace_spurious_apic_exit(SPURIOUS_APIC_VECTOR); + exiting_irq(); } /* * This interrupt should never happen with our APIC/SMP architecture */ -void smp_error_interrupt(struct pt_regs *regs) +static inline void __smp_error_interrupt(struct pt_regs *regs) { - u32 v, v1; + u32 v0, v1; + u32 i = 0; + static const char * const error_interrupt_reason[] = { + "Send CS error", /* APIC Error Bit 0 */ + "Receive CS error", /* APIC Error Bit 1 */ + "Send accept error", /* APIC Error Bit 2 */ + "Receive accept error", /* APIC Error Bit 3 */ + "Redirectable IPI", /* APIC Error Bit 4 */ + "Send illegal vector", /* APIC Error Bit 5 */ + "Received illegal vector", /* APIC Error Bit 6 */ + "Illegal register address", /* APIC Error Bit 7 */ + }; - exit_idle(); - irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ - v = apic_read(APIC_ESR); + v0 = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); v1 = apic_read(APIC_ESR); ack_APIC_irq(); atomic_inc(&irq_err_count); - /* - * Here is what the APIC error bits mean: - * 0: Send CS error - * 1: Receive CS error - * 2: Send accept error - * 3: Receive accept error - * 4: Reserved - * 5: Send illegal vector - * 6: Received illegal vector - * 7: Illegal register address - */ - pr_debug("APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); - irq_exit(); + apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)", + smp_processor_id(), v0 , v1); + + v1 = v1 & 0xff; + while (v1) { + if (v1 & 0x1) + apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]); + i++; + v1 >>= 1; + }; + + apic_printk(APIC_DEBUG, KERN_CONT "\n"); + +} + +void smp_error_interrupt(struct pt_regs *regs) +{ + entering_irq(); + __smp_error_interrupt(regs); + exiting_irq(); +} + +void smp_trace_error_interrupt(struct pt_regs *regs) +{ + entering_irq(); + trace_error_apic_entry(ERROR_APIC_VECTOR); + __smp_error_interrupt(regs); + trace_error_apic_exit(ERROR_APIC_VECTOR); + exiting_irq(); } /** @@ -1870,21 +2034,61 @@ void disconnect_bsp_APIC(int virt_wire_s void __cpuinit generic_processor_info(int apicid, int version) { - int cpu; + int cpu, max = nr_cpu_ids; + bool boot_cpu_detected = physid_isset(boot_cpu_physical_apicid, + phys_cpu_present_map); /* - * Validate version + * boot_cpu_physical_apicid is designed to have the apicid + * returned by read_apic_id(), i.e, the apicid of the + * currently booting-up processor. However, on some platforms, + * it is temporarily modified by the apicid reported as BSP + * through MP table. Concretely: + * + * - arch/x86/kernel/mpparse.c: MP_processor_info() + * - arch/x86/mm/amdtopology.c: amd_numa_init() + * - arch/x86/platform/visws/visws_quirks.c: MP_processor_info() + * + * This function is executed with the modified + * boot_cpu_physical_apicid. So, disabled_cpu_apicid kernel + * parameter doesn't work to disable APs on kdump 2nd kernel. + * + * Since fixing handling of boot_cpu_physical_apicid requires + * another discussion and tests on each platform, we leave it + * for now and here we use read_apic_id() directly in this + * function, generic_processor_info(). + */ + if (disabled_cpu_apicid != BAD_APICID && + disabled_cpu_apicid != read_apic_id() && + disabled_cpu_apicid == apicid) { + int thiscpu = num_processors + disabled_cpus; + + pr_warning("APIC: Disabling requested cpu." + " Processor %d/0x%x ignored.\n", + thiscpu, apicid); + + disabled_cpus++; + return ; + } + + /* + * If boot cpu has not been detected yet, then only allow upto + * nr_cpu_ids - 1 processors and keep one slot free for boot cpu */ - if (version == 0x0) { - pr_warning("BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - version); - version = 0x10; + if (!boot_cpu_detected && num_processors >= nr_cpu_ids - 1 && + apicid != boot_cpu_physical_apicid) { + int thiscpu = max + disabled_cpus - 1; + + pr_warning( + "ACPI: NR_CPUS/possible_cpus limit of %i almost" + " reached. Keeping one slot for boot cpu." + " Processor %d/0x%x ignored.\n", max, thiscpu, apicid); + + disabled_cpus++; + return; } - apic_version[apicid] = version; if (num_processors >= nr_cpu_ids) { - int max = nr_cpu_ids; int thiscpu = max + disabled_cpus; pr_warning( @@ -1896,36 +2100,36 @@ void __cpuinit generic_processor_info(in } num_processors++; - cpu = cpumask_next_zero(-1, cpu_present_mask); - - if (version != apic_version[boot_cpu_physical_apicid]) - WARN_ONCE(1, - "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n", - apic_version[boot_cpu_physical_apicid], cpu, version); - - physid_set(apicid, phys_cpu_present_map); if (apicid == boot_cpu_physical_apicid) { /* * x86_bios_cpu_apicid is required to have processors listed * in same order as logical cpu numbers. Hence the first * entry is BSP, and so on. + * boot_cpu_init() already hold bit 0 in cpu_present_mask + * for BSP. */ cpu = 0; + } else + cpu = cpumask_next_zero(-1, cpu_present_mask); + + /* + * Validate version + */ + if (version == 0x0) { + pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n", + cpu, apicid); + version = 0x10; } - if (apicid > max_physical_apicid) - max_physical_apicid = apicid; + apic_version[apicid] = version; -#ifdef CONFIG_X86_32 - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (num_processors > 8) - def_to_bigsmp = 1; - break; - case X86_VENDOR_AMD: - if (max_physical_apicid >= 8) - def_to_bigsmp = 1; + if (version != apic_version[boot_cpu_physical_apicid]) { + pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n", + apic_version[boot_cpu_physical_apicid], cpu, version); } -#endif + + physid_set(apicid, phys_cpu_present_map); + if (apicid > max_physical_apicid) + max_physical_apicid = apicid; #if defined(CONFIG_SMP) || defined(CONFIG_X86_64) early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; @@ -1963,6 +2167,23 @@ int default_apicid_to_node(int logical_a #endif /* + * Override the generic EOI implementation with an optimized version. + * Only called during early boot when only one CPU is active and with + * interrupts disabled, so we know this does not race with actual APIC driver + * use. + */ +void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) +{ + struct apic **drv; + + for (drv = apic_probe; *drv; drv++) { + /* Should happen once for each apic */ + WARN_ON((*drv)->eoi_write == eoi_write); + (*drv)->eoi_write = eoi_write; + } +} + +/* * Power management */ #ifdef CONFIG_PM @@ -2056,7 +2277,7 @@ static int lapic_resume(struct sys_devic } mask_IO_APIC_setup(ioapic_entries); - mask_8259A(); + legacy_pic->mask_all(); } if (x2apic_mode) @@ -2100,7 +2321,7 @@ static int lapic_resume(struct sys_devic if (intr_remapping_enabled) { reenable_intr_remapping(x2apic_mode); - unmask_8259A(); + legacy_pic->restore_mask(); restore_IO_APIC_setup(ioapic_entries); free_ioapic_entries(ioapic_entries); } @@ -2343,3 +2564,12 @@ static int __init lapic_insert_resource( * that is using request_resource */ late_initcall(lapic_insert_resource); + +static int __init apic_set_disabled_cpu_apicid(char *arg) +{ + if (!arg || !get_option(&arg, &disabled_cpu_apicid)) + return -EINVAL; + + return 0; +} +early_param("disable_cpu_apicid", apic_set_disabled_cpu_apicid); diff -uprN linux-2.6.32/arch/x86/kernel/apic/apic_flat_64.c linux-2.6.32.ovz/arch/x86/kernel/apic/apic_flat_64.c --- linux-2.6.32/arch/x86/kernel/apic/apic_flat_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/apic/apic_flat_64.c 2015-03-10 13:23:53.000000000 -0700 @@ -164,9 +164,14 @@ static int flat_phys_pkg_id(int initial_ return initial_apic_id >> index_msb; } +static int flat_probe(void) +{ + return 1; +} + struct apic apic_flat = { .name = "flat", - .probe = NULL, + .probe = flat_probe, .acpi_madt_oem_check = flat_acpi_madt_oem_check, .apic_id_registered = flat_apic_id_registered, @@ -216,6 +221,7 @@ struct apic apic_flat = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, @@ -223,7 +229,7 @@ struct apic apic_flat = { }; /* - * Physflat mode is used when there are more than 8 CPUs on a AMD system. + * Physflat mode is used when there are more than 8 CPUs on a system. * We cannot use logical delivery in this case because the mask * overflows, so use physical mode. */ @@ -240,6 +246,11 @@ static int physflat_acpi_madt_oem_check( printk(KERN_DEBUG "system APIC only can use physical flat"); return 1; } + + if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) { + printk(KERN_DEBUG "IBM Summit detected, will use apic physical"); + return 1; + } #endif return 0; @@ -306,16 +317,21 @@ physflat_cpu_mask_to_apicid_and(const st if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - if (cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); + return per_cpu(x86_cpu_to_apicid, cpu); +} + +static int physflat_probe(void) +{ + if (apic == &apic_physflat || num_possible_cpus() > 8) + return 1; - return BAD_APICID; + return 0; } struct apic apic_physflat = { .name = "physical flat", - .probe = NULL, + .probe = physflat_probe, .acpi_madt_oem_check = physflat_acpi_madt_oem_check, .apic_id_registered = flat_apic_id_registered, @@ -366,6 +382,7 @@ struct apic apic_physflat = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff -uprN linux-2.6.32/arch/x86/kernel/apic/bigsmp_32.c linux-2.6.32.ovz/arch/x86/kernel/apic/bigsmp_32.c --- linux-2.6.32/arch/x86/kernel/apic/bigsmp_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/apic/bigsmp_32.c 2015-03-10 13:23:23.000000000 -0700 @@ -136,10 +136,7 @@ static unsigned int bigsmp_cpu_mask_to_a if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - if (cpu < nr_cpu_ids) - return bigsmp_cpu_to_logical_apicid(cpu); - - return BAD_APICID; + return bigsmp_cpu_to_logical_apicid(cpu); } static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) @@ -260,6 +257,7 @@ struct apic apic_bigsmp = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff -uprN linux-2.6.32/arch/x86/kernel/apic/es7000_32.c linux-2.6.32.ovz/arch/x86/kernel/apic/es7000_32.c --- linux-2.6.32/arch/x86/kernel/apic/es7000_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/apic/es7000_32.c 2015-03-10 13:23:23.000000000 -0700 @@ -712,6 +712,7 @@ struct apic __refdata apic_es7000_cluste .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, @@ -775,6 +776,7 @@ struct apic __refdata apic_es7000 = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff -uprN linux-2.6.32/arch/x86/kernel/apic/hw_nmi.c linux-2.6.32.ovz/arch/x86/kernel/apic/hw_nmi.c --- linux-2.6.32/arch/x86/kernel/apic/hw_nmi.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/apic/hw_nmi.c 2015-03-10 13:22:31.000000000 -0700 @@ -0,0 +1,105 @@ +/* + * HW NMI watchdog support + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Arch specific calls to support NMI watchdog + * + * Bits copied from original nmi.c file + * + */ +#include + +#include +#include +#include +#include +#include +#include + +u64 hw_nmi_get_sample_period(void) +{ + return (u64)(cpu_khz) * 1000 * 60; +} + +/* For reliability, we're prepared to waste bits here. */ +static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; + +void arch_trigger_all_cpu_backtrace(void) +{ + int i; + + cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); + + printk(KERN_INFO "sending NMI to all CPUs:\n"); + apic->send_IPI_all(NMI_VECTOR); + + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpumask_empty(to_cpumask(backtrace_mask))) + break; + mdelay(1); + } +} + +static int __kprobes +arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct pt_regs *regs; + int cpu = smp_processor_id(); + + switch (cmd) { + case DIE_NMI: + case DIE_NMI_IPI: + break; + + default: + return NOTIFY_DONE; + } + + regs = args->regs; + + if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { + static DEFINE_SPINLOCK(lock); /* Serialize the printks */ + + spin_lock(&lock); + printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); + dump_stack(); + spin_unlock(&lock); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + return NOTIFY_STOP; + } + + return NOTIFY_DONE; +} + +static __read_mostly struct notifier_block backtrace_notifier = { + .notifier_call = arch_trigger_all_cpu_backtrace_handler, + .next = NULL, + .priority = NMI_LOCAL_LOW_PRIOR, +}; + +static int __init register_trigger_all_cpu_backtrace(void) +{ + register_die_notifier(&backtrace_notifier); + return 0; +} +early_initcall(register_trigger_all_cpu_backtrace); + +/* STUB calls to mimic old nmi_watchdog behaviour */ +#if defined(CONFIG_X86_LOCAL_APIC) +unsigned int nmi_watchdog = NMI_NONE; +EXPORT_SYMBOL(nmi_watchdog); +void acpi_nmi_enable(void) { return; } +void acpi_nmi_disable(void) { return; } +#endif +atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ +EXPORT_SYMBOL(nmi_active); +void cpu_nmi_set_wd_enabled(void) { return; } +void stop_apic_nmi_watchdog(void *unused) { return; } +void setup_apic_nmi_watchdog(void *unused) { return; } +int __init check_nmi_watchdog(void) { return 0; } +void __cpuinit nmi_watchdog_default(void) { return; } diff -uprN linux-2.6.32/arch/x86/kernel/apic/io_apic.c linux-2.6.32.ovz/arch/x86/kernel/apic/io_apic.c --- linux-2.6.32/arch/x86/kernel/apic/io_apic.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/apic/io_apic.c 2015-06-23 04:16:16.000000000 -0700 @@ -60,8 +60,6 @@ #include #include #include -#include -#include #include @@ -90,14 +88,15 @@ int nr_ioapics; /* IO APIC gsi routing info */ struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; +/* The one past the highest gsi number used */ +u32 gsi_top; + /* MP IRQ source entries */ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; -/* Number of legacy interrupts */ -static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY; /* GSI interrupts */ static int nr_irqs_gsi = NR_IRQS_LEGACY; @@ -140,20 +139,6 @@ static struct irq_pin_list *get_one_free return pin; } -/* - * This is performance-critical, we want to do it O(1) - * - * Most irqs are mapped 1:1 with pins. - */ -struct irq_cfg { - struct irq_pin_list *irq_2_pin; - cpumask_var_t domain; - cpumask_var_t old_domain; - unsigned move_cleanup_count; - u8 vector; - u8 move_in_progress : 1; -}; - /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ #ifdef CONFIG_SPARSE_IRQ static struct irq_cfg irq_cfgx[] = { @@ -180,7 +165,6 @@ static struct irq_cfg irq_cfgx[NR_IRQS] void __init io_apic_disable_legacy(void) { - nr_legacy_irqs = 0; nr_irqs_gsi = 0; } @@ -201,7 +185,7 @@ int __init arch_early_irq_init(void) desc->chip_data = &cfg[i]; zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); - if (i < nr_legacy_irqs) + if (i < legacy_pic->nr_legacy_irqs) cpumask_setall(cfg[i].domain); } @@ -209,7 +193,7 @@ int __init arch_early_irq_init(void) } #ifdef CONFIG_SPARSE_IRQ -static struct irq_cfg *irq_cfg(unsigned int irq) +struct irq_cfg *irq_cfg(unsigned int irq) { struct irq_cfg *cfg = NULL; struct irq_desc *desc; @@ -361,7 +345,7 @@ void arch_free_chip_data(struct irq_desc /* end for move_irq_desc */ #else -static struct irq_cfg *irq_cfg(unsigned int irq) +struct irq_cfg *irq_cfg(unsigned int irq) { return irq < nr_irqs ? irq_cfgx + irq : NULL; } @@ -445,13 +429,21 @@ union entry_union { struct IO_APIC_route_entry entry; }; -static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +static struct IO_APIC_route_entry __ioapic_read_entry(int apic, int pin) { union entry_union eu; - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + return eu.entry; +} + +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu = { .entry = {0} }; + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + eu.entry = __ioapic_read_entry(apic, pin); spin_unlock_irqrestore(&ioapic_lock, flags); return eu.entry; } @@ -555,23 +547,29 @@ static void __init replace_pin_at_irq_no add_pin_to_irq_node(cfg, node, newapic, newpin); } +static void __io_apic_modify_irq(struct irq_pin_list *entry, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + unsigned int reg, pin; + + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin * 2); + reg &= mask_and; + reg |= mask_or; + io_apic_modify(entry->apic, 0x10 + pin * 2, reg); + if (final) + final(entry); +} + static void io_apic_modify_irq(struct irq_cfg *cfg, int mask_and, int mask_or, void (*final)(struct irq_pin_list *entry)) { - int pin; struct irq_pin_list *entry; - for_each_irq_pin(entry, cfg->irq_2_pin) { - unsigned int reg; - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin * 2); - reg &= mask_and; - reg |= mask_or; - io_apic_modify(entry->apic, 0x10 + pin * 2, reg); - if (final) - final(entry); - } + for_each_irq_pin(entry, cfg->irq_2_pin) + __io_apic_modify_irq(entry, mask_and, mask_or, final); } static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) @@ -595,18 +593,6 @@ static void __mask_IO_APIC_irq(struct ir io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); } -static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER, - IO_APIC_REDIR_MASKED, NULL); -} - -static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg) -{ - io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, - IO_APIC_REDIR_LEVEL_TRIGGER, NULL); -} - static void mask_IO_APIC_irq_desc(struct irq_desc *desc) { struct irq_cfg *cfg = desc->chip_data; @@ -650,10 +636,56 @@ static void clear_IO_APIC_pin(unsigned i entry = ioapic_read_entry(apic, pin); if (entry.delivery_mode == dest_SMI) return; + + /* + * Make sure the entry is masked and re-read the contents to check + * if it is a level triggered pin and if the remote-IRR is set. + */ + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + entry = ioapic_read_entry(apic, pin); + } + + if (entry.irr) { + /* + * Make sure the trigger mode is set to level. Explicit EOI + * doesn't clear the remote-IRR if the trigger mode is not + * set to level. + */ + if (!entry.trigger) { + entry.trigger = IOAPIC_LEVEL; + ioapic_write_entry(apic, pin, entry); + } + + if (mp_ioapics[apic].apicver >= 0x20) { + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_eoi(apic, entry.vector); + spin_unlock_irqrestore(&ioapic_lock, flags); + } else { + /* + * Mechanism by which we clear remote-IRR in this + * case is by changing the trigger mode to edge and + * back to level. + */ + entry.trigger = IOAPIC_EDGE; + ioapic_write_entry(apic, pin, entry); + entry.trigger = IOAPIC_LEVEL; + ioapic_write_entry(apic, pin, entry); + } + } + /* - * Disable it in the IO-APIC irq-routing table: + * Clear the rest of the bits in the IO-APIC RTE except for the mask + * bit. */ ioapic_mask_entry(apic, pin); + entry = ioapic_read_entry(apic, pin); + if (entry.irr) + printk(KERN_ERR "Unable to reset IRR for apic: %d, pin :%d\n", + mp_ioapics[apic].apicid, pin); } static void clear_IO_APIC (void) @@ -875,7 +907,7 @@ static int __init find_isa_irq_apic(int */ static int EISA_ELCR(unsigned int irq) { - if (irq < nr_legacy_irqs) { + if (irq < legacy_pic->nr_legacy_irqs) { unsigned int port = 0x4d0 + (irq >> 3); return (inb(port) >> (irq & 7)) & 1; } @@ -1177,7 +1209,7 @@ __assign_irq_vector(int irq, struct irq_ int cpu, err; cpumask_var_t tmp_mask; - if ((cfg->move_in_progress) || cfg->move_cleanup_count) + if (cfg->move_in_progress) return -EBUSY; if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) @@ -1237,8 +1269,7 @@ next: return err; } -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) +int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask) { int err; unsigned long flags; @@ -1287,6 +1318,8 @@ void __setup_vector_irq(int cpu) /* Mark the inuse vectors */ for_each_irq_desc(irq, desc) { cfg = desc->chip_data; + if (!cfg) + continue; if (!cpumask_test_cpu(cpu, cfg->domain)) continue; vector = cfg->vector; @@ -1307,10 +1340,6 @@ void __setup_vector_irq(int cpu) static struct irq_chip ioapic_chip; static struct irq_chip ir_ioapic_chip; -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - #ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) { @@ -1472,8 +1501,8 @@ static void setup_IO_APIC_irq(int apic_i } ioapic_register_intr(irq, desc, trigger); - if (irq < nr_legacy_irqs) - disable_8259A_irq(irq); + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->chip->mask(irq); ioapic_write_entry(apic_id, pin, entry); } @@ -1550,6 +1579,56 @@ static void __init setup_IO_APIC_irqs(vo } /* + * for the gsit that is not in first ioapic + * but could not use acpi_register_gsi() + * like some special sci in IBM x3330 + */ +void setup_IO_APIC_irq_extra(u32 gsi) +{ + int apic_id = 0, pin, idx, irq; + int node = cpu_to_node(boot_cpu_id); + struct irq_desc *desc; + struct irq_cfg *cfg; + + /* + * Convert 'gsi' to 'ioapic.pin'. + */ + apic_id = mp_find_ioapic(gsi); + if (apic_id < 0) + return; + + pin = mp_find_ioapic_pin(apic_id, gsi); + idx = find_irq_entry(apic_id, pin, mp_INT); + if (idx == -1) + return; + + irq = pin_2_irq(idx, apic_id, pin); +#ifdef CONFIG_SPARSE_IRQ + desc = irq_to_desc(irq); + if (desc) + return; +#endif + desc = irq_to_desc_alloc_node(irq, node); + if (!desc) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + return; + } + + cfg = desc->chip_data; + add_pin_to_irq_node(cfg, node, apic_id, pin); + + if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) { + pr_debug("Pin %d-%d already programmed\n", + mp_ioapics[apic_id].apicid, pin); + return; + } + set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed); + + setup_IO_APIC_irq(apic_id, pin, irq, desc, + irq_trigger(idx), irq_polarity(idx)); +} + +/* * Set up the timer pin, possibly with the 8259A-master behind. */ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, @@ -1661,7 +1740,7 @@ __apicdebuginit(void) print_IO_APIC(void printk(KERN_DEBUG ".... IRQ redirection table:\n"); printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" - " Stat Dmod Deli Vect: \n"); + " Stat Dmod Deli Vect:\n"); for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; @@ -1690,6 +1769,8 @@ __apicdebuginit(void) print_IO_APIC(void struct irq_pin_list *entry; cfg = desc->chip_data; + if (!cfg) + continue; entry = cfg->irq_2_pin; if (!entry) continue; @@ -1839,7 +1920,7 @@ __apicdebuginit(void) print_PIC(void) unsigned int v; unsigned long flags; - if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) + if (apic_verbosity == APIC_QUIET || !legacy_pic->nr_legacy_irqs) return; printk(KERN_DEBUG "\nprinting PIC contents\n"); @@ -1888,22 +1969,10 @@ static struct { int pin, apic; } ioapic_ void __init enable_IO_APIC(void) { - union IO_APIC_reg_01 reg_01; int i8259_apic, i8259_pin; int apic; - unsigned long flags; - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[apic] = reg_01.bits.entries+1; - } - - if (!nr_legacy_irqs) + if (!legacy_pic->nr_legacy_irqs) return; for(apic = 0; apic < nr_ioapics; apic++) { @@ -1953,14 +2022,30 @@ void __init enable_IO_APIC(void) /* * Not an __init, needed by the reboot code */ -void disable_IO_APIC(void) +void disable_IO_APIC(int force) { /* + * Use force to bust the io_apic spinlock + * + * There is a case where kdump can race with irq + * migration such that kdump will inject an NMI + * while another cpu holds the ioapic_lock to + * migrate the irq. This would cause a deadlock. + * + * Because kdump stops all the cpus, we can safely + * bust the spinlock as we are just clearing the + * io apic anyway. + */ + if (force && spin_is_locked(&ioapic_lock)) + /* only one cpu should be running now */ + spin_lock_init(&ioapic_lock); + + /* * Clear the IO-APIC before rebooting: */ clear_IO_APIC(); - if (!nr_legacy_irqs) + if (!legacy_pic->nr_legacy_irqs) return; /* @@ -2193,9 +2278,9 @@ static unsigned int startup_ioapic_irq(u struct irq_cfg *cfg; spin_lock_irqsave(&ioapic_lock, flags); - if (irq < nr_legacy_irqs) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) + if (irq < legacy_pic->nr_legacy_irqs) { + legacy_pic->chip->mask(irq); + if (legacy_pic->irq_pending(irq)) was_pending = 1; } cfg = irq_cfg(irq); @@ -2228,20 +2313,16 @@ static int ioapic_retrigger_irq(unsigned */ #ifdef CONFIG_SMP -static void send_cleanup_vector(struct irq_cfg *cfg) +void send_cleanup_vector(struct irq_cfg *cfg) { cpumask_var_t cleanup_mask; if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { unsigned int i; - cfg->move_cleanup_count = 0; - for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) - cfg->move_cleanup_count++; for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); } else { cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); - cfg->move_cleanup_count = cpumask_weight(cleanup_mask); apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); free_cpumask_var(cleanup_mask); } @@ -2272,31 +2353,30 @@ static void __target_IO_APIC_irq(unsigne } } -static int -assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask); - /* * Either sets desc->affinity to a valid value, and returns - * ->cpu_mask_to_apicid of that, or returns BAD_APICID and + * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and * leaves desc->affinity untouched. */ -static unsigned int -set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) +unsigned int +set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask, + unsigned int *dest_id) { struct irq_cfg *cfg; unsigned int irq; if (!cpumask_intersects(mask, cpu_online_mask)) - return BAD_APICID; + return -1; irq = desc->irq; cfg = desc->chip_data; if (assign_irq_vector(irq, cfg, mask)) - return BAD_APICID; + return -1; cpumask_copy(desc->affinity, mask); - return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); + *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); + return 0; } static int @@ -2312,12 +2392,11 @@ set_ioapic_affinity_irq_desc(struct irq_ cfg = desc->chip_data; spin_lock_irqsave(&ioapic_lock, flags); - dest = set_desc_affinity(desc, mask); - if (dest != BAD_APICID) { + ret = set_desc_affinity(desc, mask, &dest); + if (!ret) { /* Only the high 8 bits are valid. */ dest = SET_APIC_LOGICAL_ID(dest); __target_IO_APIC_irq(irq, dest, cfg); - ret = 0; } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -2433,7 +2512,12 @@ asmlinkage void smp_irq_move_cleanup_int cfg = irq_cfg(irq); spin_lock(&desc->lock); - if (!cfg->move_cleanup_count) + + /* + * Check if the irq migration is in progress. If so, we + * haven't received the cleanup request yet for this irq. + */ + if (cfg->move_in_progress) goto unlock; if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) @@ -2452,7 +2536,6 @@ asmlinkage void smp_irq_move_cleanup_int goto unlock; } __get_cpu_var(vector_irq)[vector] = -1; - cfg->move_cleanup_count--; unlock: spin_unlock(&desc->lock); } @@ -2460,21 +2543,40 @@ unlock: irq_exit(); } -static void irq_complete_move(struct irq_desc **descp) +static void __irq_complete_move(struct irq_desc **descp, unsigned vector) { struct irq_desc *desc = *descp; struct irq_cfg *cfg = desc->chip_data; - unsigned vector, me; + unsigned me; if (likely(!cfg->move_in_progress)) return; - vector = ~get_irq_regs()->orig_ax; me = smp_processor_id(); if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) send_cleanup_vector(cfg); } + +static void irq_complete_move(struct irq_desc **descp) +{ + __irq_complete_move(descp, ~get_irq_regs()->orig_ax); +} + +void irq_force_complete_move(int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; + + /* + * RHEL6 PV guest on RHEL5 HV fix. Xen irq's can have NULL + * chip_data. RHEL6 fix only. + */ + if (!cfg) + return; + + __irq_complete_move(&desc, cfg->vector); +} #else static inline void irq_complete_move(struct irq_desc **descp) {} #endif @@ -2490,6 +2592,74 @@ static void ack_apic_edge(unsigned int i atomic_t irq_mis_count; +/* + * IO-APIC versions below 0x20 don't support EOI register. + * For the record, here is the information about various versions: + * 0Xh 82489DX + * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant + * 2Xh I/O(x)APIC which is PCI 2.2 Compliant + * 30h-FFh Reserved + * + * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic + * version as 0x2. This is an error with documentation and these ICH chips + * use io-apic's of version 0x20. + * + * For IO-APIC's with EOI register, we use that to do an explicit EOI. + * Otherwise, we simulate the EOI message manually by changing the trigger + * mode to edge and then back to level, with RTE being masked during this. +*/ +static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) +{ + struct irq_pin_list *entry; + + for_each_irq_pin(entry, cfg->irq_2_pin) { + if (mp_ioapics[entry->apic].apicver >= 0x20) { + /* + * Intr-remapping uses pin number as the virtual vector + * in the RTE. Actual vector is programmed in + * intr-remapping table entry. Hence for the io-apic + * EOI we use the pin number. + */ + if (irq_remapped(irq)) + io_apic_eoi(entry->apic, entry->pin); + else + io_apic_eoi(entry->apic, cfg->vector); + } else { + struct IO_APIC_route_entry rte, rte1; + + rte = rte1 = + __ioapic_read_entry(entry->apic, entry->pin); + + /* + * Mask the entry and change the trigger mode to edge. + */ + rte1.mask = 1; + rte1.trigger = IOAPIC_EDGE; + + __ioapic_write_entry(entry->apic, entry->pin, rte1); + + /* + * Restore the previous level triggered entry. + */ + __ioapic_write_entry(entry->apic, entry->pin, rte); + } + } +} + +static void eoi_ioapic_irq(struct irq_desc *desc) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int irq; + + irq = desc->irq; + cfg = desc->chip_data; + + spin_lock_irqsave(&ioapic_lock, flags); + __eoi_ioapic_irq(irq, cfg); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + static void ack_apic_level(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -2525,6 +2695,19 @@ static void ack_apic_level(unsigned int * level-triggered interrupt. We mask the source for the time of the * operation to prevent an edge-triggered interrupt escaping meanwhile. * The idea is from Manfred Spraul. --macro + * + * Also in the case when cpu goes offline, fixup_irqs() will forward + * any unhandled interrupt on the offlined cpu to the new cpu + * destination that is handling the corresponding interrupt. This + * interrupt forwarding is done via IPI's. Hence, in this case also + * level-triggered io-apic interrupt will be seen as an edge + * interrupt in the IRR. And we can't rely on the cpu's EOI + * to be broadcasted to the IO-APIC's which will clear the remoteIRR + * corresponding to the level-triggered interrupt. Hence on IO-APIC's + * supporting EOI register, we do an explicit EOI to clear the + * remote IRR and on IO-APIC's which don't have an EOI register, + * we use the above logic (mask+edge followed by unmask+level) from + * Manfred Spraul to clear the remote IRR. */ cfg = desc->chip_data; i = cfg->vector; @@ -2536,6 +2719,19 @@ static void ack_apic_level(unsigned int */ ack_APIC_irq(); + /* + * Tail end of clearing remote IRR bit (either by delivering the EOI + * message via io-apic EOI register write or simulating it using + * mask+edge followed by unnask+level logic) manually when the + * level triggered interrupt is seen as the edge triggered interrupt + * at the cpu. + */ + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + + eoi_ioapic_irq(desc); + } + /* Now we can move and renable the irq */ if (unlikely(do_unmask_irq)) { /* Only migrate the irq if the ack has been received. @@ -2569,41 +2765,9 @@ static void ack_apic_level(unsigned int move_masked_irq(irq); unmask_IO_APIC_irq_desc(desc); } - - /* Tail end of version 0x11 I/O APIC bug workaround */ - if (!(v & (1 << (i & 0x1f)))) { - atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(cfg); - __unmask_and_level_IO_APIC_irq(cfg); - spin_unlock(&ioapic_lock); - } } #ifdef CONFIG_INTR_REMAP -static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) -{ - struct irq_pin_list *entry; - - for_each_irq_pin(entry, cfg->irq_2_pin) - io_apic_eoi(entry->apic, entry->pin); -} - -static void -eoi_ioapic_irq(struct irq_desc *desc) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int irq; - - irq = desc->irq; - cfg = desc->chip_data; - - spin_lock_irqsave(&ioapic_lock, flags); - __eoi_ioapic_irq(irq, cfg); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - static void ir_ack_apic_edge(unsigned int irq) { ack_APIC_irq(); @@ -2671,8 +2835,8 @@ static inline void init_IO_APIC_traps(vo * so default to an old-fashioned 8259 * interrupt if we can.. */ - if (irq < nr_legacy_irqs) - make_8259A_irq(irq); + if (irq < legacy_pic->nr_legacy_irqs) + legacy_pic->make_irq(irq); else /* Strange. Oh, well.. */ desc->chip = &no_irq_chip; @@ -2829,7 +2993,7 @@ static inline void __init check_timer(vo /* * get/set the timer IRQ vector: */ - disable_8259A_irq(0); + legacy_pic->chip->mask(0); assign_irq_vector(0, cfg, apic->target_cpus()); /* @@ -2842,7 +3006,7 @@ static inline void __init check_timer(vo * automatically. */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); + legacy_pic->init(1); #ifdef CONFIG_X86_32 { unsigned int ver; @@ -2899,9 +3063,10 @@ static inline void __init check_timer(vo unmask_IO_APIC_irq_desc(desc); } if (timer_irq_works()) { + nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } if (disable_timer_pin_1 > 0) clear_IO_APIC_pin(0, pin1); @@ -2924,14 +3089,15 @@ static inline void __init check_timer(vo */ replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; + nmi_watchdog_default(); if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); + legacy_pic->chip->mask(0); setup_nmi(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } goto out; } @@ -2939,7 +3105,7 @@ static inline void __init check_timer(vo * Cleanup, just in case ... */ local_irq_disable(); - disable_8259A_irq(0); + legacy_pic->chip->mask(0); clear_IO_APIC_pin(apic2, pin2); apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } @@ -2958,22 +3124,22 @@ static inline void __init check_timer(vo lapic_register_intr(0, desc); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); if (timer_irq_works()) { apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } local_irq_disable(); - disable_8259A_irq(0); + legacy_pic->chip->mask(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as ExtINT IRQ...\n"); - init_8259A(0); - make_8259A_irq(0); + legacy_pic->init(0); + legacy_pic->make_irq(0); apic_write(APIC_LVT0, APIC_DM_EXTINT); unlock_ExtINT_logic(); @@ -3015,7 +3181,7 @@ void __init setup_IO_APIC(void) /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ - io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; + io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); /* @@ -3026,7 +3192,7 @@ void __init setup_IO_APIC(void) sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); - if (nr_legacy_irqs) + if (legacy_pic->nr_legacy_irqs) check_timer(); } @@ -3157,6 +3323,7 @@ unsigned int create_irq_nr(unsigned int continue; desc_new = move_irq_desc(desc_new, node); + cfg_new = desc_new->chip_data; if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) irq = new; @@ -3164,12 +3331,9 @@ unsigned int create_irq_nr(unsigned int } spin_unlock_irqrestore(&vector_lock, flags); - if (irq > 0) { - dynamic_irq_init(irq); - /* restore it, in case dynamic_irq_init clear it */ - if (desc_new) - desc_new->chip_data = cfg_new; - } + if (irq > 0) + dynamic_irq_init_keep_chip_data(irq); + return irq; } @@ -3192,26 +3356,28 @@ void destroy_irq(unsigned int irq) { unsigned long flags; struct irq_cfg *cfg; - struct irq_desc *desc; - /* store it, in case dynamic_irq_cleanup clear it */ - desc = irq_to_desc(irq); - cfg = desc->chip_data; - dynamic_irq_cleanup(irq); - /* connect back irq_cfg */ - desc->chip_data = cfg; + dynamic_irq_cleanup_keep_chip_data(irq); free_irte(irq); spin_lock_irqsave(&vector_lock, flags); + cfg = irq_to_desc(irq)->chip_data; __clear_irq_vector(irq, cfg); spin_unlock_irqrestore(&vector_lock, flags); } +int __irq_to_vector(int irq) +{ + return irq_cfg(irq)->vector; +} +EXPORT_SYMBOL(__irq_to_vector); + /* * MSI message composition */ #ifdef CONFIG_PCI_MSI -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, + struct msi_msg *msg, u8 hpet_id) { struct irq_cfg *cfg; int err; @@ -3245,7 +3411,10 @@ static int msi_compose_msg(struct pci_de irte.dest_id = IRTE_DEST(dest); /* Set source-id of interrupt request */ - set_msi_sid(&irte, pdev); + if (pdev) + set_msi_sid(&irte, pdev); + else + set_hpet_sid(&irte, hpet_id); modify_irte(irq, &irte); @@ -3291,13 +3460,12 @@ static int set_msi_irq_affinity(unsigned struct msi_msg msg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; - read_msi_msg_desc(desc, &msg); + get_cached_msi_msg_desc(desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); @@ -3324,8 +3492,7 @@ ir_set_msi_irq_affinity(unsigned int irq if (get_irte(irq, &irte)) return -1; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; irte.vector = cfg->vector; @@ -3410,7 +3577,7 @@ static int setup_msi_irq(struct pci_dev int ret; struct msi_msg msg; - ret = msi_compose_msg(dev, irq, &msg); + ret = msi_compose_msg(dev, irq, &msg, -1); if (ret < 0) return ret; @@ -3507,8 +3674,7 @@ static int dmar_msi_set_affinity(unsigne struct msi_msg msg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; @@ -3519,6 +3685,7 @@ static int dmar_msi_set_affinity(unsigne msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); + msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest); dmar_msi_write(irq, &msg); @@ -3543,7 +3710,7 @@ int arch_setup_dmar_msi(unsigned int irq int ret; struct msi_msg msg; - ret = msi_compose_msg(NULL, irq, &msg); + ret = msi_compose_msg(NULL, irq, &msg, -1); if (ret < 0) return ret; dmar_msi_write(irq, &msg); @@ -3563,8 +3730,7 @@ static int hpet_msi_set_affinity(unsigne struct msi_msg msg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; @@ -3583,6 +3749,19 @@ static int hpet_msi_set_affinity(unsigne #endif /* CONFIG_SMP */ +static struct irq_chip ir_hpet_msi_type = { + .name = "IR-HPET_MSI", + .unmask = hpet_msi_unmask, + .mask = hpet_msi_mask, +#ifdef CONFIG_INTR_REMAP + .ack = ir_ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = ir_set_msi_irq_affinity, +#endif +#endif + .retrigger = ioapic_retrigger_irq, +}; + static struct irq_chip hpet_msi_type = { .name = "HPET_MSI", .unmask = hpet_msi_unmask, @@ -3594,20 +3773,36 @@ static struct irq_chip hpet_msi_type = { .retrigger = ioapic_retrigger_irq, }; -int arch_setup_hpet_msi(unsigned int irq) +int arch_setup_hpet_msi(unsigned int irq, unsigned int id) { int ret; struct msi_msg msg; struct irq_desc *desc = irq_to_desc(irq); - ret = msi_compose_msg(NULL, irq, &msg); + if (intr_remapping_enabled) { + struct intel_iommu *iommu = map_hpet_to_ir(id); + int index; + + if (!iommu) + return -1; + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + return -1; + } + + ret = msi_compose_msg(NULL, irq, &msg, id); if (ret < 0) return ret; hpet_msi_write(irq, &msg); desc->status |= IRQ_MOVE_PCNTXT; - set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, - "edge"); + if (irq_remapped(irq)) + set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, + handle_edge_irq, "edge"); + else + set_irq_chip_and_handler_name(irq, &hpet_msi_type, + handle_edge_irq, "edge"); return 0; } @@ -3641,8 +3836,7 @@ static int set_ht_irq_affinity(unsigned struct irq_cfg *cfg; unsigned int dest; - dest = set_desc_affinity(desc, mask); - if (dest == BAD_APICID) + if (set_desc_affinity(desc, mask, &dest)) return -1; cfg = desc->chip_data; @@ -3708,75 +3902,6 @@ int arch_setup_ht_irq(unsigned int irq, } #endif /* CONFIG_HT_IRQ */ -#ifdef CONFIG_X86_UV -/* - * Re-target the irq to the specified CPU and enable the specified MMR located - * on the specified blade to allow the sending of MSIs to the specified CPU. - */ -int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, - unsigned long mmr_offset) -{ - const struct cpumask *eligible_cpu = cpumask_of(cpu); - struct irq_cfg *cfg; - int mmr_pnode; - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - unsigned long flags; - int err; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - cfg = irq_cfg(irq); - - err = assign_irq_vector(irq, cfg, eligible_cpu); - if (err != 0) - return err; - - spin_lock_irqsave(&vector_lock, flags); - set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, - irq_name); - spin_unlock_irqrestore(&vector_lock, flags); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->vector = cfg->vector; - entry->delivery_mode = apic->irq_delivery_mode; - entry->dest_mode = apic->irq_dest_mode; - entry->polarity = 0; - entry->trigger = 0; - entry->mask = 0; - entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); - - if (cfg->move_in_progress) - send_cleanup_vector(cfg); - - return irq; -} - -/* - * Disable the specified MMR located on the specified blade so that MSIs are - * longer allowed to be sent. - */ -void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) -{ - unsigned long mmr_value; - struct uv_IO_APIC_route_entry *entry; - int mmr_pnode; - - BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); - - mmr_value = 0; - entry = (struct uv_IO_APIC_route_entry *)&mmr_value; - entry->mask = 1; - - mmr_pnode = uv_blade_to_pnode(mmr_blade); - uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); -} -#endif /* CONFIG_X86_64 */ - int __init io_apic_get_redir_entries (int ioapic) { union IO_APIC_reg_01 reg_01; @@ -3811,6 +3936,11 @@ void __init probe_nr_irqs_gsi(void) printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } +int get_nr_irqs_gsi(void) +{ + return nr_irqs_gsi; +} + #ifdef CONFIG_SPARSE_IRQ int __init arch_probe_nr_irqs(void) { @@ -3867,7 +3997,7 @@ static int __io_apic_set_pci_routing(str /* * IRQs < 16 are already in the irq_2_pin[] map */ - if (irq >= nr_legacy_irqs) { + if (irq >= legacy_pic->nr_legacy_irqs) { cfg = desc->chip_data; if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { printk(KERN_INFO "can not add pin %d for irq %d\n", @@ -4140,18 +4270,17 @@ void __init ioapic_init_mappings(void) #ifdef CONFIG_X86_32 fake_ioapic_page: #endif - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } set_fixmap_nocache(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); + apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK), + ioapic_phys); idx++; ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; ioapic_res++; } } @@ -4217,6 +4346,7 @@ static int bad_ioapic(unsigned long addr void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) { int idx = 0; + int entries; if (bad_ioapic(address)) return; @@ -4235,9 +4365,16 @@ void __init mp_register_ioapic(int id, u * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). */ + entries = io_apic_get_redir_entries(idx); mp_gsi_routing[idx].gsi_base = gsi_base; - mp_gsi_routing[idx].gsi_end = gsi_base + - io_apic_get_redir_entries(idx); + mp_gsi_routing[idx].gsi_end = gsi_base + entries; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + nr_ioapic_registers[idx] = entries + 1; + if (mp_gsi_routing[idx].gsi_end >= gsi_top) + gsi_top = mp_gsi_routing[idx].gsi_end + 1; printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, diff -uprN linux-2.6.32/arch/x86/kernel/apic/ipi.c linux-2.6.32.ovz/arch/x86/kernel/apic/ipi.c --- linux-2.6.32/arch/x86/kernel/apic/ipi.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/apic/ipi.c 2015-03-10 13:24:14.000000000 -0700 @@ -106,7 +106,7 @@ void default_send_IPI_mask_logical(const unsigned long mask = cpumask_bits(cpumask)[0]; unsigned long flags; - if (WARN_ONCE(!mask, "empty IPI mask")) + if (!mask) return; local_irq_save(flags); diff -uprN linux-2.6.32/arch/x86/kernel/apic/Makefile linux-2.6.32.ovz/arch/x86/kernel/apic/Makefile --- linux-2.6.32/arch/x86/kernel/apic/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/apic/Makefile 2015-03-10 13:24:02.000000000 -0700 @@ -2,7 +2,12 @@ # Makefile for local APIC drivers and for the IO-APIC code # -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o +ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) +obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o +endif +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o + obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff -uprN linux-2.6.32/arch/x86/kernel/apic/nmi.c linux-2.6.32.ovz/arch/x86/kernel/apic/nmi.c --- linux-2.6.32/arch/x86/kernel/apic/nmi.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/apic/nmi.c 2015-03-10 13:24:12.000000000 -0700 @@ -36,7 +36,6 @@ #include -int unknown_nmi_panic; int nmi_watchdog_enabled; static cpumask_t backtrace_mask __read_mostly; @@ -50,15 +49,77 @@ static cpumask_t backtrace_mask __read_m atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ EXPORT_SYMBOL(nmi_active); +#ifdef CONFIG_X86_64 +unsigned int nmi_watchdog = NMI_DEFAULT; +#else unsigned int nmi_watchdog = NMI_NONE; +#endif EXPORT_SYMBOL(nmi_watchdog); static int panic_on_timeout; static unsigned int nmi_hz = HZ; static DEFINE_PER_CPU(short, wd_enabled); +static DEFINE_PER_CPU(struct hrtimer, nmi_watchdog_hrtimer); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static int endflag __initdata; +static unsigned long get_sample_period(void) +{ + unsigned long count = NSEC_PER_SEC; + + /* + * for the NMI_LOCAL_APIC case: + * nmi_hz is set to the period of the nmi watchdog + * a timer is needed to match that frequency otherwise + * the nmi watchdog thinks the cpu is deadlock + * nb: the nmi watchdog can handle 5 misses before declaring a + * deadlock, so match the frequency in this case is ok + */ + if (nmi_watchdog == NMI_LOCAL_APIC) + do_div(count, nmi_hz); + + return count; +} + +/* watchdog kicker functions */ +static enum hrtimer_restart nmi_watchdog_timer_fn(struct hrtimer *hrtimer) +{ + /* kick the hardlockup detector */ + __get_cpu_var(hrtimer_interrupts)++; + + /* .. and repeat */ + hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); + + return HRTIMER_RESTART; +} + +#ifdef CONFIG_X86_64 +void __cpuinit nmi_watchdog_default(void) +{ + struct hrtimer *hrtimer = &__get_cpu_var(nmi_watchdog_hrtimer); + + if (nmi_watchdog != NMI_DEFAULT) + return; + /* if not specified, probe it */ + if (!lapic_watchdog_init(nmi_hz)) + nmi_watchdog = NMI_LOCAL_APIC; + else { + cpu_nmi_set_wd_enabled(); + nmi_watchdog = NMI_IO_APIC; + } + atomic_inc(&nmi_active); + + /* kick off hrtimers to use to determine if interrupts are working */ + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = nmi_watchdog_timer_fn; + hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), + HRTIMER_MODE_REL_PINNED); +} +#else +void __cpuinit nmi_watchdog_default(void) { return; } +#endif + static inline unsigned int get_nmi_count(int cpu) { return per_cpu(irq_stat, cpu).__nmi_count; @@ -78,8 +139,7 @@ static inline int mce_in_progress(void) */ static inline unsigned int get_timer_irqs(int cpu) { - return per_cpu(irq_stat, cpu).apic_timer_irqs + - per_cpu(irq_stat, cpu).irq0_irqs; + return per_cpu(hrtimer_interrupts, cpu); } #ifdef CONFIG_SMP @@ -106,6 +166,8 @@ static __init void nmi_cpu_busy(void *da static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count) { + struct hrtimer *hrtimer = &per_cpu(nmi_watchdog_hrtimer, cpu); + printk(KERN_CONT "\n"); printk(KERN_WARNING @@ -119,6 +181,7 @@ static void report_broken_nmi(int cpu, u per_cpu(wd_enabled, cpu) = 0; atomic_dec(&nmi_active); + hrtimer_cancel(hrtimer); } static void __acpi_nmi_disable(void *__unused) @@ -140,20 +203,20 @@ int __init check_nmi_watchdog(void) printk(KERN_INFO "Testing NMI watchdog ... "); + for_each_possible_cpu(cpu) + prev_nmi_count[cpu] = get_nmi_count(cpu); #ifdef CONFIG_SMP if (nmi_watchdog == NMI_LOCAL_APIC) smp_call_function(nmi_cpu_busy, (void *)&endflag, 0); #endif - for_each_possible_cpu(cpu) - prev_nmi_count[cpu] = get_nmi_count(cpu); local_irq_enable(); - mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */ + mdelay((40 * 1000) / nmi_hz); /* wait 40 ticks */ for_each_online_cpu(cpu) { if (!per_cpu(wd_enabled, cpu)) continue; - if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) + if (get_nmi_count(cpu) - prev_nmi_count[cpu] == 0) report_broken_nmi(cpu, prev_nmi_count); } endflag = 1; @@ -176,7 +239,7 @@ int __init check_nmi_watchdog(void) error: if (nmi_watchdog == NMI_IO_APIC) { if (!timer_through_8259) - disable_8259A_irq(0); + legacy_pic->chip->mask(0); on_each_cpu(__acpi_nmi_disable, NULL, 1); } @@ -309,6 +372,8 @@ void cpu_nmi_set_wd_enabled(void) void setup_apic_nmi_watchdog(void *unused) { + struct hrtimer *hrtimer = &__get_cpu_var(nmi_watchdog_hrtimer); + if (__get_cpu_var(wd_enabled)) return; @@ -327,16 +392,28 @@ void setup_apic_nmi_watchdog(void *unuse case NMI_IO_APIC: __get_cpu_var(wd_enabled) = 1; atomic_inc(&nmi_active); + + /* kick off hrtimers to use to determine if interrupts are working */ + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = nmi_watchdog_timer_fn; + hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), + HRTIMER_MODE_REL_PINNED); } } void stop_apic_nmi_watchdog(void *unused) { + struct hrtimer *hrtimer = &__get_cpu_var(nmi_watchdog_hrtimer); + /* only support LOCAL and IO APICs for now */ if (!nmi_watchdog_active()) return; if (__get_cpu_var(wd_enabled) == 0) return; + + /* disable the hrtimer */ + hrtimer_cancel(hrtimer); + if (nmi_watchdog == NMI_LOCAL_APIC) lapic_watchdog_stop(); else @@ -399,13 +476,6 @@ nmi_watchdog_tick(struct pt_regs *regs, int cpu = smp_processor_id(); int rc = 0; - /* check for other users first */ - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - rc = 1; - touched = 1; - } - sum = get_timer_irqs(cpu); if (__get_cpu_var(nmi_touch)) { @@ -438,7 +508,7 @@ nmi_watchdog_tick(struct pt_regs *regs, * wait a few IRQs (5 seconds) before doing the oops ... */ local_inc(&__get_cpu_var(alert_counter)); - if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz) + if (local_read(&__get_cpu_var(alert_counter)) == CONFIG_DEBUG_NMI_TIMEOUT * nmi_hz) /* * die_nmi will return ONLY if NOTIFY_STOP happens.. */ @@ -488,13 +558,6 @@ static void disable_ioapic_nmi_watchdog( on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); } -static int __init setup_unknown_nmi_panic(char *str) -{ - unknown_nmi_panic = 1; - return 1; -} -__setup("unknown_nmi_panic", setup_unknown_nmi_panic); - static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) { unsigned char reason = get_nmi_reason(); diff -uprN linux-2.6.32/arch/x86/kernel/apic/numaq_32.c linux-2.6.32.ovz/arch/x86/kernel/apic/numaq_32.c --- linux-2.6.32/arch/x86/kernel/apic/numaq_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/apic/numaq_32.c 2015-03-10 13:23:23.000000000 -0700 @@ -225,7 +225,7 @@ static void __init smp_read_mpc_oem(stru mpc_record = 0; printk(KERN_INFO - "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); + "Found an OEM MPC table at %8p - parsing it...\n", oemtable); if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { printk(KERN_WARNING @@ -547,6 +547,7 @@ struct apic __refdata apic_numaq = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff -uprN linux-2.6.32/arch/x86/kernel/apic/probe_32.c linux-2.6.32.ovz/arch/x86/kernel/apic/probe_32.c --- linux-2.6.32/arch/x86/kernel/apic/probe_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/apic/probe_32.c 2015-03-10 13:23:23.000000000 -0700 @@ -52,7 +52,32 @@ static int __init print_ipi_mode(void) } late_initcall(print_ipi_mode); -void default_setup_apic_routing(void) +void __init default_setup_apic_routing(void) +{ + int version = apic_version[boot_cpu_physical_apicid]; + + if (num_possible_cpus() > 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } + +#ifdef CONFIG_X86_BIGSMP + generic_bigsmp_probe(); +#endif + + if (apic->setup_apic_routing) + apic->setup_apic_routing(); +} + +static void setup_apic_flat_routing(void) { #ifdef CONFIG_X86_IO_APIC printk(KERN_INFO @@ -103,7 +128,7 @@ struct apic apic_default = { .init_apic_ldr = default_init_apic_ldr, .ioapic_phys_id_map = default_ioapic_phys_id_map, - .setup_apic_routing = default_setup_apic_routing, + .setup_apic_routing = setup_apic_flat_routing, .multi_timer_check = NULL, .apicid_to_node = default_apicid_to_node, .cpu_to_logical_apicid = default_cpu_to_logical_apicid, @@ -138,6 +163,7 @@ struct apic apic_default = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, @@ -153,7 +179,7 @@ extern struct apic apic_es7000_cluster; struct apic *apic = &apic_default; EXPORT_SYMBOL_GPL(apic); -static struct apic *apic_probe[] __initdata = { +struct apic *apic_probe[] __initdata = { #ifdef CONFIG_X86_NUMAQ &apic_numaq, #endif diff -uprN linux-2.6.32/arch/x86/kernel/apic/probe_64.c linux-2.6.32.ovz/arch/x86/kernel/apic/probe_64.c --- linux-2.6.32/arch/x86/kernel/apic/probe_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/apic/probe_64.c 2015-03-10 13:23:53.000000000 -0700 @@ -32,7 +32,7 @@ extern struct apic apic_x2apic_cluster; struct apic __read_mostly *apic = &apic_flat; EXPORT_SYMBOL_GPL(apic); -static struct apic *apic_probe[] __initdata = { +struct apic *apic_probe[] __initdata = { #ifdef CONFIG_X86_UV &apic_x2apic_uv_x, #endif @@ -41,6 +41,7 @@ static struct apic *apic_probe[] __initd &apic_x2apic_cluster, #endif &apic_physflat, + &apic_flat, /* must be last */ NULL, }; @@ -54,44 +55,23 @@ static int apicid_phys_pkg_id(int initia */ void __init default_setup_apic_routing(void) { -#ifdef CONFIG_X86_X2APIC - if (x2apic_mode -#ifdef CONFIG_X86_UV - && apic != &apic_x2apic_uv_x -#endif - ) { - if (x2apic_phys) - apic = &apic_x2apic_phys; - else - apic = &apic_x2apic_cluster; - } -#endif + int i; + + enable_IR_x2apic(); - if (apic == &apic_flat) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (num_processors > 8) - apic = &apic_physflat; + for (i = 0; apic_probe[i]; ++i) { + if (apic_probe[i]->probe()) { + apic = apic_probe[i]; break; - case X86_VENDOR_AMD: - if (max_physical_apicid >= 8) - apic = &apic_physflat; } } - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); + printk(KERN_INFO "APIC routing finalized to %s.\n", apic->name); if (is_vsmp_box()) { /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; } - - /* - * Now that apic routing model is selected, configure the - * fault handling for intr remapping. - */ - if (intr_remapping_enabled) - enable_drhd_fault_handling(); } /* Same for both flat and physical. */ diff -uprN linux-2.6.32/arch/x86/kernel/apic/summit_32.c linux-2.6.32.ovz/arch/x86/kernel/apic/summit_32.c --- linux-2.6.32/arch/x86/kernel/apic/summit_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/apic/summit_32.c 2015-03-10 13:23:23.000000000 -0700 @@ -561,6 +561,7 @@ struct apic apic_summit = { .read = native_apic_mem_read, .write = native_apic_mem_write, + .eoi_write = native_apic_mem_write, .icr_read = native_apic_icr_read, .icr_write = native_apic_icr_write, .wait_icr_idle = native_apic_wait_icr_idle, diff -uprN linux-2.6.32/arch/x86/kernel/apic/x2apic_cluster.c linux-2.6.32.ovz/arch/x86/kernel/apic/x2apic_cluster.c --- linux-2.6.32/arch/x86/kernel/apic/x2apic_cluster.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/apic/x2apic_cluster.c 2015-03-10 13:23:53.000000000 -0700 @@ -148,10 +148,7 @@ x2apic_cpu_mask_to_apicid_and(const stru break; } - if (cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_logical_apicid, cpu); - - return BAD_APICID; + return per_cpu(x86_cpu_to_logical_apicid, cpu); } static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) @@ -187,10 +184,15 @@ static void init_x2apic_ldr(void) per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); } +static int x2apic_cluster_probe(void) +{ + return x2apic_mode; +} + struct apic apic_x2apic_cluster = { .name = "cluster x2apic", - .probe = NULL, + .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, .apic_id_registered = x2apic_apic_id_registered, @@ -240,6 +242,7 @@ struct apic apic_x2apic_cluster = { .read = native_apic_msr_read, .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = native_x2apic_wait_icr_idle, diff -uprN linux-2.6.32/arch/x86/kernel/apic/x2apic_phys.c linux-2.6.32.ovz/arch/x86/kernel/apic/x2apic_phys.c --- linux-2.6.32/arch/x86/kernel/apic/x2apic_phys.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/apic/x2apic_phys.c 2015-03-10 13:23:53.000000000 -0700 @@ -19,12 +19,19 @@ static int set_x2apic_phys_mode(char *ar } early_param("x2apic_phys", set_x2apic_phys_mode); +static bool x2apic_fadt_phys(void) +{ + if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { + printk(KERN_DEBUG "System requires x2apic physical mode\n"); + return true; + } + return false; +} + static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - if (x2apic_phys) - return x2apic_enabled(); - else - return 0; + return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys()); } /* @@ -146,10 +153,7 @@ x2apic_cpu_mask_to_apicid_and(const stru break; } - if (cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - - return BAD_APICID; + return per_cpu(x86_cpu_to_apicid, cpu); } static unsigned int x2apic_phys_get_apic_id(unsigned long x) @@ -176,10 +180,18 @@ static void init_x2apic_ldr(void) { } +static int x2apic_phys_probe(void) +{ + if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys())) + return 1; + + return apic == &apic_x2apic_phys; +} + struct apic apic_x2apic_phys = { .name = "physical x2apic", - .probe = NULL, + .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, .apic_id_registered = x2apic_apic_id_registered, @@ -229,6 +241,7 @@ struct apic apic_x2apic_phys = { .read = native_apic_msr_read, .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = native_x2apic_wait_icr_idle, diff -uprN linux-2.6.32/arch/x86/kernel/apic/x2apic_uv_x.c linux-2.6.32.ovz/arch/x86/kernel/apic/x2apic_uv_x.c --- linux-2.6.32/arch/x86/kernel/apic/x2apic_uv_x.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/apic/x2apic_uv_x.c 2015-03-10 13:24:14.000000000 -0700 @@ -5,7 +5,7 @@ * * SGI UV APIC functions (note: not an Intel compatible APIC) * - * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 2007-2014 Silicon Graphics, Inc. All rights reserved. */ #include #include @@ -20,6 +20,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -30,33 +34,129 @@ #include #include #include +#include +#include +#include + +/* BMC sets a bit this MMR non-zero before sending an NMI */ +#define UVH_NMI_MMR UVH_SCRATCH5 +#define UVH_NMI_MMR_CLEAR (UVH_NMI_MMR + 8) +#define UV_NMI_PENDING_MASK (1UL << 63) DEFINE_PER_CPU(int, x2apic_extra_bits); +#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args) + static enum uv_system_type uv_system_type; +static u64 gru_start_paddr, gru_end_paddr; +static union uvh_apicid uvh_apicid; +int uv_min_hub_revision_id; +EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); +unsigned int uv_apicid_hibits; +EXPORT_SYMBOL_GPL(uv_apicid_hibits); +static DEFINE_SPINLOCK(uv_nmi_lock); -static int early_get_nodeid(void) +static unsigned long __init uv_early_read_mmr(unsigned long addr) { - union uvh_node_id_u node_id; - unsigned long *mmr; + unsigned long val, *mmr; - mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr)); - node_id.v = *mmr; + mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr)); + val = *mmr; early_iounmap(mmr, sizeof(*mmr)); - return node_id.s.node_id; + return val; +} + +static inline bool is_GRU_range(u64 start, u64 end) +{ + return start >= gru_start_paddr && end <= gru_end_paddr; +} + +static bool uv_is_untracked_pat_range(u64 start, u64 end) +{ + return is_ISA_range(start, end) || is_GRU_range(start, end); +} + +static int __init early_get_pnodeid(void) +{ + union uvh_node_id_u node_id; + union uvh_rh_gam_config_mmr_u m_n_config; + int pnode; + + /* Currently, all blades have same revision number */ + node_id.v = uv_early_read_mmr(UVH_NODE_ID); + m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR); + uv_min_hub_revision_id = node_id.s.revision; + + switch (node_id.s.part_number) { + case UV2_HUB_PART_NUMBER: + case UV2_HUB_PART_NUMBER_X: + uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1; + break; + case UV3_HUB_PART_NUMBER: + case UV3_HUB_PART_NUMBER_X: + uv_min_hub_revision_id += UV3_HUB_REVISION_BASE; + break; + } + + uv_hub_info_hub_revision = uv_min_hub_revision_id; + pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); + return pnode; } +/* + * Add an extra bit as dictated by bios to the destination apicid of + * interrupts potentially passing through the UV HUB. This prevents + * a deadlock between interrupts and IO port operations. + */ +static void __init uv_set_apicid_hibit(void) +{ + union uv1h_lb_target_physical_apic_id_mask_u apicid_mask; + + if (is_uv1_hub()) { + apicid_mask.v = + uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK); + uv_apicid_hibits = + apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK; + } +} + +static void __init early_get_apic_pnode_shift(void) +{ + uvh_apicid.v = uv_early_read_mmr(UVH_APICID); + if (!uvh_apicid.v) + /* + * Old bios, use default value + */ + uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT; +} + + static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - if (!strcmp(oem_id, "SGI")) { + int pnodeid, is_uv1, is_uv2, is_uv3; + + is_uv1 = !strcmp(oem_id, "SGI"); + is_uv2 = !strcmp(oem_id, "SGI2"); + is_uv3 = !strncmp(oem_id, "SGI3", 4); /* there are varieties of UV3 */ + if (is_uv1 || is_uv2 || is_uv3) { + uv_hub_info_hub_revision = + (is_uv1 ? UV1_HUB_REVISION_BASE : + (is_uv2 ? UV2_HUB_REVISION_BASE : + UV3_HUB_REVISION_BASE)); + usevirtefi = 1; /* Use virtual EFI mode on SGI systems*/ + pnodeid = early_get_pnodeid(); + early_get_apic_pnode_shift(); + x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; + x86_platform.nmi_init = uv_nmi_init; if (!strcmp(oem_table_id, "UVL")) uv_system_type = UV_LEGACY_APIC; else if (!strcmp(oem_table_id, "UVX")) uv_system_type = UV_X2APIC; else if (!strcmp(oem_table_id, "UVH")) { __get_cpu_var(x2apic_extra_bits) = - early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1); + pnodeid << uvh_apicid.s.pnode_shift; uv_system_type = UV_NON_UNIQUE_APIC; + uv_set_apicid_hibit(); return 1; } } @@ -77,8 +177,12 @@ EXPORT_SYMBOL_GPL(is_uv_system); DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); +DEFINE_PER_CPU(struct uv_hub_info_extra_s, __uv_hub_info_extra); +EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info_extra); + struct uv_blade_info *uv_blade_info; EXPORT_SYMBOL_GPL(uv_blade_info); +static struct uv_blade_info_nmi *uv_blade_info_nmi; short *uv_node_to_blade; EXPORT_SYMBOL_GPL(uv_node_to_blade); @@ -92,11 +196,9 @@ EXPORT_SYMBOL_GPL(uv_possible_blades); unsigned long sn_rtc_cycles_per_second; EXPORT_SYMBOL(sn_rtc_cycles_per_second); -/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ - static const struct cpumask *uv_target_cpus(void) { - return cpumask_of(0); + return cpu_online_mask; } static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) @@ -112,12 +214,12 @@ static int __cpuinit uv_wakeup_secondary int pnode; pnode = uv_apicid_to_pnode(phys_apicid); + phys_apicid |= uv_apicid_hibits; val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | APIC_DM_INIT; uv_write_global_mmr64(pnode, UVH_IPI_INT, val); - mdelay(10); val = (1UL << UVH_IPI_INT_SEND_SHFT) | (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | @@ -193,7 +295,7 @@ static unsigned int uv_cpu_mask_to_apici int cpu = cpumask_first(cpumask); if ((unsigned)cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); + return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; else return BAD_APICID; } @@ -212,10 +314,7 @@ uv_cpu_mask_to_apicid_and(const struct c if (cpumask_test_cpu(cpu, cpu_online_mask)) break; } - if (cpu < nr_cpu_ids) - return per_cpu(x86_cpu_to_apicid, cpu); - - return BAD_APICID; + return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits; } static unsigned int x2apic_get_apic_id(unsigned long x) @@ -253,10 +352,15 @@ static void uv_send_IPI_self(int vector) apic_write(APIC_SELF_IPI, vector); } +static int uv_probe(void) +{ + return apic == &apic_x2apic_uv_x; +} + struct apic __refdata apic_x2apic_uv_x = { .name = "UV large system", - .probe = NULL, + .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, .apic_id_registered = uv_apic_id_registered, @@ -307,6 +411,7 @@ struct apic __refdata apic_x2apic_uv_x = .read = native_apic_msr_read, .write = native_apic_msr_write, + .eoi_write = native_apic_msr_eoi_write, .icr_read = native_x2apic_icr_read, .icr_write = native_x2apic_icr_write, .wait_icr_idle = native_x2apic_wait_icr_idle, @@ -315,7 +420,7 @@ struct apic __refdata apic_x2apic_uv_x = static __cpuinit void set_x2apic_extra_bits(int pnode) { - __get_cpu_var(x2apic_extra_bits) = (pnode << 6); + __get_cpu_var(x2apic_extra_bits) = (pnode << uvh_apicid.s.pnode_shift); } /* @@ -339,14 +444,28 @@ struct redir_addr { #define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT static __initdata struct redir_addr redir_addrs[] = { - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG}, - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG}, - {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR}, + {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR}, }; +static unsigned char get_n_lshift(int m_val) +{ + union uv3h_gr0_gam_gr_config_u m_gr_config; + + if (is_uv1_hub()) + return m_val; + + if (is_uv2_hub()) + return m_val == 40 ? 40 : 39; + + m_gr_config.v = uv_read_local_mmr(UV3H_GR0_GAM_GR_CONFIG); + return m_gr_config.s3.m_skt; +} + static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) { - union uvh_si_alias0_overlay_config_u alias; + union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias; union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; int i; @@ -364,29 +483,37 @@ static __init void get_lowmem_redirect(u enum map_type {map_wb, map_uc}; -static __init void map_high(char *id, unsigned long base, int shift, - int max_pnode, enum map_type map_type) +static __init void map_high(char *id, unsigned long base, int pshift, + int bshift, int max_pnode, enum map_type map_type) { unsigned long bytes, paddr; - paddr = base << shift; - bytes = (1UL << shift) * (max_pnode + 1); - printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, - paddr + bytes); + paddr = base << pshift; + bytes = (1UL << bshift) * (max_pnode + 1); + if (!paddr) { + pr_info("UV: Map %s_HI base address NULL\n", id); + return; + } + pr_info("UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes); if (map_type == map_uc) init_extra_mapping_uc(paddr, bytes); else init_extra_mapping_wb(paddr, bytes); - } + static __init void map_gru_high(int max_pnode) { union uvh_rh_gam_gru_overlay_config_mmr_u gru; int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); - if (gru.s.enable) - map_high("GRU", gru.s.base, shift, max_pnode, map_wb); + if (gru.s.enable) { + map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb); + gru_start_paddr = ((u64)gru.s.base << shift); + gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1); + } else { + pr_info("UV: GRU disabled\n"); + } } static __init void map_mmr_high(int max_pnode) @@ -396,17 +523,154 @@ static __init void map_mmr_high(int max_ mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); if (mmr.s.enable) - map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); + map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc); + else + pr_info("UV: MMR disabled\n"); } -static __init void map_mmioh_high(int max_pnode) +/* + * This commonality works because both 0 & 1 versions of the MMIOH OVERLAY + * and REDIRECT MMR regs are exactly the same on UV3. + */ +struct mmioh_config { + unsigned long overlay; + unsigned long redirect; + char *id; +}; + +static __initdata struct mmioh_config mmiohs[] = { + { + UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR, + UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR, + "MMIOH0" + }, + { + UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG1_MMR, + UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG1_MMR, + "MMIOH1" + }, +}; + +static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) +{ + union uv3h_rh_gam_mmioh_overlay_config0_mmr_u overlay; + unsigned long mmr; + unsigned long base; + int i, n, shift, m_io, max_io; + int nasid, lnasid, fi, li; + char *id; + + id = mmiohs[index].id; + overlay.v = uv_read_local_mmr(mmiohs[index].overlay); + pr_info("UV: %s overlay 0x%lx base:0x%x m_io:%d\n", + id, overlay.v, overlay.s3.base, overlay.s3.m_io); + if (!overlay.s3.enable) { + pr_info("UV: %s disabled\n", id); + return; + } + + shift = UV3H_RH_GAM_MMIOH_OVERLAY_CONFIG0_MMR_BASE_SHFT; + base = (unsigned long)overlay.s3.base; + m_io = overlay.s3.m_io; + mmr = mmiohs[index].redirect; + n = UV3H_RH_GAM_MMIOH_REDIRECT_CONFIG0_MMR_DEPTH; + min_pnode *= 2; /* convert to NASID */ + max_pnode *= 2; + max_io = lnasid = fi = li = -1; + + for (i = 0; i < n; i++) { + union uv3h_rh_gam_mmioh_redirect_config0_mmr_u redirect; + + redirect.v = uv_read_local_mmr(mmr + i * 8); + nasid = redirect.s3.nasid; + if (nasid < min_pnode || max_pnode < nasid) + nasid = -1; /* invalid NASID */ + + if (nasid == lnasid) { + li = i; + if (i != n-1) /* last entry check */ + continue; + } + + /* check if we have a cached (or last) redirect to print */ + if (lnasid != -1 || (i == n-1 && nasid != -1)) { + unsigned long addr1, addr2; + int f, l; + + if (lnasid == -1) { + f = l = i; + lnasid = nasid; + } else { + f = fi; + l = li; + } + addr1 = (base << shift) + + f * (unsigned long)(1 << m_io); + addr2 = (base << shift) + + (l + 1) * (unsigned long)(1 << m_io); + pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", + id, fi, li, lnasid, addr1, addr2); + if (max_io < l) + max_io = l; + } + fi = li = i; + lnasid = nasid; + } + + pr_info("UV: %s base:0x%lx shift:%d M_IO:%d MAX_IO:%d\n", + id, base, shift, m_io, max_io); + + if (max_io >= 0) + map_high(id, base, shift, m_io, max_io, map_uc); +} + +static __init void map_mmioh_high(int min_pnode, int max_pnode) { union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; - int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + unsigned long mmr, base; + int shift, enable, m_io, n_io; - mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); - if (mmioh.s.enable) - map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); + if (is_uv3_hub()) { + /* Map both MMIOH Regions */ + map_mmioh_high_uv3(0, min_pnode, max_pnode); + map_mmioh_high_uv3(1, min_pnode, max_pnode); + return; + } + + if (is_uv1_hub()) { + mmr = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; + shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + mmioh.v = uv_read_local_mmr(mmr); + enable = !!mmioh.s1.enable; + base = mmioh.s1.base; + m_io = mmioh.s1.m_io; + n_io = mmioh.s1.n_io; + } else if (is_uv2_hub()) { + mmr = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR; + shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; + mmioh.v = uv_read_local_mmr(mmr); + enable = !!mmioh.s2.enable; + base = mmioh.s2.base; + m_io = mmioh.s2.m_io; + n_io = mmioh.s2.n_io; + } else + return; + + if (enable) { + max_pnode &= (1 << n_io) - 1; + pr_info( + "UV: base:0x%lx shift:%d N_IO:%d M_IO:%d max_pnode:0x%x\n", + base, shift, m_io, n_io, max_pnode); + map_high("MMIOH", base, shift, m_io, max_pnode, map_uc); + } else { + pr_info("UV: MMIOH disabled\n"); + } +} + +static __init void map_low_mmrs(void) +{ + init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE); + init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE); } static __init void uv_rtc_init(void) @@ -452,7 +716,7 @@ static void uv_heartbeat(unsigned long i static void __cpuinit uv_heartbeat_enable(int cpu) { - if (!uv_cpu_hub_info(cpu)->scir.enabled) { + while (!uv_cpu_hub_info(cpu)->scir.enabled) { struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); @@ -460,11 +724,10 @@ static void __cpuinit uv_heartbeat_enabl timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; add_timer_on(timer, cpu); uv_cpu_hub_info(cpu)->scir.enabled = 1; - } - /* check boot cpu */ - if (!uv_cpu_hub_info(0)->scir.enabled) - uv_heartbeat_enable(0); + /* also ensure that boot cpu is enabled */ + cpu = 0; + } } #ifdef CONFIG_HOTPLUG_CPU @@ -523,6 +786,30 @@ late_initcall(uv_init_heartbeat); #endif /* !CONFIG_HOTPLUG_CPU */ +/* Direct Legacy VGA I/O traffic to designated IOH */ +int uv_set_vga_state(struct pci_dev *pdev, bool decode, + unsigned int command_bits, u32 flags) +{ + int domain, bus, rc; + + PR_DEVEL("devfn %x decode %d cmd %x flags %d\n", + pdev->devfn, decode, command_bits, flags); + + if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE)) + return 0; + + if ((command_bits & PCI_COMMAND_IO) == 0) + return 0; + + domain = pci_domain_nr(pdev->bus); + bus = pdev->bus->number; + + rc = uv_bios_set_legacy_vga_target(decode, domain, bus); + PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc); + + return rc; +} + /* * Called on each cpu to initialize the per_cpu UV data area. * FIXME: hotplug not supported yet @@ -539,40 +826,162 @@ void __cpuinit uv_cpu_init(void) set_x2apic_extra_bits(uv_hub_info->pnode); } +/* + * When NMI is received, print a stack trace. + */ +int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) +{ + unsigned long real_uv_nmi; + int bid; + int cnt = 0; + + if (reason != DIE_NMIUNKNOWN) + return NOTIFY_OK; + + if (in_crash_kexec) + /* do nothing if entering the crash kernel */ + return NOTIFY_OK; + + /* + * Each blade has an MMR that indicates when an NMI has been sent + * to cpus on the blade. If an NMI is detected, atomically + * clear the MMR and update a per-blade NMI count used to + * cause each cpu on the blade to notice a new NMI. + */ + bid = uv_numa_blade_id(); + spin_lock(&uv_blade_info_nmi[bid].nmi_lock); + + real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK); + if (unlikely(real_uv_nmi)) { + uv_blade_info_nmi[bid].nmi_count++; + if (uv_blade_info_nmi[bid].nmi_count == + uv_blade_info[bid].nr_online_cpus) { + uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK); + uv_blade_info_nmi[bid].nmi_count = 0; + } + } + spin_unlock(&uv_blade_info_nmi[bid].nmi_lock); + if (likely(! real_uv_nmi)) + return NOTIFY_OK; + + /* + * Use a lock so only one cpu prints at a time. + * This prevents intermixed output. + * + * On large systems (> 1024 cpus) the time required to dump all the + * the stacks exceeds the threshold for the nmi_watchdog() which can + * then trigger another round of NMIs garbling the output and bringing + * the system to its knees. + * + * Since we only reach this code from 'power nmi' at the CMC of the + * UV system, and we only expect that sequence from a customer in + * response to a machine which is seriously hung, this code's + * performance is not a concern. + * + * So, we just want each cpu waiting for the lock to disable its own + * watchdog timer until it has printed its own stack. Currently, + * touch_nmi_watchdog() loops over active CPUs but Don is trying to + * push a change in that behavior upstream. + * + * Since performance is not a concern, we just loop about 5 seconds, + * using udelay(1) accounts for variation in processor speed. + * Usually we will grab the lock and exit the loop. On a large + * system some cpus will reach the timeout and reset the nmi watchdog + * timers. + * + * Note: the timeout upstream is 10secs and the timeout in RHEL is + * 60secs. + */ + + while (!spin_trylock(&uv_nmi_lock)) { + udelay(1); + if (++cnt > 5000000) { + touch_nmi_watchdog(); + cnt = 0; + } + } + pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id()); + dump_stack(); + spin_unlock(&uv_nmi_lock); + + return NOTIFY_STOP; +} + +static struct notifier_block uv_dump_stack_nmi_nb = { + .notifier_call = uv_handle_nmi, + .priority = NMI_LOCAL_LOW_PRIOR - 1, +}; + +void uv_register_nmi_notifier(void) +{ + if (register_die_notifier(&uv_dump_stack_nmi_nb)) + printk(KERN_WARNING "UV NMI handler failed to register\n"); +} + +void uv_nmi_init(void) +{ + unsigned int value; + + /* + * Unmask NMI on all cpus + */ + value = apic_read(APIC_LVT1) | APIC_DM_NMI; + value &= ~APIC_LVT_MASKED; + apic_write(APIC_LVT1, value); +} void __init uv_system_init(void) { - union uvh_si_addr_map_config_u m_n_config; + union uvh_rh_gam_config_mmr_u m_n_config; union uvh_node_id_u node_id; unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; - int gnode_extra, max_pnode = 0; + int gnode_extra, min_pnode = 999999, max_pnode = -1; unsigned long mmr_base, present, paddr; unsigned short pnode_mask; + unsigned char n_lshift; + char *hub = (is_uv1_hub() ? "UV1" : + (is_uv2_hub() ? "UV2" : + "UV3")); - m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); + pr_info("UV: Found %s hub\n", hub); + map_low_mmrs(); + + m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); m_val = m_n_config.s.m_skt; n_val = m_n_config.s.n_skt; + pnode_mask = (1 << n_val) - 1; + n_lshift = get_n_lshift(m_val); mmr_base = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & ~UV_MMR_ENABLE; - pnode_mask = (1 << n_val) - 1; + node_id.v = uv_read_local_mmr(UVH_NODE_ID); gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; gnode_upper = ((unsigned long)gnode_extra << m_val); - printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", - n_val, m_val, gnode_upper, gnode_extra); + pr_info("UV: N:%d M:%d pnode_mask:0x%x gnode_upper/extra:0x%lx/0x%x n_lshift 0x%x\n", + n_val, m_val, pnode_mask, gnode_upper, gnode_extra, + n_lshift); - printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); + pr_info("UV: global MMR base 0x%lx\n", mmr_base); for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++) uv_possible_blades += hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8)); - printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); + + /* uv_num_possible_blades() is really the hub count */ + pr_info("UV: Found %d blades, %d hubs\n", + is_uv1_hub() ? uv_num_possible_blades() : + (uv_num_possible_blades() + 1) / 2, + uv_num_possible_blades()); bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); - uv_blade_info = kmalloc(bytes, GFP_KERNEL); + uv_blade_info = kzalloc(bytes, GFP_KERNEL); BUG_ON(!uv_blade_info); + bytes = sizeof(struct uv_blade_info_nmi) * uv_num_possible_blades(); + uv_blade_info_nmi = kzalloc(bytes, GFP_KERNEL); + BUG_ON(!uv_blade_info_nmi); + for (blade = 0; blade < uv_num_possible_blades(); blade++) uv_blade_info[blade].memory_nid = -1; @@ -594,21 +1003,40 @@ void __init uv_system_init(void) for (j = 0; j < 64; j++) { if (!test_bit(j, &present)) continue; - uv_blade_info[blade].pnode = (i * 64 + j); + pnode = (i * 64 + j) & pnode_mask; + uv_blade_info[blade].pnode = pnode; uv_blade_info[blade].nr_possible_cpus = 0; uv_blade_info[blade].nr_online_cpus = 0; + spin_lock_init(&uv_blade_info_nmi[blade].nmi_lock); + uv_blade_info_nmi[blade].nmi_count = 0; + min_pnode = min(pnode, min_pnode); + max_pnode = max(pnode, max_pnode); blade++; } } uv_bios_init(); - uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, - &sn_coherency_id, &sn_region_size); + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id, + &sn_region_size, &system_serial_number); uv_rtc_init(); for_each_present_cpu(cpu) { + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + nid = cpu_to_node(cpu); - pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); + /* + * apic_pnode_shift must be set before calling uv_apicid_to_pnode(); + */ + uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; + uv_cpu_hub_info_apic_pnode_shift(cpu) = uvh_apicid.s.pnode_shift; + uv_cpu_hub_info_hub_revision(cpu) = uv_hub_info_hub_revision; + + uv_cpu_hub_info_m_shift(cpu) = 64 - m_val; + uv_cpu_hub_info_n_lshift(cpu) = n_lshift; + + for (i = 0; i < UV_HUB_INFO_EXTRA_FIELDS; i++) + uv_cpu_hub_info_extra(cpu)->future[i] = 0; + pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); lcpu = uv_blade_info[blade].nr_possible_cpus; uv_blade_info[blade].nr_possible_cpus++; @@ -623,21 +1051,14 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->numa_blade_id = blade; uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; uv_cpu_hub_info(cpu)->pnode = pnode; - uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; - uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; + uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid); uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; - max_pnode = max(pnode, max_pnode); - - printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " - "lcpu %d, blade %d\n", - cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid, - lcpu, blade); } /* Add blade/pnode info for nodes without cpus */ @@ -645,18 +1066,27 @@ void __init uv_system_init(void) if (uv_node_to_blade[nid] >= 0) continue; paddr = node_start_pfn(nid) << PAGE_SHIFT; - paddr = uv_soc_phys_ram_to_gpa(paddr); - pnode = (paddr >> m_val) & pnode_mask; + pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr)); blade = boot_pnode_to_blade(pnode); uv_node_to_blade[nid] = blade; - max_pnode = max(pnode, max_pnode); } map_gru_high(max_pnode); map_mmr_high(max_pnode); - map_mmioh_high(max_pnode); + map_mmioh_high(min_pnode, max_pnode); uv_cpu_init(); uv_scir_register_cpu_notifier(); + uv_register_nmi_notifier(); proc_mkdir("sgi_uv", NULL); + + /* register Legacy VGA I/O redirection handler */ + pci_register_set_vga_state(uv_set_vga_state); + + /* + * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as + * EFI is not enabled in the kdump kernel. + */ + if (is_kdump_kernel()) + reboot_type = BOOT_ACPI; } diff -uprN linux-2.6.32/arch/x86/kernel/apm_32.c linux-2.6.32.ovz/arch/x86/kernel/apm_32.c --- linux-2.6.32/arch/x86/kernel/apm_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/apm_32.c 2015-03-10 13:21:31.000000000 -0700 @@ -1994,8 +1994,8 @@ static int __init apm_is_horked_d850md(c apm_info.disabled = 1; printk(KERN_INFO "%s machine detected. " "Disabling APM.\n", d->ident); - printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); - printk(KERN_INFO "download from support.intel.com \n"); + printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n"); + printk(KERN_INFO "download from support.intel.com\n"); } return 0; } diff -uprN linux-2.6.32/arch/x86/kernel/bios_uv.c linux-2.6.32.ovz/arch/x86/kernel/bios_uv.c --- linux-2.6.32/arch/x86/kernel/bios_uv.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/bios_uv.c 2015-03-10 13:21:37.000000000 -0700 @@ -15,8 +15,8 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * Copyright (c) Russ Anderson + * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson */ #include @@ -30,6 +30,7 @@ static struct uv_systab uv_systab; s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) { struct uv_systab *tab = &uv_systab; + s64 ret; if (!tab->function) /* @@ -37,9 +38,11 @@ s64 uv_bios_call(enum uv_bios_cmd which, */ return BIOS_STATUS_UNIMPLEMENTED; - return efi_call6((void *)__va(tab->function), - (u64)which, a1, a2, a3, a4, a5); + ret = efi_call6((void *)__va(tab->function), (u64)which, + a1, a2, a3, a4, a5); + return ret; } +EXPORT_SYMBOL_GPL(uv_bios_call); s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) @@ -73,11 +76,14 @@ long sn_coherency_id; EXPORT_SYMBOL_GPL(sn_coherency_id); long sn_region_size; EXPORT_SYMBOL_GPL(sn_region_size); +long system_serial_number; +EXPORT_SYMBOL_GPL(system_serial_number); int uv_type; +EXPORT_SYMBOL_GPL(uv_type); s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, - long *region) + long *region, long *ssn) { s64 ret; u64 v0, v1; @@ -97,25 +103,24 @@ s64 uv_bios_get_sn_info(int fc, int *uvt *coher = part.coherence_id; if (region) *region = part.region_size; + if (ssn) + *ssn = v1; return ret; } +EXPORT_SYMBOL_GPL(uv_bios_get_sn_info); int -uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size, +uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size, unsigned long *intr_mmr_offset) { - union uv_watchlist_u size_blade; u64 watchlist; s64 ret; - size_blade.size = mq_size; - size_blade.blade = blade; - /* * bios returns watchlist number or negative error number. */ ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, - size_blade.val, (u64)intr_mmr_offset, + mq_size, (u64)intr_mmr_offset, (u64)&watchlist, 0); if (ret < BIOS_STATUS_SUCCESS) return ret; @@ -158,6 +163,25 @@ s64 uv_bios_freq_base(u64 clock_type, u6 } EXPORT_SYMBOL_GPL(uv_bios_freq_base); +/* + * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target + * @decode: true to enable target, false to disable target + * @domain: PCI domain number + * @bus: PCI bus number + * + * Returns: + * 0: Success + * -EINVAL: Invalid domain or bus number + * -ENOSYS: Capability not available + * -EBUSY: Legacy VGA I/O cannot be retargeted at this time + */ +int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus) +{ + return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET, + (u64)decode, (u64)domain, (u64)bus, 0, 0); +} +EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target); + #ifdef CONFIG_EFI void uv_bios_init(void) @@ -189,4 +213,3 @@ void uv_bios_init(void) void uv_bios_init(void) { } #endif - diff -uprN linux-2.6.32/arch/x86/kernel/cpu/addon_cpuid_features.c linux-2.6.32.ovz/arch/x86/kernel/cpu/addon_cpuid_features.c --- linux-2.6.32/arch/x86/kernel/cpu/addon_cpuid_features.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/addon_cpuid_features.c 2015-06-23 04:16:16.000000000 -0700 @@ -14,6 +14,7 @@ struct cpuid_bit { u8 reg; u8 bit; u32 level; + u32 sub_leaf; }; enum cpuid_regs { @@ -30,9 +31,25 @@ void __cpuinit init_scattered_cpuid_feat const struct cpuid_bit *cb; static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { - { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, - { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, - { 0, 0, 0, 0 } + { X86_FEATURE_DTHERM, CR_EAX, 0, 0x00000006, 0 }, + { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, + { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, + { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, + { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, + { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, + { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 }, + { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 }, + { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 }, + { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 }, + { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 }, + { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 }, + { 0, 0, 0, 0, 0 } }; for (cb = cpuid_bits; cb->feature; cb++) { @@ -43,12 +60,20 @@ void __cpuinit init_scattered_cpuid_feat max_level > (cb->level | 0xffff)) continue; - cpuid(cb->level, ®s[CR_EAX], ®s[CR_EBX], - ®s[CR_ECX], ®s[CR_EDX]); + cpuid_count(cb->level, cb->sub_leaf, ®s[CR_EAX], + ®s[CR_EBX], ®s[CR_ECX], ®s[CR_EDX]); if (regs[cb->reg] & (1 << cb->bit)) set_cpu_cap(c, cb->feature); } + + /* + * common AMD/Intel features + */ + if (c->cpuid_level >= 6) { + if (cpuid_ecx(6) & 0x1) + set_cpu_cap(c, X86_FEATURE_APERFMPERF); + } } /* leaf 0xb SMT level */ @@ -74,6 +99,7 @@ void __cpuinit detect_extended_topology( unsigned int eax, ebx, ecx, edx, sub_index; unsigned int ht_mask_width, core_plus_mask_width; unsigned int core_select_mask, core_level_siblings; + static bool printed; if (c->cpuid_level < 0xb) return; @@ -127,12 +153,14 @@ void __cpuinit detect_extended_topology( c->x86_max_cores = (core_level_siblings / smp_num_siblings); - - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); + if (!printed) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + if (c->x86_max_cores > 1) + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + printed = 1; + } return; #endif } diff -uprN linux-2.6.32/arch/x86/kernel/cpu/amd.c linux-2.6.32.ovz/arch/x86/kernel/cpu/amd.c --- linux-2.6.32/arch/x86/kernel/cpu/amd.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/amd.c 2015-06-23 04:16:16.000000000 -0700 @@ -253,60 +253,55 @@ static int __cpuinit nearby_node(int api #endif /* - * Fixup core topology information for AMD multi-node processors. - * Assumption 1: Number of cores in each internal node is the same. - * Assumption 2: Mixed systems with both single-node and dual-node - * processors are not supported. + * Fixup core topology information for + * (1) AMD multi-node processors + * Assumption: Number of cores in each internal node is the same. + * (2) AMD processors supporting compute units */ #ifdef CONFIG_X86_HT -static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) +static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c) { -#ifdef CONFIG_PCI - u32 t, cpn; - u8 n, n_id; + u32 nodes, cores_per_cu = 1; + u8 node_id; int cpu = smp_processor_id(); - /* fixup topology information only once for a core */ - if (cpu_has(c, X86_FEATURE_AMD_DCM)) + /* get information required for multi-node processors */ + if (cpu_has_topoext) { + u32 eax, ebx, ecx, edx; + + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + nodes = ((ecx >> 8) & 7) + 1; + node_id = ecx & 7; + + /* get compute unit information */ + smp_num_siblings = ((ebx >> 8) & 3) + 1; + c->compute_unit_id = ebx & 0xff; + cores_per_cu += ((ebx >> 8) & 3); + } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { + u64 value; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + nodes = ((value >> 3) & 7) + 1; + node_id = value & 7; + } else return; - /* check for multi-node processor on boot cpu */ - t = read_pci_config(0, 24, 3, 0xe8); - if (!(t & (1 << 29))) - return; - - set_cpu_cap(c, X86_FEATURE_AMD_DCM); - - /* cores per node: each internal node has half the number of cores */ - cpn = c->x86_max_cores >> 1; - - /* even-numbered NB_id of this dual-node processor */ - n = c->phys_proc_id << 1; - - /* - * determine internal node id and assign cores fifty-fifty to - * each node of the dual-node processor - */ - t = read_pci_config(0, 24 + n, 3, 0xe8); - n = (t>>30) & 0x3; - if (n == 0) { - if (c->cpu_core_id < cpn) - n_id = 0; - else - n_id = 1; - } else { - if (c->cpu_core_id < cpn) - n_id = 1; - else - n_id = 0; + /* fixup multi-node processor information */ + if (nodes > 1) { + u32 cores_per_node; + u32 cus_per_node; + + set_cpu_cap(c, X86_FEATURE_AMD_DCM); + cores_per_node = c->x86_max_cores / nodes; + cus_per_node = cores_per_node / cores_per_cu; + + /* store NodeID, use llc_shared_map to store sibling info */ + per_cpu(cpu_llc_id, cpu) = node_id; + + /* core id has to be in the [0 .. cores_per_node - 1] range */ + c->cpu_core_id %= cores_per_node; + c->compute_unit_id %= cus_per_node; } - - /* compute entire NodeID, use llc_shared_map to store sibling info */ - per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id; - - /* fixup core id to be in range from 0 to cpn */ - c->cpu_core_id = c->cpu_core_id % cpn; -#endif } #endif @@ -327,9 +322,7 @@ static void __cpuinit amd_detect_cmp(str c->phys_proc_id = c->initial_apicid >> bits; /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; - /* fixup topology information on multi-node processors */ - if ((c->x86 == 0x10) && (c->x86_model == 9)) - amd_fixup_dcm(c); + amd_get_topology(c); #endif } @@ -375,8 +368,6 @@ static void __cpuinit srat_detect_node(s node = nearby_node(apicid); } numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); #endif } @@ -406,6 +397,21 @@ static void __cpuinit early_init_amd_mc( #endif } +static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c) +{ + if (c->x86 == 0x15) { + unsigned long upperbit; + u32 cpuid, assoc; + + cpuid = cpuid_edx(0x80000005); + assoc = cpuid >> 16 & 0xff; + upperbit = ((cpuid >> 24) << 10) / assoc; + + va_align.mask = (upperbit - 1) & PAGE_MASK; + va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; + } +} + static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { early_init_amd_mc(c); @@ -437,6 +443,23 @@ static void __cpuinit early_init_amd(str set_cpu_cap(c, X86_FEATURE_EXTD_APICID); } #endif + + /* We need to do the following only once */ + if (c != &boot_cpu_data) + return; + + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { + + if (c->x86 > 0x10 || + (c->x86 == 0x10 && c->x86_model >= 0x2)) { + u64 val; + + rdmsrl(MSR_K7_HWCR, val); + if (!(val & BIT(24))) + printk(KERN_WARNING FW_BUG "TSC doesn't count " + "with P0 frequency!\n"); + } + } } static void __cpuinit init_amd(struct cpuinfo_x86 *c) @@ -491,7 +514,7 @@ static void __cpuinit init_amd(struct cp } } - if (c->x86 == 0x10 || c->x86 == 0x11) + if (c->x86 >= 0x10) set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* get apicid instead of initial apic id from cpuid */ @@ -535,7 +558,25 @@ static void __cpuinit init_amd(struct cp } } - display_cacheinfo(c); + /* re-enable TopologyExtensions if switched off by BIOS */ + if ((c->x86 == 0x15) && + (c->x86_model >= 0x10) && (c->x86_model <= 0x1f) && + !cpu_has_topoext) { + u64 val; + + if (!rdmsrl_safe(0xc0011005, &val)) { + val |= 1ULL << 54; + checking_wrmsrl(0xc0011005, val); + rdmsrl(0xc0011005, val); + if (val & (1ULL << 54)) { + set_cpu_cap(c, X86_FEATURE_TOPOEXT); + printk(KERN_INFO FW_INFO "CPU: Re-enabling " + "disabled Topology Extensions Support\n"); + } + } + } + + cpu_detect_cache_sizes(c); /* Multi core CPU? */ if (c->extended_cpuid_level >= 0x80000008) { @@ -554,7 +595,7 @@ static void __cpuinit init_amd(struct cp num_cache_leaves = 3; } - if (c->x86 >= 0xf && c->x86 <= 0x11) + if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has_xmm2) { @@ -571,7 +612,7 @@ static void __cpuinit init_amd(struct cp fam10h_check_enable_mmcfg(); } - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { + if (c == &boot_cpu_data && c->x86 >= 0xf) { unsigned long long tseg; /* @@ -590,6 +631,32 @@ static void __cpuinit init_amd(struct cp } } #endif + + /* + * Family 0x12 and above processors have APIC timer + * running in deep C states. + */ + if (c->x86 > 0x11) + set_cpu_cap(c, X86_FEATURE_ARAT); + + /* + * Disable GART TLB Walk Errors on Fam10h. We do this here + * because this is always needed when GART is enabled, even in a + * kernel which has no MCE support built in. + */ + if (c->x86 == 0x10) { + /* + * BIOS should disable GartTlbWlk Errors themself. If + * it doesn't do it here as suggested by the BKDG. + * + * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 + */ + u64 mask; + + rdmsrl(MSR_AMD64_MCx_MASK(4), mask); + mask |= (1 << 10); + wrmsrl(MSR_AMD64_MCx_MASK(4), mask); + } } #ifdef CONFIG_X86_32 @@ -629,6 +696,7 @@ static const struct cpu_dev __cpuinitcon .c_size_cache = amd_size_cache, #endif .c_early_init = early_init_amd, + .c_bsp_init = bsp_init_amd, .c_init = init_amd, .c_x86_vendor = X86_VENDOR_AMD, }; diff -uprN linux-2.6.32/arch/x86/kernel/cpu/bugs.c linux-2.6.32.ovz/arch/x86/kernel/cpu/bugs.c --- linux-2.6.32/arch/x86/kernel/cpu/bugs.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/bugs.c 2015-03-10 13:23:13.000000000 -0700 @@ -86,7 +86,7 @@ static void __init check_fpu(void) static void __init check_hlt(void) { - if (paravirt_enabled()) + if (boot_cpu_data.x86 >= 5 || paravirt_enabled()) return; printk(KERN_INFO "Checking 'hlt' instruction... "); diff -uprN linux-2.6.32/arch/x86/kernel/cpu/centaur.c linux-2.6.32.ovz/arch/x86/kernel/cpu/centaur.c --- linux-2.6.32/arch/x86/kernel/cpu/centaur.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/centaur.c 2015-03-10 13:23:16.000000000 -0700 @@ -294,7 +294,7 @@ static void __cpuinit init_c3(struct cpu set_cpu_cap(c, X86_FEATURE_REP_GOOD); } - display_cacheinfo(c); + cpu_detect_cache_sizes(c); } enum { diff -uprN linux-2.6.32/arch/x86/kernel/cpu/common.c linux-2.6.32.ovz/arch/x86/kernel/cpu/common.c --- linux-2.6.32/arch/x86/kernel/cpu/common.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/common.c 2015-06-23 04:16:16.000000000 -0700 @@ -13,8 +13,8 @@ #include #include -#include #include +#include #include #include #include @@ -39,12 +39,17 @@ #include #endif +#ifdef CONFIG_XEN +#include +#endif + #include "cpu.h" /* all of these masks are initialized in setup_cpu_local_masks() */ cpumask_var_t cpu_initialized_mask; cpumask_var_t cpu_callout_mask; cpumask_var_t cpu_callin_mask; +cpumask_var_t cpu_may_complete_boot_mask; /* representing cpus for which sibling maps can be computed */ cpumask_var_t cpu_sibling_setup_mask; @@ -56,12 +61,13 @@ void __init setup_cpu_local_masks(void) alloc_bootmem_cpumask_var(&cpu_callin_mask); alloc_bootmem_cpumask_var(&cpu_callout_mask); alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask); + alloc_bootmem_cpumask_var(&cpu_may_complete_boot_mask); } static void __cpuinit default_init(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 - display_cacheinfo(c); + cpu_detect_cache_sizes(c); #else /* Not much we can do here... */ /* Check if at least it has cpuid */ @@ -140,10 +146,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); static int __init x86_xsave_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); return 1; } __setup("noxsave", x86_xsave_setup); +static int __init x86_xsaveopt_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + return 1; +} +__setup("noxsaveopt", x86_xsaveopt_setup); + #ifdef CONFIG_X86_32 static int cachesize_override __cpuinitdata = -1; static int disable_x86_serial_nr __cpuinitdata = 1; @@ -246,6 +260,25 @@ static inline void squash_the_stupid_ser } #endif +static int disable_smep __cpuinitdata; +static __init int setup_disable_smep(char *arg) +{ + disable_smep = 1; + return 1; +} +__setup("nosmep", setup_disable_smep); + +static __cpuinit void setup_smep(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_SMEP)) { + if (unlikely(disable_smep)) { + setup_clear_cpu_cap(X86_FEATURE_SMEP); + clear_in_cr4(X86_CR4_SMEP); + } else + set_in_cr4(X86_CR4_SMEP); + } +} + /* * Some CPU features depend on higher CPUID levels, which may not always * be available due to CPUID level capping or broken virtualization @@ -256,6 +289,53 @@ struct cpuid_dependent_feature { u32 level; }; +#ifdef CONFIG_XEN +struct xen_dangerous_feature { + u32 feature; + u32 allowed_major; +}; + +static const struct xen_dangerous_feature __cpuinitconst +xen_dangerous_cpuid_features[] = { + /* Mask out GBPAGES & RDTSCP for Xen BZ#703055 */ + { X86_FEATURE_GBPAGES, 4 }, + { X86_FEATURE_RDTSCP, 4 }, + /* Mask out features masked by BZ#712131 */ + { X86_FEATURE_MWAIT, 4 }, + /* Mask out features masked by BZ#711317 */ + { X86_FEATURE_CONSTANT_TSC, 4 }, + { X86_FEATURE_NONSTOP_TSC, 4 }, + /* Mask out features masked by BZ#752382 */ + { X86_FEATURE_SMEP, 4 }, + { 0, 0 } +}; + +static void __cpuinit fltr_xen_cpuid_features(struct cpuinfo_x86 *c, bool warn) +{ + const struct xen_dangerous_feature *df; + static u32 major = 0; + + if (!major) + major = xen_version() >> 16; + + for (df = xen_dangerous_cpuid_features; df->feature; df++) { + + if (!cpu_has(c, df->feature)) + continue; + + if (df->allowed_major && major >= df->allowed_major) + continue; + + clear_cpu_cap(c, df->feature); + if (!warn) + continue; + + pr_warning("CPU: CPU feature %s disabled on xen guest\n", + x86_cap_flags[df->feature]); + } +} +#endif /* CONFIG_XEN */ + static const struct cpuid_dependent_feature __cpuinitconst cpuid_dependent_features[] = { { X86_FEATURE_MWAIT, 0x00000005 }, @@ -292,6 +372,15 @@ static void __cpuinit filter_cpuid_featu "CPU: CPU feature %s disabled, no CPUID level 0x%x\n", x86_cap_flags[df->feature], df->level); } + +#ifdef CONFIG_XEN + /* + * RHEL Xen HVM guests must filter out additional not masked + * by old kernel-xen features, to avoid crashes. + */ + if (xen_cpuid_base() != 0) + fltr_xen_cpuid_features(c, warn); +#endif /* CONFIG_XEN */ } /* @@ -322,8 +411,8 @@ static const char *__cpuinit table_looku return NULL; /* Not found */ } -__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata; -__u32 cpu_caps_set[NCAPINTS] __cpuinitdata; +__u32 cpu_caps_cleared[RHNCAPINTS] __cpuinitdata; +__u32 cpu_caps_set[RHNCAPINTS] __cpuinitdata; void load_percpu_segment(int cpu) { @@ -383,7 +472,7 @@ static void __cpuinit get_model_name(str } } -void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) +void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c) { unsigned int n, dummy, ebx, ecx, edx, l2size; @@ -391,8 +480,6 @@ void __cpuinit display_cacheinfo(struct if (n >= 0x80000005) { cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); c->x86_cache_size = (ecx>>24) + (edx>>24); #ifdef CONFIG_X86_64 /* On K8 L1 TLB is inclusive, so don't count it */ @@ -422,9 +509,6 @@ void __cpuinit display_cacheinfo(struct #endif c->x86_cache_size = l2size; - - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - l2size, ecx & 0xFF); } void __cpuinit detect_ht(struct cpuinfo_x86 *c) @@ -432,6 +516,7 @@ void __cpuinit detect_ht(struct cpuinfo_ #ifdef CONFIG_X86_HT u32 eax, ebx, ecx, edx; int index_msb, core_bits; + static bool printed; if (!cpu_has(c, X86_FEATURE_HT)) return; @@ -447,7 +532,7 @@ void __cpuinit detect_ht(struct cpuinfo_ smp_num_siblings = (ebx & 0xff0000) >> 16; if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); + printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n"); goto out; } @@ -474,11 +559,12 @@ void __cpuinit detect_ht(struct cpuinfo_ ((1 << core_bits) - 1); out: - if ((c->x86_max_cores * smp_num_siblings) > 1) { + if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) { printk(KERN_INFO "CPU: Physical Processor ID: %d\n", c->phys_proc_id); printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); + printed = 1; } #endif } @@ -554,6 +640,28 @@ static void __cpuinit get_cpu_cap(struct c->x86_capability[4] = excap; } + /* Additional Intel-defined flags: level 0x00000007 */ + if (c->cpuid_level >= 0x00000007) { + u32 eax, ebx, ecx, edx; + struct cpuinfo_x86_rh *rh = get_cpuinfo_x86_rh(c); + + cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); + + /* write into "word 9" of the rh extended capability area */ + rh->x86_capability[9 - NCAPINTS] = ebx; + } + + /* Extended state features: level 0x0000000d */ + if (c->cpuid_level >= 0x0000000d) { + u32 eax, ebx, ecx, edx; + struct cpuinfo_x86_rh *rh = get_cpuinfo_x86_rh(c); + + cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx); + + /* write into "word 10" of the rh extended capability area */ + rh->x86_capability[10 - NCAPINTS] = eax; + } + /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; @@ -578,7 +686,6 @@ static void __cpuinit get_cpu_cap(struct if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); - } static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -618,6 +725,8 @@ static void __cpuinit identify_cpu_witho */ static void __init early_identify_cpu(struct cpuinfo_x86 *c) { + struct cpuinfo_x86_rh *rh = get_cpuinfo_x86_rh(c); + #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; c->x86_phys_bits = 36; @@ -630,6 +739,7 @@ static void __init early_identify_cpu(st c->x86_cache_alignment = c->x86_clflush_size; memset(&c->x86_capability, 0, sizeof c->x86_capability); + memset(&rh->x86_capability, 0, sizeof rh->x86_capability); c->extended_cpuid_level = 0; if (!have_cpuid_p()) @@ -652,6 +762,11 @@ static void __init early_identify_cpu(st c->cpu_index = boot_cpu_id; #endif filter_cpuid_features(c, false); + + setup_smep(c); + + if (this_cpu->c_bsp_init) + this_cpu->c_bsp_init(c); } void __init early_cpu_init(void) @@ -737,6 +852,7 @@ static void __cpuinit generic_identify(s static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; + struct cpuinfo_x86_rh *rh = get_cpuinfo_x86_rh(c); c->loops_per_jiffy = loops_per_jiffy; c->x86_cache_size = -1; @@ -758,6 +874,7 @@ static void __cpuinit identify_cpu(struc #endif c->x86_cache_alignment = c->x86_clflush_size; memset(&c->x86_capability, 0, sizeof c->x86_capability); + memset(&rh->x86_capability, 0, sizeof rh->x86_capability); generic_identify(c); @@ -769,6 +886,10 @@ static void __cpuinit identify_cpu(struc c->x86_capability[i] &= ~cpu_caps_cleared[i]; c->x86_capability[i] |= cpu_caps_set[i]; } + for (i = NCAPINTS; i < RHNCAPINTS; i++) { + rh->x86_capability[i - NCAPINTS] &= ~cpu_caps_cleared[i]; + rh->x86_capability[i - NCAPINTS] |= cpu_caps_set[i]; + } #ifdef CONFIG_X86_64 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); @@ -798,6 +919,22 @@ static void __cpuinit identify_cpu(struc /* Filter out anything that depends on CPUID levels we don't have */ filter_cpuid_features(c, true); + setup_smep(c); + + /* + * emulation of NX with segment limits unfortunately means + * we have to disable the fast system calls, due to the way that + * sysexit clears the segment limits on return. + * If we have either disabled exec-shield on the boot command line, + * or we have NX, then we don't need to do this. + */ + if (exec_shield != 0) { +#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) + if (!test_cpu_cap(c, X86_FEATURE_NX)) +#endif + clear_cpu_cap(c, X86_FEATURE_SEP); + } + /* If the model name is still unset, do table lookup. */ if (!c->x86_model_id[0]) { const char *p; @@ -815,6 +952,7 @@ static void __cpuinit identify_cpu(struc #endif init_hypervisor(c); + x86_init_rdrand(c); /* * Clear/Set all flags overriden by options, need do it @@ -824,6 +962,10 @@ static void __cpuinit identify_cpu(struc c->x86_capability[i] &= ~cpu_caps_cleared[i]; c->x86_capability[i] |= cpu_caps_set[i]; } + for (i = NCAPINTS; i < RHNCAPINTS; i++) { + rh->x86_capability[i - NCAPINTS] &= ~cpu_caps_cleared[i]; + rh->x86_capability[i - NCAPINTS] |= cpu_caps_set[i]; + } /* * On SMP, boot_cpu_data holds the common feature set between @@ -835,11 +977,14 @@ static void __cpuinit identify_cpu(struc /* AND the already accumulated flags with these */ for (i = 0; i < NCAPINTS; i++) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; + for (i = 0; i < (RHNCAPINTS - NCAPINTS); i++) + boot_cpu_data_rh.x86_capability[i] &= + rh->x86_capability[i]; } #ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ - mcheck_init(c); + mcheck_cpu_init(c); #endif select_idle_routine(c); @@ -869,7 +1014,6 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif - init_hw_perf_events(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -971,7 +1115,7 @@ static __init int setup_disablecpuid(cha { int bit; - if (get_option(&arg, &bit) && bit < NCAPINTS*32) + if (get_option(&arg, &bit) && bit < RHNCAPINTS*32) setup_clear_cpu_cap(bit); else return 0; @@ -1115,7 +1259,7 @@ void __cpuinit cpu_init(void) if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) panic("CPU#%d already initialized!\n", cpu); - printk(KERN_INFO "Initializing CPU#%d\n", cpu); + pr_debug("Initializing CPU#%d\n", cpu); clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); @@ -1127,7 +1271,7 @@ void __cpuinit cpu_init(void) switch_to_new_gdt(cpu); loadsegment(fs, 0); - load_idt((const struct desc_ptr *)&idt_descr); + load_current_idt(); memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); syscall_init(); @@ -1213,7 +1357,7 @@ void __cpuinit cpu_init(void) if (cpu_has_vme || cpu_has_tsc || cpu_has_de) clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - load_idt(&idt_descr); + load_current_idt(); switch_to_new_gdt(cpu); /* @@ -1241,10 +1385,7 @@ void __cpuinit cpu_init(void) /* * Force FPU initialization: */ - if (cpu_has_xsave) - current_thread_info()->status = TS_XSAVE; - else - current_thread_info()->status = 0; + current_thread_info()->status = 0; clear_used_math(); mxcsr_feature_mask_init(); diff -uprN linux-2.6.32/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c --- linux-2.6.32/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c 2015-03-10 13:22:51.000000000 -0700 @@ -33,7 +33,6 @@ #include #include #include -#include #include #include @@ -45,6 +44,7 @@ #include #include #include +#include "mperf.h" #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ "acpi-cpufreq", msg) @@ -70,8 +70,6 @@ struct acpi_cpufreq_data { static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); -static DEFINE_PER_CPU(struct aperfmperf, old_perf); - /* acpi_perf_data is a pointer to percpu data. */ static struct acpi_processor_performance *acpi_perf_data; @@ -237,45 +235,6 @@ static u32 get_cur_val(const struct cpum return cmd.val; } -/* Called via smp_call_function_single(), on the target CPU */ -static void read_measured_perf_ctrs(void *_cur) -{ - struct aperfmperf *am = _cur; - - get_aperfmperf(am); -} - -/* - * Return the measured active (C0) frequency on this CPU since last call - * to this function. - * Input: cpu number - * Return: Average CPU frequency in terms of max frequency (zero on error) - * - * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance - * over a period of time, while CPU is in C0 state. - * IA32_MPERF counts at the rate of max advertised frequency - * IA32_APERF counts at the rate of actual CPU frequency - * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and - * no meaning should be associated with absolute values of these MSRs. - */ -static unsigned int get_measured_perf(struct cpufreq_policy *policy, - unsigned int cpu) -{ - struct aperfmperf perf; - unsigned long ratio; - unsigned int retval; - - if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) - return 0; - - ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf); - per_cpu(old_perf, cpu) = perf; - - retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; - - return retval; -} - static unsigned int get_cur_freq_on_cpu(unsigned int cpu) { struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); @@ -361,8 +320,6 @@ static int acpi_cpufreq_target(struct cp } } - trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); - switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; @@ -698,8 +655,8 @@ static int acpi_cpufreq_cpu_init(struct acpi_processor_notify_smm(THIS_MODULE); /* Check for APERF/MPERF support in hardware */ - if (cpu_has(c, X86_FEATURE_APERFMPERF)) - acpi_cpufreq_driver.getavg = get_measured_perf; + if (boot_cpu_has(X86_FEATURE_APERFMPERF)) + acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf; dprintk("CPU%u - ACPI performance management activated.\n", cpu); for (i = 0; i < perf->state_count; i++) @@ -800,7 +757,7 @@ static void __exit acpi_cpufreq_exit(voi cpufreq_unregister_driver(&acpi_cpufreq_driver); - free_percpu(acpi_perf_data); + free_acpi_perf_data(); } module_param(acpi_pstate_strict, uint, 0644); diff -uprN linux-2.6.32/arch/x86/kernel/cpu/cpufreq/intel_pstate.c linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/intel_pstate.c --- linux-2.6.32/arch/x86/kernel/cpu/cpufreq/intel_pstate.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/intel_pstate.c 2015-03-10 13:24:13.000000000 -0700 @@ -0,0 +1,1048 @@ +/* + * intel_pstate.c: Native P state management for Intel processors + * + * (C) Copyright 2012 Intel Corporation + * Author: Dirk Brandewie + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define SAMPLE_COUNT 3 + +#define BYT_RATIOS 0x66a +#define BYT_VIDS 0x66b +#define BYT_TURBO_RATIOS 0x66c + + +#define FRAC_BITS 6 +#define int_tofp(X) ((int64_t)(X) << FRAC_BITS) +#define fp_toint(X) ((X) >> FRAC_BITS) +#define FP_ROUNDUP(X) ((X) += 1 << FRAC_BITS) + +static inline int32_t mul_fp(int32_t x, int32_t y) +{ + return ((int64_t)x * (int64_t)y) >> FRAC_BITS; +} + +static inline int32_t div_fp(int32_t x, int32_t y) +{ + return div_s64((int64_t)x << FRAC_BITS, (int64_t)y); +} + +struct sample { + int32_t core_pct_busy; + u64 aperf; + u64 mperf; + unsigned long long tsc; + int freq; +}; + +struct pstate_data { + int current_pstate; + int min_pstate; + int max_pstate; + int turbo_pstate; +}; + +struct vid_data { + int32_t min; + int32_t max; + int32_t ratio; +}; + +struct _pid { + int setpoint; + int32_t integral; + int32_t p_gain; + int32_t i_gain; + int32_t d_gain; + int deadband; + int32_t last_err; +}; + +struct cpudata { + int cpu; + + char name[64]; + + struct timer_list timer; + + struct pstate_data pstate; + struct vid_data vid; + struct _pid pid; + + u64 prev_aperf; + u64 prev_mperf; + unsigned long long prev_tsc; + struct sample sample; +}; + +static struct cpudata **all_cpu_data; +struct pstate_adjust_policy { + int sample_rate_ms; + int deadband; + int setpoint; + int p_gain_pct; + int d_gain_pct; + int i_gain_pct; +}; + +struct pstate_funcs { + int (*get_max)(void); + int (*get_min)(void); + int (*get_turbo)(void); + void (*set)(struct cpudata*, int pstate); + void (*get_vid)(struct cpudata *); +}; + +struct cpu_defaults { + struct pstate_adjust_policy pid_policy; + struct pstate_funcs funcs; +}; + +static struct pstate_adjust_policy pid_params; +static struct pstate_funcs pstate_funcs; + +struct perf_limits { + int no_turbo; + int max_perf_pct; + int min_perf_pct; + int32_t max_perf; + int32_t min_perf; + int max_policy_pct; + int max_sysfs_pct; +}; + +static struct perf_limits limits = { + .no_turbo = 0, + .max_perf_pct = 100, + .max_perf = int_tofp(1), + .min_perf_pct = 0, + .min_perf = 0, + .max_policy_pct = 100, + .max_sysfs_pct = 100, +}; + +static inline void pid_reset(struct _pid *pid, int setpoint, int busy, + int deadband, int integral) { + pid->setpoint = setpoint; + pid->deadband = deadband; + pid->integral = int_tofp(integral); + pid->last_err = int_tofp(setpoint) - int_tofp(busy); +} + +static inline void pid_p_gain_set(struct _pid *pid, int percent) +{ + pid->p_gain = div_fp(int_tofp(percent), int_tofp(100)); +} + +static inline void pid_i_gain_set(struct _pid *pid, int percent) +{ + pid->i_gain = div_fp(int_tofp(percent), int_tofp(100)); +} + +static inline void pid_d_gain_set(struct _pid *pid, int percent) +{ + + pid->d_gain = div_fp(int_tofp(percent), int_tofp(100)); +} + +static signed int pid_calc(struct _pid *pid, int32_t busy) +{ + signed int result; + int32_t pterm, dterm, fp_error; + int32_t integral_limit; + + fp_error = int_tofp(pid->setpoint) - busy; + + if (abs(fp_error) <= int_tofp(pid->deadband)) + return 0; + + pterm = mul_fp(pid->p_gain, fp_error); + + pid->integral += fp_error; + + /* limit the integral term */ + integral_limit = int_tofp(30); + if (pid->integral > integral_limit) + pid->integral = integral_limit; + if (pid->integral < -integral_limit) + pid->integral = -integral_limit; + + dterm = mul_fp(pid->d_gain, fp_error - pid->last_err); + pid->last_err = fp_error; + + result = pterm + mul_fp(pid->integral, pid->i_gain) + dterm; + + return (signed int)fp_toint(result); +} + +static inline void intel_pstate_busy_pid_reset(struct cpudata *cpu) +{ + pid_p_gain_set(&cpu->pid, pid_params.p_gain_pct); + pid_d_gain_set(&cpu->pid, pid_params.d_gain_pct); + pid_i_gain_set(&cpu->pid, pid_params.i_gain_pct); + + pid_reset(&cpu->pid, + pid_params.setpoint, + 100, + pid_params.deadband, + 0); +} + +static inline void intel_pstate_reset_all_pid(void) +{ + unsigned int cpu; + for_each_online_cpu(cpu) { + if (all_cpu_data[cpu]) + intel_pstate_busy_pid_reset(all_cpu_data[cpu]); + } +} + +/************************** debugfs begin ************************/ +static int pid_param_set(void *data, u64 val) +{ + *(u32 *)data = val; + intel_pstate_reset_all_pid(); + return 0; +} +static int pid_param_get(void *data, u64 *val) +{ + *val = *(u32 *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_pid_param, pid_param_get, + pid_param_set, "%llu\n"); + +struct pid_param { + char *name; + void *value; +}; + +static struct pid_param pid_files[] = { + {"sample_rate_ms", &pid_params.sample_rate_ms}, + {"d_gain_pct", &pid_params.d_gain_pct}, + {"i_gain_pct", &pid_params.i_gain_pct}, + {"deadband", &pid_params.deadband}, + {"setpoint", &pid_params.setpoint}, + {"p_gain_pct", &pid_params.p_gain_pct}, + {NULL, NULL} +}; + +static struct dentry *debugfs_parent; +static void intel_pstate_debug_expose_params(void) +{ + int i = 0; + + debugfs_parent = debugfs_create_dir("pstate_snb", NULL); + if (IS_ERR_OR_NULL(debugfs_parent)) + return; + while (pid_files[i].name) { + debugfs_create_file(pid_files[i].name, 0660, + debugfs_parent, pid_files[i].value, + &fops_pid_param); + i++; + } +} + +/************************** debugfs end ************************/ + +/************************** sysfs begin ************************/ +#define show_one(file_name, object) \ + static ssize_t show_##file_name \ + (struct kobject *kobj, struct attribute *attr, char *buf) \ + { \ + return sprintf(buf, "%u\n", limits.object); \ + } + +static ssize_t store_no_turbo(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + limits.no_turbo = clamp_t(int, input, 0 , 1); + + return count; +} + +static ssize_t store_max_perf_pct(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + + limits.max_sysfs_pct = clamp_t(int, input, 0 , 100); + limits.max_perf_pct = min(limits.max_policy_pct, limits.max_sysfs_pct); + limits.max_perf = div_fp(int_tofp(limits.max_perf_pct), int_tofp(100)); + return count; +} + +static ssize_t store_min_perf_pct(struct kobject *a, struct attribute *b, + const char *buf, size_t count) +{ + unsigned int input; + int ret; + ret = sscanf(buf, "%u", &input); + if (ret != 1) + return -EINVAL; + limits.min_perf_pct = clamp_t(int, input, 0 , 100); + limits.min_perf = div_fp(int_tofp(limits.min_perf_pct), int_tofp(100)); + + return count; +} + +show_one(no_turbo, no_turbo); +show_one(max_perf_pct, max_perf_pct); +show_one(min_perf_pct, min_perf_pct); + +define_one_global_rw(no_turbo); +define_one_global_rw(max_perf_pct); +define_one_global_rw(min_perf_pct); + +static struct attribute *intel_pstate_attributes[] = { + &no_turbo.attr, + &max_perf_pct.attr, + &min_perf_pct.attr, + NULL +}; + +static struct attribute_group intel_pstate_attr_group = { + .attrs = intel_pstate_attributes, +}; +static struct kobject *intel_pstate_kobject; + +static void intel_pstate_sysfs_expose_params(void) +{ + int rc; + + intel_pstate_kobject = kobject_create_and_add("intel_pstate", + &cpu_sysdev_class.kset.kobj); + + BUG_ON(!intel_pstate_kobject); + rc = sysfs_create_group(intel_pstate_kobject, + &intel_pstate_attr_group); + BUG_ON(rc); +} + +/************************** sysfs end ************************/ +static int byt_get_min_pstate(void) +{ + u64 value; + rdmsrl(BYT_RATIOS, value); + return (value >> 8) & 0xFF; +} + +static int byt_get_max_pstate(void) +{ + u64 value; + rdmsrl(BYT_RATIOS, value); + return (value >> 16) & 0xFF; +} + +static int byt_get_turbo_pstate(void) +{ + u64 value; + rdmsrl(BYT_TURBO_RATIOS, value); + return value & 0x3F; +} + +static void byt_set_pstate(struct cpudata *cpudata, int pstate) +{ + u64 val; + int32_t vid_fp; + u32 vid; + + val = pstate << 8; + if (limits.no_turbo) + val |= (u64)1 << 32; + + vid_fp = cpudata->vid.min + mul_fp( + int_tofp(pstate - cpudata->pstate.min_pstate), + cpudata->vid.ratio); + + vid_fp = clamp_t(int32_t, vid_fp, cpudata->vid.min, cpudata->vid.max); + vid = fp_toint(vid_fp); + + val |= vid; + + wrmsrl(MSR_IA32_PERF_CTL, val); +} + +static void byt_get_vid(struct cpudata *cpudata) +{ + u64 value; + + rdmsrl(BYT_VIDS, value); + cpudata->vid.min = int_tofp((value >> 8) & 0x7f); + cpudata->vid.max = int_tofp((value >> 16) & 0x7f); + cpudata->vid.ratio = div_fp( + cpudata->vid.max - cpudata->vid.min, + int_tofp(cpudata->pstate.max_pstate - + cpudata->pstate.min_pstate)); +} + + +static int core_get_min_pstate(void) +{ + u64 value; + rdmsrl(MSR_PLATFORM_INFO, value); + return (value >> 40) & 0xFF; +} + +static int core_get_max_pstate(void) +{ + u64 value; + rdmsrl(MSR_PLATFORM_INFO, value); + return (value >> 8) & 0xFF; +} + +static int core_get_turbo_pstate(void) +{ + u64 value; + int nont, ret; + rdmsrl(MSR_NHM_TURBO_RATIO_LIMIT, value); + nont = core_get_max_pstate(); + ret = ((value) & 255); + if (ret <= nont) + ret = nont; + return ret; +} + +static void core_set_pstate(struct cpudata *cpudata, int pstate) +{ + u64 val; + + val = pstate << 8; + if (limits.no_turbo) + val |= (u64)1 << 32; + + wrmsrl_on_cpu(cpudata->cpu, MSR_IA32_PERF_CTL, val); +} + +static struct cpu_defaults core_params = { + .pid_policy = { + .sample_rate_ms = 10, + .deadband = 0, + .setpoint = 97, + .p_gain_pct = 20, + .d_gain_pct = 0, + .i_gain_pct = 0, + }, + .funcs = { + .get_max = core_get_max_pstate, + .get_min = core_get_min_pstate, + .get_turbo = core_get_turbo_pstate, + .set = core_set_pstate, + }, +}; + +static struct cpu_defaults byt_params = { + .pid_policy = { + .sample_rate_ms = 10, + .deadband = 0, + .setpoint = 97, + .p_gain_pct = 14, + .d_gain_pct = 0, + .i_gain_pct = 4, + }, + .funcs = { + .get_max = byt_get_max_pstate, + .get_min = byt_get_min_pstate, + .get_turbo = byt_get_turbo_pstate, + .set = byt_set_pstate, + .get_vid = byt_get_vid, + }, +}; + + +static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max) +{ + int max_perf = cpu->pstate.turbo_pstate; + int max_perf_adj; + int min_perf; + if (limits.no_turbo) + max_perf = cpu->pstate.max_pstate; + + max_perf_adj = fp_toint(mul_fp(int_tofp(max_perf), limits.max_perf)); + *max = clamp_t(int, max_perf_adj, + cpu->pstate.min_pstate, cpu->pstate.turbo_pstate); + + min_perf = fp_toint(mul_fp(int_tofp(max_perf), limits.min_perf)); + *min = clamp_t(int, min_perf, + cpu->pstate.min_pstate, max_perf); +} + +static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate) +{ + int max_perf, min_perf; + + intel_pstate_get_min_max(cpu, &min_perf, &max_perf); + + pstate = clamp_t(int, pstate, min_perf, max_perf); + + if (pstate == cpu->pstate.current_pstate) + return; + +#ifndef MODULE + trace_cpu_frequency(pstate * 100000, cpu->cpu); +#endif + cpu->pstate.current_pstate = pstate; + + pstate_funcs.set(cpu, pstate); +} + +static inline void intel_pstate_pstate_increase(struct cpudata *cpu, int steps) +{ + int target; + target = cpu->pstate.current_pstate + steps; + + intel_pstate_set_pstate(cpu, target); +} + +static inline void intel_pstate_pstate_decrease(struct cpudata *cpu, int steps) +{ + int target; + target = cpu->pstate.current_pstate - steps; + intel_pstate_set_pstate(cpu, target); +} + +static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) +{ + sprintf(cpu->name, "Intel 2nd generation core"); + + cpu->pstate.min_pstate = pstate_funcs.get_min(); + cpu->pstate.max_pstate = pstate_funcs.get_max(); + cpu->pstate.turbo_pstate = pstate_funcs.get_turbo(); + + if (pstate_funcs.get_vid) + pstate_funcs.get_vid(cpu); + + /* + * goto max pstate so we don't slow up boot if we are built-in if we are + * a module we will take care of it during normal operation + */ + intel_pstate_set_pstate(cpu, cpu->pstate.max_pstate); +} + +static inline void intel_pstate_calc_busy(struct cpudata *cpu, + struct sample *sample) +{ + int32_t core_pct; + int32_t c0_pct; + + core_pct = div_fp(int_tofp((sample->aperf)), + int_tofp((sample->mperf))); + core_pct = mul_fp(core_pct, int_tofp(100)); + FP_ROUNDUP(core_pct); + + c0_pct = div_fp(int_tofp(sample->mperf), int_tofp(sample->tsc)); + + sample->freq = fp_toint( + mul_fp(int_tofp(cpu->pstate.max_pstate * 1000), core_pct)); + + sample->core_pct_busy = mul_fp(core_pct, c0_pct); +} + +static inline void intel_pstate_sample(struct cpudata *cpu) +{ + u64 aperf, mperf; + unsigned long long tsc; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + tsc = native_read_tsc(); + + aperf = aperf >> FRAC_BITS; + mperf = mperf >> FRAC_BITS; + tsc = tsc >> FRAC_BITS; + + cpu->sample.aperf = aperf; + cpu->sample.mperf = mperf; + cpu->sample.tsc = tsc; + cpu->sample.aperf -= cpu->prev_aperf; + cpu->sample.mperf -= cpu->prev_mperf; + cpu->sample.tsc -= cpu->prev_tsc; + + intel_pstate_calc_busy(cpu, &cpu->sample); + + cpu->prev_aperf = aperf; + cpu->prev_mperf = mperf; + cpu->prev_tsc = tsc; +} + +static inline void intel_pstate_set_sample_time(struct cpudata *cpu) +{ + int sample_time, delay; + + sample_time = pid_params.sample_rate_ms; + delay = msecs_to_jiffies(sample_time); + mod_timer_pinned(&cpu->timer, jiffies + delay); +} + +static inline int32_t intel_pstate_get_scaled_busy(struct cpudata *cpu) +{ + int32_t core_busy, max_pstate, current_pstate; + + core_busy = cpu->sample.core_pct_busy; + max_pstate = int_tofp(cpu->pstate.max_pstate); + current_pstate = int_tofp(cpu->pstate.current_pstate); + core_busy = mul_fp(core_busy, div_fp(max_pstate, current_pstate)); + return FP_ROUNDUP(core_busy); +} + +static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu) +{ + int32_t busy_scaled; + struct _pid *pid; + signed int ctl = 0; + int steps; + + pid = &cpu->pid; + busy_scaled = intel_pstate_get_scaled_busy(cpu); + + ctl = pid_calc(pid, busy_scaled); + + steps = abs(ctl); + + if (ctl < 0) + intel_pstate_pstate_increase(cpu, steps); + else + intel_pstate_pstate_decrease(cpu, steps); +} + +static void intel_pstate_timer_func(unsigned long __data) +{ + struct cpudata *cpu = (struct cpudata *) __data; + struct sample *sample; + + intel_pstate_sample(cpu); + + sample = &cpu->sample; + + intel_pstate_adjust_busy_pstate(cpu); + +#ifndef MODULE + trace_pstate_sample(fp_toint(sample->core_pct_busy), + fp_toint(intel_pstate_get_scaled_busy(cpu)), + cpu->pstate.current_pstate, + sample->mperf, + sample->aperf, + sample->freq); +#endif + + intel_pstate_set_sample_time(cpu); +} + +#define ICPU(model, policy) \ + { X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF,\ + (unsigned long)&policy } + +static const struct x86_cpu_id intel_pstate_cpu_ids[] = { + ICPU(0x2a, core_params), + ICPU(0x2d, core_params), + ICPU(0x37, byt_params), + ICPU(0x3a, core_params), + ICPU(0x3c, core_params), + ICPU(0x3d, core_params), + ICPU(0x3e, core_params), + ICPU(0x3f, core_params), + ICPU(0x45, core_params), + ICPU(0x46, core_params), + ICPU(0x4f, core_params), + ICPU(0x56, core_params), + {} +}; +MODULE_DEVICE_TABLE(x86cpu, intel_pstate_cpu_ids); + +static int intel_pstate_init_cpu(unsigned int cpunum) +{ + + const struct x86_cpu_id *id; + struct cpudata *cpu; + + id = x86_match_cpu(intel_pstate_cpu_ids); + if (!id) + return -ENODEV; + + all_cpu_data[cpunum] = kzalloc(sizeof(struct cpudata), GFP_KERNEL); + if (!all_cpu_data[cpunum]) + return -ENOMEM; + + cpu = all_cpu_data[cpunum]; + + intel_pstate_get_cpu_pstates(cpu); + if (!cpu->pstate.current_pstate) { + all_cpu_data[cpunum] = NULL; + kfree(cpu); + return -ENODATA; + } + + cpu->cpu = cpunum; + + init_timer_deferrable(&cpu->timer); + cpu->timer.function = intel_pstate_timer_func; + cpu->timer.data = + (unsigned long)cpu; + cpu->timer.expires = jiffies + HZ/100; + intel_pstate_busy_pid_reset(cpu); + intel_pstate_sample(cpu); + intel_pstate_set_pstate(cpu, cpu->pstate.max_pstate); + + add_timer_on(&cpu->timer, cpunum); + + pr_info("Intel pstate controlling: cpu %d\n", cpunum); + + return 0; +} + +static unsigned int intel_pstate_get(unsigned int cpu_num) +{ + struct sample *sample; + struct cpudata *cpu; + + cpu = all_cpu_data[cpu_num]; + if (!cpu) + return 0; + sample = &cpu->sample; + return sample->freq; +} + +static int intel_pstate_set_policy(struct cpufreq_policy *policy) +{ + struct cpudata *cpu; + + cpu = all_cpu_data[policy->cpu]; + + if (!policy->cpuinfo.max_freq) + return -ENODEV; + + if (policy->policy == CPUFREQ_POLICY_PERFORMANCE) { + limits.min_perf_pct = 100; + limits.min_perf = int_tofp(1); + limits.max_perf_pct = 100; + limits.max_perf = int_tofp(1); + limits.no_turbo = 0; + return 0; + } + limits.min_perf_pct = (policy->min * 100) / policy->cpuinfo.max_freq; + limits.min_perf_pct = clamp_t(int, limits.min_perf_pct, 0 , 100); + limits.min_perf = div_fp(int_tofp(limits.min_perf_pct), int_tofp(100)); + + limits.max_policy_pct = policy->max * 100 / policy->cpuinfo.max_freq; + limits.max_policy_pct = clamp_t(int, limits.max_policy_pct, 0 , 100); + limits.max_perf_pct = min(limits.max_policy_pct, limits.max_sysfs_pct); + limits.max_perf = div_fp(int_tofp(limits.max_perf_pct), int_tofp(100)); + + return 0; +} + +static int intel_pstate_verify_policy(struct cpufreq_policy *policy) +{ + cpufreq_verify_within_cpu_limits(policy); + + if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) && + (policy->policy != CPUFREQ_POLICY_PERFORMANCE)) + return -EINVAL; + + return 0; +} + +static int __cpuinit intel_pstate_stop_cpu(struct cpufreq_policy *policy) +{ + int cpu_num = policy->cpu; + struct cpudata *cpu = all_cpu_data[cpu_num]; + + pr_info("intel_pstate CPU %d exiting\n", cpu_num); + + del_timer_sync(&all_cpu_data[cpu_num]->timer); + intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate); + kfree(all_cpu_data[cpu_num]); + all_cpu_data[cpu_num] = NULL; + + return 0; +} + +static int __cpuinit intel_pstate_cpu_init(struct cpufreq_policy *policy) +{ + struct cpudata *cpu; + int rc; + + rc = intel_pstate_init_cpu(policy->cpu); + if (rc) + return rc; + + cpu = all_cpu_data[policy->cpu]; + + if (!limits.no_turbo && + limits.min_perf_pct == 100 && limits.max_perf_pct == 100) + policy->policy = CPUFREQ_POLICY_PERFORMANCE; + else + policy->policy = CPUFREQ_POLICY_POWERSAVE; + + policy->min = cpu->pstate.min_pstate * 100000; + policy->max = cpu->pstate.turbo_pstate * 100000; + + /* cpuinfo and default policy values */ + policy->cpuinfo.min_freq = cpu->pstate.min_pstate * 100000; + policy->cpuinfo.max_freq = cpu->pstate.turbo_pstate * 100000; + policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; + cpumask_set_cpu(policy->cpu, policy->cpus); + + return 0; +} + +static struct cpufreq_driver intel_pstate_driver = { + .flags = CPUFREQ_CONST_LOOPS, + .verify = intel_pstate_verify_policy, + .setpolicy = intel_pstate_set_policy, + .get = intel_pstate_get, + .init = intel_pstate_cpu_init, + .exit = intel_pstate_stop_cpu, + .name = "intel_pstate", + .owner = THIS_MODULE, +}; + +static void intel_pstate_exit(void) +{ + int cpu; + + sysfs_remove_group(intel_pstate_kobject, + &intel_pstate_attr_group); + kobject_put(intel_pstate_kobject); + debugfs_remove_recursive(debugfs_parent); + + cpufreq_unregister_driver(&intel_pstate_driver); + + if (!all_cpu_data) + return; + + get_online_cpus(); + for_each_online_cpu(cpu) { + if (all_cpu_data[cpu]) { + del_timer_sync(&all_cpu_data[cpu]->timer); + kfree(all_cpu_data[cpu]); + } + } + + put_online_cpus(); + vfree(all_cpu_data); +} +module_exit(intel_pstate_exit); + +static int __initdata no_load; + +static int intel_pstate_msrs_not_valid(void) +{ + /* Check that all the msr's we are using are valid. */ + u64 aperf, mperf, tmp; + + rdmsrl(MSR_IA32_APERF, aperf); + rdmsrl(MSR_IA32_MPERF, mperf); + + if (!pstate_funcs.get_max() || + !pstate_funcs.get_min() || + !pstate_funcs.get_turbo()) + return -ENODEV; + + rdmsrl(MSR_IA32_APERF, tmp); + if (!(tmp - aperf)) + return -ENODEV; + + rdmsrl(MSR_IA32_MPERF, tmp); + if (!(tmp - mperf)) + return -ENODEV; + + return 0; +} + +static void copy_pid_params(struct pstate_adjust_policy *policy) +{ + pid_params.sample_rate_ms = policy->sample_rate_ms; + pid_params.p_gain_pct = policy->p_gain_pct; + pid_params.i_gain_pct = policy->i_gain_pct; + pid_params.d_gain_pct = policy->d_gain_pct; + pid_params.deadband = policy->deadband; + pid_params.setpoint = policy->setpoint; +} + +static void copy_cpu_funcs(struct pstate_funcs *funcs) +{ + pstate_funcs.get_max = funcs->get_max; + pstate_funcs.get_min = funcs->get_min; + pstate_funcs.get_turbo = funcs->get_turbo; + pstate_funcs.set = funcs->set; + pstate_funcs.get_vid = funcs->get_vid; +} + +#if IS_ENABLED(CONFIG_ACPI) +#include + +static bool intel_pstate_no_acpi_pss(void) +{ + int i; + + for_each_possible_cpu(i) { + acpi_status status; + union acpi_object *pss; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_processor *pr = per_cpu(processors, i); + + if (!pr) + continue; + + status = acpi_evaluate_object(pr->handle, "_PSS", NULL, &buffer); + if (ACPI_FAILURE(status)) + continue; + + pss = buffer.pointer; + if (pss && pss->type == ACPI_TYPE_PACKAGE) { + kfree(pss); + return false; + } + + kfree(pss); + } + + return true; +} + +struct hw_vendor_info { + u16 valid; + char oem_id[ACPI_OEM_ID_SIZE]; + char oem_table_id[ACPI_OEM_TABLE_ID_SIZE]; +}; + +/* Hardware vendor-specific info that has its own power management modes */ +static struct hw_vendor_info vendor_info[] = { + {1, "HP ", "ProLiant"}, + {0, "", ""}, +}; + +static bool intel_pstate_platform_pwr_mgmt_exists(void) +{ + struct acpi_table_header hdr; + struct hw_vendor_info *v_info; + + if (acpi_disabled + || ACPI_FAILURE(acpi_get_table_header(ACPI_SIG_FADT, 0, &hdr))) + return false; + + for (v_info = vendor_info; v_info->valid; v_info++) { + if (!strncmp(hdr.oem_id, v_info->oem_id, ACPI_OEM_ID_SIZE) + && !strncmp(hdr.oem_table_id, v_info->oem_table_id, ACPI_OEM_TABLE_ID_SIZE) + && intel_pstate_no_acpi_pss()) + return true; + } + + return false; +} +#else /* CONFIG_ACPI not enabled */ +static inline bool intel_pstate_platform_pwr_mgmt_exists(void) { return false; } +#endif /* CONFIG_ACPI */ + +static int __init intel_pstate_init(void) +{ + int cpu, rc = 0; + const struct x86_cpu_id *id; + struct cpu_defaults *cpu_info; + + if (no_load) + return -ENODEV; + + id = x86_match_cpu(intel_pstate_cpu_ids); + if (!id) + return -ENODEV; + + /* + * The Intel pstate driver will be ignored if the platform + * firmware has its own power management modes. + */ + if (intel_pstate_platform_pwr_mgmt_exists()) + return -ENODEV; + + cpu_info = (struct cpu_defaults *)id->driver_data; + + copy_pid_params(&cpu_info->pid_policy); + copy_cpu_funcs(&cpu_info->funcs); + + if (intel_pstate_msrs_not_valid()) + return -ENODEV; + + pr_info("Intel P-state driver initializing.\n"); + + all_cpu_data = vzalloc(sizeof(void *) * num_possible_cpus()); + if (!all_cpu_data) + return -ENOMEM; + + rc = cpufreq_register_driver(&intel_pstate_driver); + if (rc) + goto out; + + intel_pstate_debug_expose_params(); + intel_pstate_sysfs_expose_params(); + + return rc; +out: + get_online_cpus(); + for_each_online_cpu(cpu) { + if (all_cpu_data[cpu]) { + del_timer_sync(&all_cpu_data[cpu]->timer); + kfree(all_cpu_data[cpu]); + } + } + + put_online_cpus(); + vfree(all_cpu_data); + return -ENODEV; +} +device_initcall(intel_pstate_init); + +#if 0 /* RHEL 6 -- intel_pstate is not built-in, so kernel param to + * disable it is not applicable and won't build as is. + */ +static int __init intel_pstate_setup(char *str) +{ + if (!str) + return -EINVAL; + + if (!strcmp(str, "disable")) + no_load = 1; + return 0; +} +early_param("intel_pstate", intel_pstate_setup); +#endif + +MODULE_AUTHOR("Dirk Brandewie "); +MODULE_DESCRIPTION("'intel_pstate' - P state driver Intel Core processors"); +MODULE_LICENSE("GPL"); diff -uprN linux-2.6.32/arch/x86/kernel/cpu/cpufreq/Kconfig linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/Kconfig --- linux-2.6.32/arch/x86/kernel/cpu/cpufreq/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/Kconfig 2015-03-10 13:24:00.000000000 -0700 @@ -10,6 +10,39 @@ if CPU_FREQ comment "CPUFreq processor drivers" +config X86_INTEL_PSTATE + tristate "Intel P state control" + depends on X86 + help + This driver provides a P state for Intel core processors. + The driver implements an internal governor and will become + the scaling driver and governor for Sandy bridge processors. + + When this driver is enabled it will become the perferred + scaling driver for Sandy bridge processors. + + Note: This driver should be built with the same settings as + the other scaling drivers configured into the system + (module/built-in) in order for the driver to register itself + as the scaling driver on the system. + + If in doubt, say N. + + +config X86_PCC_CPUFREQ + tristate "Processor Clocking Control interface driver" + depends on ACPI && ACPI_PROCESSOR + help + This driver adds support for the PCC interface. + + For details, take a look at: + . + + To compile this driver as a module, choose M here: the + module will be called pcc-cpufreq. + + If in doubt, say N. + config X86_ACPI_CPUFREQ tristate "ACPI Processor P-States driver" select CPU_FREQ_TABLE diff -uprN linux-2.6.32/arch/x86/kernel/cpu/cpufreq/Makefile linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/Makefile --- linux-2.6.32/arch/x86/kernel/cpu/cpufreq/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/Makefile 2015-03-10 13:24:00.000000000 -0700 @@ -2,8 +2,9 @@ # K8 systems. ACPI is preferred to all other hardware-specific drivers. # speedstep-* is preferred over p4-clockmod. -obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o -obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o +obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o +obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o +obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o obj-$(CONFIG_X86_LONGHAUL) += longhaul.o @@ -18,3 +19,4 @@ obj-$(CONFIG_X86_SPEEDSTEP_SMI) += spee obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o +obj-$(CONFIG_X86_INTEL_PSTATE) += intel_pstate.o diff -uprN linux-2.6.32/arch/x86/kernel/cpu/cpufreq/mperf.c linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/mperf.c --- linux-2.6.32/arch/x86/kernel/cpu/cpufreq/mperf.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/mperf.c 2015-03-10 13:22:21.000000000 -0700 @@ -0,0 +1,51 @@ +#include +#include +#include +#include +#include +#include + +#include "mperf.h" + +static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf); + +/* Called via smp_call_function_single(), on the target CPU */ +static void read_measured_perf_ctrs(void *_cur) +{ + struct aperfmperf *am = _cur; + + get_aperfmperf(am); +} + +/* + * Return the measured active (C0) frequency on this CPU since last call + * to this function. + * Input: cpu number + * Return: Average CPU frequency in terms of max frequency (zero on error) + * + * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance + * over a period of time, while CPU is in C0 state. + * IA32_MPERF counts at the rate of max advertised frequency + * IA32_APERF counts at the rate of actual CPU frequency + * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and + * no meaning should be associated with absolute values of these MSRs. + */ +unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, + unsigned int cpu) +{ + struct aperfmperf perf; + unsigned long ratio; + unsigned int retval; + + if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) + return 0; + + ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf); + per_cpu(acfreq_old_perf, cpu) = perf; + + retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; + + return retval; +} +EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf); +MODULE_LICENSE("GPL"); diff -uprN linux-2.6.32/arch/x86/kernel/cpu/cpufreq/mperf.h linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/mperf.h --- linux-2.6.32/arch/x86/kernel/cpu/cpufreq/mperf.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/mperf.h 2015-03-10 13:22:21.000000000 -0700 @@ -0,0 +1,9 @@ +/* + * (c) 2010 Advanced Micro Devices, Inc. + * Your use of this code is subject to the terms and conditions of the + * GNU general public license version 2. See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + */ + +unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, + unsigned int cpu); diff -uprN linux-2.6.32/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c --- linux-2.6.32/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c 2015-03-10 13:23:11.000000000 -0700 @@ -159,9 +159,9 @@ static unsigned int cpufreq_p4_get_frequ { if (c->x86 == 0x06) { if (cpu_has(c, X86_FEATURE_EST)) - printk(KERN_WARNING PFX "Warning: EST-capable CPU " - "detected. The acpi-cpufreq module offers " - "voltage scaling in addition of frequency " + printk_once(KERN_WARNING PFX "Warning: EST-capable " + "CPU detected. The acpi-cpufreq module offers " + "voltage scaling in addition to frequency " "scaling. You should use that instead of " "p4-clockmod, if possible.\n"); switch (c->x86_model) { @@ -179,13 +179,8 @@ static unsigned int cpufreq_p4_get_frequ } } - if (c->x86 != 0xF) { - if (!cpu_has(c, X86_FEATURE_EST)) - printk(KERN_WARNING PFX "Unknown CPU. " - "Please send an e-mail to " - "\n"); + if (c->x86 != 0xF) return 0; - } /* on P-4s, the TSC runs with constant frequency independent whether * throttling is active or not. */ diff -uprN linux-2.6.32/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c --- linux-2.6.32/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c 2015-03-10 13:24:08.000000000 -0700 @@ -0,0 +1,628 @@ +/* + * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface + * + * Copyright (C) 2009 Red Hat, Matthew Garrett + * Copyright (C) 2009 Hewlett-Packard Development Company, L.P. + * Nagananda Chumbalkar + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON + * INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 675 Mass Ave, Cambridge, MA 02139, USA. + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#define PCC_VERSION "1.00.00" +#define POLL_LOOPS 300 + +#define CMD_COMPLETE 0x1 +#define CMD_GET_FREQ 0x0 +#define CMD_SET_FREQ 0x1 + +#define BUF_SZ 4 + +#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ + "pcc-cpufreq", msg) + +struct pcc_register_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 bit_width; + u8 bit_offset; + u8 access_size; + u64 address; +} __attribute__ ((packed)); + +struct pcc_memory_resource { + u8 descriptor; + u16 length; + u8 space_id; + u8 resource_usage; + u8 type_specific; + u64 granularity; + u64 minimum; + u64 maximum; + u64 translation_offset; + u64 address_length; +} __attribute__ ((packed)); + +static struct cpufreq_driver pcc_cpufreq_driver; + +struct pcc_header { + u32 signature; + u16 length; + u8 major; + u8 minor; + u32 features; + u16 command; + u16 status; + u32 latency; + u32 minimum_time; + u32 maximum_time; + u32 nominal; + u32 throttled_frequency; + u32 minimum_frequency; +}; + +static void __iomem *pcch_virt_addr; +static struct pcc_header __iomem *pcch_hdr; + +static DEFINE_SPINLOCK(pcc_lock); + +static struct acpi_generic_address doorbell; + +static u64 doorbell_preserve; +static u64 doorbell_write; + +static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f, + 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46}; + +struct pcc_cpu { + u32 input_offset; + u32 output_offset; +}; + +static struct pcc_cpu *pcc_cpu_info; + +static int pcc_cpufreq_verify(struct cpufreq_policy *policy) +{ + cpufreq_verify_within_cpu_limits(policy); + return 0; +} + +static inline void pcc_cmd(void) +{ + u64 doorbell_value; + int i; + + acpi_read(&doorbell_value, &doorbell); + acpi_write((doorbell_value & doorbell_preserve) | doorbell_write, + &doorbell); + + for (i = 0; i < POLL_LOOPS; i++) { + if (ioread16(&pcch_hdr->status) & CMD_COMPLETE) + break; + } +} + +static inline void pcc_clear_mapping(void) +{ + if (pcch_virt_addr) + iounmap(pcch_virt_addr); + pcch_virt_addr = NULL; +} + +static unsigned int pcc_get_freq(unsigned int cpu) +{ + struct pcc_cpu *pcc_cpu_data; + unsigned int curr_freq; + unsigned int freq_limit; + u16 status; + u32 input_buffer; + u32 output_buffer; + + spin_lock(&pcc_lock); + + dprintk("get: get_freq for CPU %d\n", cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + input_buffer = 0x1; + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_GET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + output_buffer = + ioread32(pcch_virt_addr + pcc_cpu_data->output_offset); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + dprintk("get: FAILED: for CPU %d, status is %d\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff)) + / 100) * 1000); + + dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is " + "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n", + cpu, (pcch_virt_addr + pcc_cpu_data->output_offset), + output_buffer, curr_freq); + + freq_limit = (output_buffer >> 8) & 0xff; + if (freq_limit != 0xff) { + dprintk("get: frequency for cpu %d is being temporarily" + " capped at %d\n", cpu, curr_freq); + } + + spin_unlock(&pcc_lock); + return curr_freq; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return -EINVAL; +} + +static int pcc_cpufreq_target(struct cpufreq_policy *policy, + unsigned int target_freq, + unsigned int relation) +{ + struct pcc_cpu *pcc_cpu_data; + struct cpufreq_freqs freqs; + u16 status; + u32 input_buffer; + int cpu; + + spin_lock(&pcc_lock); + cpu = policy->cpu; + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + dprintk("target: CPU %d should go to target freq: %d " + "(virtual) input_offset is 0x%x\n", + cpu, target_freq, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + + freqs.new = target_freq; + freqs.cpu = cpu; + cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + + input_buffer = 0x1 | (((target_freq * 100) + / (ioread32(&pcch_hdr->nominal) * 1000)) << 8); + iowrite32(input_buffer, + (pcch_virt_addr + pcc_cpu_data->input_offset)); + iowrite16(CMD_SET_FREQ, &pcch_hdr->command); + + pcc_cmd(); + + /* Clear the input buffer - we are done with the current command */ + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + + status = ioread16(&pcch_hdr->status); + if (status != CMD_COMPLETE) { + dprintk("target: FAILED for cpu %d, with status: 0x%x\n", + cpu, status); + goto cmd_incomplete; + } + iowrite16(0, &pcch_hdr->status); + + cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + dprintk("target: was SUCCESSFUL for cpu %d\n", cpu); + spin_unlock(&pcc_lock); + + return 0; + +cmd_incomplete: + iowrite16(0, &pcch_hdr->status); + spin_unlock(&pcc_lock); + return -EINVAL; +} + +static int pcc_get_offset(int cpu) +{ + acpi_status status; + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object *pccp, *offset; + struct pcc_cpu *pcc_cpu_data; + struct acpi_processor *pr; + int ret = 0; + + pr = per_cpu(processors, cpu); + pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu); + + if (!pr) + return -ENODEV; + + status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; + + pccp = buffer.pointer; + if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + }; + + offset = &(pccp->package.elements[0]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->input_offset = offset->integer.value; + + offset = &(pccp->package.elements[1]); + if (!offset || offset->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto out_free; + } + + pcc_cpu_data->output_offset = offset->integer.value; + + memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ); + memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ); + + dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data " + "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n", + cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset); +out_free: + kfree(buffer.pointer); + return ret; +} + +static int __init pcc_cpufreq_do_osc(acpi_handle *handle) +{ + acpi_status status; + struct acpi_object_list input; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + union acpi_object in_params[4]; + union acpi_object *out_obj; + u32 capabilities[2]; + u32 errors; + u32 supported; + int ret = 0; + + input.count = 4; + input.pointer = in_params; + input.count = 4; + input.pointer = in_params; + in_params[0].type = ACPI_TYPE_BUFFER; + in_params[0].buffer.length = 16; + in_params[0].buffer.pointer = OSC_UUID; + in_params[1].type = ACPI_TYPE_INTEGER; + in_params[1].integer.value = 1; + in_params[2].type = ACPI_TYPE_INTEGER; + in_params[2].integer.value = 2; + in_params[3].type = ACPI_TYPE_BUFFER; + in_params[3].buffer.length = 8; + in_params[3].buffer.pointer = (u8 *)&capabilities; + + capabilities[0] = OSC_QUERY_ENABLE; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + + kfree(output.pointer); + capabilities[0] = 0x0; + capabilities[1] = 0x1; + + status = acpi_evaluate_object(*handle, "_OSC", &input, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + if (!output.length) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); + if (errors) { + ret = -ENODEV; + goto out_free; + } + + supported = *((u32 *)(out_obj->buffer.pointer + 4)); + if (!(supported & 0x1)) { + ret = -ENODEV; + goto out_free; + } + +out_free: + kfree(output.pointer); + return ret; +} + +static int __init pcc_cpufreq_probe(void) +{ + acpi_status status; + struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; + struct pcc_memory_resource *mem_resource; + struct pcc_register_resource *reg_resource; + union acpi_object *out_obj, *member; + acpi_handle handle, osc_handle, pcch_handle; + int ret = 0; + + status = acpi_get_handle(NULL, "\\_SB", &handle); + if (ACPI_FAILURE(status)) + return -ENODEV; + + status = acpi_get_handle(handle, "PCCH", &pcch_handle); + if (ACPI_FAILURE(status)) + return -ENODEV; + + status = acpi_get_handle(handle, "_OSC", &osc_handle); + if (ACPI_SUCCESS(status)) { + ret = pcc_cpufreq_do_osc(&osc_handle); + if (ret) + dprintk("probe: _OSC evaluation did not succeed\n"); + /* Firmware's use of _OSC is optional */ + ret = 0; + } + + status = acpi_evaluate_object(handle, "PCCH", NULL, &output); + if (ACPI_FAILURE(status)) + return -ENODEV; + + out_obj = output.pointer; + if (out_obj->type != ACPI_TYPE_PACKAGE) { + ret = -ENODEV; + goto out_free; + } + + member = &out_obj->package.elements[0]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto out_free; + } + + mem_resource = (struct pcc_memory_resource *)member->buffer.pointer; + + dprintk("probe: mem_resource descriptor: 0x%x," + " length: %d, space_id: %d, resource_usage: %d," + " type_specific: %d, granularity: 0x%llx," + " minimum: 0x%llx, maximum: 0x%llx," + " translation_offset: 0x%llx, address_length: 0x%llx\n", + mem_resource->descriptor, mem_resource->length, + mem_resource->space_id, mem_resource->resource_usage, + mem_resource->type_specific, mem_resource->granularity, + mem_resource->minimum, mem_resource->maximum, + mem_resource->translation_offset, + mem_resource->address_length); + + if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) { + ret = -ENODEV; + goto out_free; + } + + pcch_virt_addr = ioremap_nocache(mem_resource->minimum, + mem_resource->address_length); + if (pcch_virt_addr == NULL) { + dprintk("probe: could not map shared mem region\n"); + goto out_free; + } + pcch_hdr = pcch_virt_addr; + + dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr); + dprintk("probe: PCCH header is at physical address: 0x%llx," + " signature: 0x%x, length: %d bytes, major: %d, minor: %d," + " supported features: 0x%x, command field: 0x%x," + " status field: 0x%x, nominal latency: %d us\n", + mem_resource->minimum, ioread32(&pcch_hdr->signature), + ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major), + ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features), + ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status), + ioread32(&pcch_hdr->latency)); + + dprintk("probe: min time between commands: %d us," + " max time between commands: %d us," + " nominal CPU frequency: %d MHz," + " minimum CPU frequency: %d MHz," + " minimum CPU frequency without throttling: %d MHz\n", + ioread32(&pcch_hdr->minimum_time), + ioread32(&pcch_hdr->maximum_time), + ioread32(&pcch_hdr->nominal), + ioread32(&pcch_hdr->throttled_frequency), + ioread32(&pcch_hdr->minimum_frequency)); + + member = &out_obj->package.elements[1]; + if (member->type != ACPI_TYPE_BUFFER) { + ret = -ENODEV; + goto pcch_free; + } + + reg_resource = (struct pcc_register_resource *)member->buffer.pointer; + + doorbell.space_id = reg_resource->space_id; + doorbell.bit_width = reg_resource->bit_width; + doorbell.bit_offset = reg_resource->bit_offset; + doorbell.access_width = 64; + doorbell.address = reg_resource->address; + + dprintk("probe: doorbell: space_id is %d, bit_width is %d, " + "bit_offset is %d, access_width is %d, address is 0x%llx\n", + doorbell.space_id, doorbell.bit_width, doorbell.bit_offset, + doorbell.access_width, reg_resource->address); + + member = &out_obj->package.elements[2]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_preserve = member->integer.value; + + member = &out_obj->package.elements[3]; + if (member->type != ACPI_TYPE_INTEGER) { + ret = -ENODEV; + goto pcch_free; + } + + doorbell_write = member->integer.value; + + dprintk("probe: doorbell_preserve: 0x%llx," + " doorbell_write: 0x%llx\n", + doorbell_preserve, doorbell_write); + + pcc_cpu_info = alloc_percpu(struct pcc_cpu); + if (!pcc_cpu_info) { + ret = -ENOMEM; + goto pcch_free; + } + + printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency" + " limits: %d MHz, %d MHz\n", PCC_VERSION, + ioread32(&pcch_hdr->minimum_frequency), + ioread32(&pcch_hdr->nominal)); + kfree(output.pointer); + return ret; +pcch_free: + pcc_clear_mapping(); +out_free: + kfree(output.pointer); + return ret; +} + +static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) +{ + unsigned int cpu = policy->cpu; + unsigned int result = 0; + + if (!pcch_virt_addr) { + result = -1; + goto out; + } + + result = pcc_get_offset(cpu); + if (result) { + dprintk("init: PCCP evaluation failed\n"); + goto out; + } + + policy->max = policy->cpuinfo.max_freq = + ioread32(&pcch_hdr->nominal) * 1000; + policy->min = policy->cpuinfo.min_freq = + ioread32(&pcch_hdr->minimum_frequency) * 1000; + policy->cur = pcc_get_freq(cpu); + + if (!policy->cur) { + dprintk("init: Unable to get current CPU frequency\n"); + result = -EINVAL; + goto out; + } + + dprintk("init: policy->max is %d, policy->min is %d\n", + policy->max, policy->min); +out: + return result; +} + +static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy) +{ + return 0; +} + +static struct cpufreq_driver pcc_cpufreq_driver = { + .flags = CPUFREQ_CONST_LOOPS, + .get = pcc_get_freq, + .verify = pcc_cpufreq_verify, + .target = pcc_cpufreq_target, + .init = pcc_cpufreq_cpu_init, + .exit = pcc_cpufreq_cpu_exit, + .name = "pcc-cpufreq", + .owner = THIS_MODULE, +}; + +static int __init pcc_cpufreq_init(void) +{ + int ret; + + if (acpi_disabled) + return 0; + + ret = pcc_cpufreq_probe(); + if (ret) { + dprintk("pcc_cpufreq_init: PCCH evaluation failed\n"); + return ret; + } + + ret = cpufreq_register_driver(&pcc_cpufreq_driver); + + return ret; +} + +static void __exit pcc_cpufreq_exit(void) +{ + cpufreq_unregister_driver(&pcc_cpufreq_driver); + + pcc_clear_mapping(); + + free_percpu(pcc_cpu_info); +} + +MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar"); +MODULE_VERSION(PCC_VERSION); +MODULE_DESCRIPTION("Processor Clocking Control interface driver"); +MODULE_LICENSE("GPL"); + +late_initcall(pcc_cpufreq_init); +module_exit(pcc_cpufreq_exit); diff -uprN linux-2.6.32/arch/x86/kernel/cpu/cpufreq/powernow-k8.c linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/powernow-k8.c --- linux-2.6.32/arch/x86/kernel/cpu/cpufreq/powernow-k8.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/powernow-k8.c 2015-03-10 13:22:55.000000000 -0700 @@ -1,6 +1,5 @@ - /* - * (c) 2003-2006 Advanced Micro Devices, Inc. + * (c) 2003-2010 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html @@ -46,6 +45,7 @@ #define PFX "powernow-k8: " #define VERSION "version 2.20.00" #include "powernow-k8.h" +#include "mperf.h" /* serialize freq changes */ static DEFINE_MUTEX(fidvid_mutex); @@ -54,6 +54,15 @@ static DEFINE_PER_CPU(struct powernow_k8 static int cpu_family = CPU_OPTERON; +/* array to map SW pstate number to acpi state */ +static u32 ps_to_as[8]; + +/* core performance boost */ +static bool cpb_capable, cpb_enabled; +static struct msr __percpu *msrs; + +static struct cpufreq_driver cpufreq_amd64_driver; + #ifndef CONFIG_SMP static inline const struct cpumask *cpu_core_mask(int cpu) { @@ -76,7 +85,7 @@ static u32 find_khz_freq_from_fid(u32 fi static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, u32 pstate) { - return data[pstate].frequency; + return data[ps_to_as[pstate]].frequency; } /* Return the vco fid for an input fid @@ -919,22 +928,27 @@ static int fill_powernow_table_pstate(st invalidate_entry(powernow_table, i); continue; } - rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); - if (!(hi & HW_PSTATE_VALID_MASK)) { - dprintk("invalid pstate %d, ignoring\n", index); - invalidate_entry(powernow_table, i); - continue; - } - powernow_table[i].index = index; + ps_to_as[index] = i; /* Frequency may be rounded for these */ - if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) { + if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10) + || boot_cpu_data.x86 == 0x11) { + + rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi); + if (!(hi & HW_PSTATE_VALID_MASK)) { + dprintk("invalid pstate %d, ignoring\n", index); + invalidate_entry(powernow_table, i); + continue; + } + powernow_table[i].frequency = freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7); } else powernow_table[i].frequency = data->acpi_data.states[i].core_frequency * 1000; + + powernow_table[i].index = index; } return 0; } @@ -1016,13 +1030,12 @@ static int get_transition_latency(struct } if (max_latency == 0) { /* - * Fam 11h always returns 0 as transition latency. - * This is intended and means "very fast". While cpufreq core - * and governors currently can handle that gracefully, better - * set it to 1 to avoid problems in the future. - * For all others it's a BIOS bug. + * Fam 11h and later may return 0 as transition latency. This + * is intended and means "very fast". While cpufreq core and + * governors currently can handle that gracefully, better set it + * to 1 to avoid problems in the future. */ - if (boot_cpu_data.x86 != 0x11) + if (boot_cpu_data.x86 < 0x11) printk(KERN_ERR FW_WARN PFX "Invalid zero transition " "latency\n"); max_latency = 1; @@ -1174,7 +1187,8 @@ static int powernowk8_target(struct cpuf powernow_k8_acpi_pst_values(data, newstate); if (cpu_family == CPU_HW_PSTATE) - ret = transition_frequency_pstate(data, newstate); + ret = transition_frequency_pstate(data, + data->powernow_table[newstate].index); else ret = transition_frequency_fidvid(data, newstate); if (ret) { @@ -1187,7 +1201,7 @@ static int powernowk8_target(struct cpuf if (cpu_family == CPU_HW_PSTATE) pol->cur = find_khz_freq_from_pstate(data->powernow_table, - newstate); + data->powernow_table[newstate].index); else pol->cur = find_khz_freq_from_fid(data->currfid); ret = 0; @@ -1243,6 +1257,7 @@ static int __cpuinit powernowk8_cpu_init struct powernow_k8_data *data; struct init_on_cpu init_on_cpu; int rc; + struct cpuinfo_x86 *c = &cpu_data(pol->cpu); if (!cpu_online(pol->cpu)) return -ENODEV; @@ -1317,6 +1332,10 @@ static int __cpuinit powernowk8_cpu_init return -EINVAL; } + /* Check for APERF/MPERF support in hardware */ + if (cpu_has(c, X86_FEATURE_APERFMPERF)) + cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf; + cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); if (cpu_family == CPU_HW_PSTATE) @@ -1351,6 +1370,7 @@ static int __devexit powernowk8_cpu_exit kfree(data->powernow_table); kfree(data); + per_cpu(powernow_data, pol->cpu) = NULL; return 0; } @@ -1370,7 +1390,7 @@ static unsigned int powernowk8_get(unsig int err; if (!data) - return -EINVAL; + return 0; smp_call_function_single(cpu, query_values_on_cpu, &err, true); if (err) @@ -1387,8 +1407,77 @@ out: return khz; } +static void _cpb_toggle_msrs(bool t) +{ + int cpu; + + get_online_cpus(); + + rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); + + for_each_cpu(cpu, cpu_online_mask) { + struct msr *reg = per_cpu_ptr(msrs, cpu); + if (t) + reg->l &= ~BIT(25); + else + reg->l |= BIT(25); + } + wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); + + put_online_cpus(); +} + +/* + * Switch on/off core performance boosting. + * + * 0=disable + * 1=enable. + */ +static void cpb_toggle(bool t) +{ + if (!cpb_capable) + return; + + if (t && !cpb_enabled) { + cpb_enabled = true; + _cpb_toggle_msrs(t); + printk(KERN_INFO PFX "Core Boosting enabled.\n"); + } else if (!t && cpb_enabled) { + cpb_enabled = false; + _cpb_toggle_msrs(t); + printk(KERN_INFO PFX "Core Boosting disabled.\n"); + } +} + +static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf, + size_t count) +{ + int ret = -EINVAL; + unsigned long val = 0; + + ret = strict_strtoul(buf, 10, &val); + if (!ret && (val == 0 || val == 1) && cpb_capable) + cpb_toggle(val); + else + return -EINVAL; + + return count; +} + +static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf) +{ + return sprintf(buf, "%u\n", cpb_enabled); +} + +#define define_one_rw(_name) \ +static struct freq_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +define_one_rw(cpb); + static struct freq_attr *powernow_k8_attr[] = { &cpufreq_freq_attr_scaling_available_freqs, + &cpb, NULL, }; @@ -1403,10 +1492,52 @@ static struct cpufreq_driver cpufreq_amd .attr = powernow_k8_attr, }; +/* + * Clear the boost-disable flag on the CPU_DOWN path so that this cpu + * cannot block the remaining ones from boosting. On the CPU_UP path we + * simply keep the boost-disable flag in sync with the current global + * state. + */ +static int cpb_notify(struct notifier_block *nb, unsigned long action, + void *hcpu) +{ + unsigned cpu = (long)hcpu; + u32 lo, hi; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + + if (!cpb_enabled) { + rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); + lo |= BIT(25); + wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); + } + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); + lo &= ~BIT(25); + wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block cpb_nb = { + .notifier_call = cpb_notify, +}; + /* driver entry point for init */ static int __cpuinit powernowk8_init(void) { - unsigned int i, supported_cpus = 0; + unsigned int i, supported_cpus = 0, cpu; + int rv; for_each_online_cpu(i) { int rc; @@ -1415,15 +1546,42 @@ static int __cpuinit powernowk8_init(voi supported_cpus++; } - if (supported_cpus == num_online_cpus()) { - printk(KERN_INFO PFX "Found %d %s " - "processors (%d cpu cores) (" VERSION ")\n", - num_online_nodes(), - boot_cpu_data.x86_model_id, supported_cpus); - return cpufreq_register_driver(&cpufreq_amd64_driver); + if (supported_cpus != num_online_cpus()) + return -ENODEV; + + printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n", + num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus); + + if (boot_cpu_has(X86_FEATURE_CPB)) { + + cpb_capable = true; + + register_cpu_notifier(&cpb_nb); + + msrs = msrs_alloc(); + if (!msrs) { + printk(KERN_ERR "%s: Error allocating msrs!\n", __func__); + return -ENOMEM; + } + + rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); + + for_each_cpu(cpu, cpu_online_mask) { + struct msr *reg = per_cpu_ptr(msrs, cpu); + cpb_enabled |= !(!!(reg->l & BIT(25))); + } + + printk(KERN_INFO PFX "Core Performance Boosting: %s.\n", + (cpb_enabled ? "on" : "off")); } - return -ENODEV; + rv = cpufreq_register_driver(&cpufreq_amd64_driver); + if (rv < 0 && boot_cpu_has(X86_FEATURE_CPB)) { + unregister_cpu_notifier(&cpb_nb); + msrs_free(msrs); + msrs = NULL; + } + return rv; } /* driver entry point for term */ @@ -1431,6 +1589,13 @@ static void __exit powernowk8_exit(void) { dprintk("exit\n"); + if (boot_cpu_has(X86_FEATURE_CPB)) { + msrs_free(msrs); + msrs = NULL; + + unregister_cpu_notifier(&cpb_nb); + } + cpufreq_unregister_driver(&cpufreq_amd64_driver); } diff -uprN linux-2.6.32/arch/x86/kernel/cpu/cpufreq/powernow-k8.h linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/powernow-k8.h --- linux-2.6.32/arch/x86/kernel/cpu/cpufreq/powernow-k8.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/cpufreq/powernow-k8.h 2015-03-10 13:22:21.000000000 -0700 @@ -5,7 +5,6 @@ * http://www.gnu.org/licenses/gpl.html */ - enum pstate { HW_PSTATE_INVALID = 0xff, HW_PSTATE_0 = 0, @@ -55,7 +54,6 @@ struct powernow_k8_data { struct cpumask *available_cores; }; - /* processor's cpuid instruction support */ #define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ #define CPUID_XFAM 0x0ff00000 /* extended family */ diff -uprN linux-2.6.32/arch/x86/kernel/cpu/cpu.h linux-2.6.32.ovz/arch/x86/kernel/cpu/cpu.h --- linux-2.6.32/arch/x86/kernel/cpu/cpu.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/cpu.h 2015-03-10 13:22:47.000000000 -0700 @@ -18,6 +18,7 @@ struct cpu_dev { struct cpu_model_info c_models[4]; void (*c_early_init)(struct cpuinfo_x86 *); + void (*c_bsp_init)(struct cpuinfo_x86 *); void (*c_init)(struct cpuinfo_x86 *); void (*c_identify)(struct cpuinfo_x86 *); unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int); @@ -32,6 +33,6 @@ struct cpu_dev { extern const struct cpu_dev *const __x86_cpu_dev_start[], *const __x86_cpu_dev_end[]; -extern void display_cacheinfo(struct cpuinfo_x86 *c); +extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); #endif diff -uprN linux-2.6.32/arch/x86/kernel/cpu/cyrix.c linux-2.6.32.ovz/arch/x86/kernel/cpu/cyrix.c --- linux-2.6.32/arch/x86/kernel/cpu/cyrix.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/cyrix.c 2015-03-10 13:23:16.000000000 -0700 @@ -373,7 +373,7 @@ static void __cpuinit init_nsc(struct cp /* Handle the GX (Formally known as the GX2) */ if (c->x86 == 5 && c->x86_model == 5) - display_cacheinfo(c); + cpu_detect_cache_sizes(c); else init_cyrix(c); } diff -uprN linux-2.6.32/arch/x86/kernel/cpu/hypervisor.c linux-2.6.32.ovz/arch/x86/kernel/cpu/hypervisor.c --- linux-2.6.32/arch/x86/kernel/cpu/hypervisor.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/hypervisor.c 2015-03-10 13:23:54.000000000 -0700 @@ -21,37 +21,63 @@ * */ +#include #include -#include #include -static inline void __cpuinit -detect_hypervisor_vendor(struct cpuinfo_x86 *c) +/* + * Hypervisor detect order. This is specified explicitly here because + * some hypervisors might implement compatibility modes for other + * hypervisors and therefore need to be detected in specific sequence. + */ +static const __initconst struct hypervisor_x86 * const hypervisors[] = { - if (vmware_platform()) - c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; - else - c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; -} + &x86_hyper_vmware, + &x86_hyper_ms_hyperv, + &x86_hyper_kvm, +}; -static inline void __cpuinit -hypervisor_set_feature_bits(struct cpuinfo_x86 *c) +const struct hypervisor_x86 *x86_hyper; +EXPORT_SYMBOL(x86_hyper); + +static inline void __init +detect_hypervisor_vendor(void) { - if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) { - vmware_set_feature_bits(c); - return; + const struct hypervisor_x86 *h, * const *p; + + for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { + h = *p; + if (h->detect()) { + x86_hyper = h; + printk(KERN_INFO "Hypervisor detected: %s\n", h->name); + break; + } } } void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) { - detect_hypervisor_vendor(c); - hypervisor_set_feature_bits(c); + if (x86_hyper && x86_hyper->set_cpu_features) + x86_hyper->set_cpu_features(c); } void __init init_hypervisor_platform(void) { + + detect_hypervisor_vendor(); + + if (!x86_hyper) + return; + init_hypervisor(&boot_cpu_data); - if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) - vmware_platform_setup(); + + if (x86_hyper->init_platform) + x86_hyper->init_platform(); +} + +bool __init hypervisor_x2apic_available(void) +{ + return x86_hyper && + x86_hyper->x2apic_available && + x86_hyper->x2apic_available(); } diff -uprN linux-2.6.32/arch/x86/kernel/cpu/intel.c linux-2.6.32.ovz/arch/x86/kernel/cpu/intel.c --- linux-2.6.32/arch/x86/kernel/cpu/intel.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/intel.c 2015-06-23 04:16:16.000000000 -0700 @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -30,10 +29,10 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) { + u64 misc_enable; + /* Unmask CPUID levels if masked: */ if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { - u64 misc_enable; - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { @@ -47,6 +46,36 @@ static void __cpuinit early_init_intel(s (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) { + unsigned lower_word; + + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + /* Required by the SDM */ + sync_core(); + rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); + } + + /* + * Atom erratum AAE44/AAF40/AAG38/AAH41: + * + * A race condition between speculative fetches and invalidating + * a large page. This is worked around in microcode, but we + * need the microcode to have already been loaded... so if it is + * not, recommend a BIOS update and disable large pages. + */ + if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2) { + u32 ucode, junk; + + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + sync_core(); + rdmsr(MSR_IA32_UCODE_REV, junk, ucode); + + if (ucode < 0x20e) { + printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); + clear_cpu_cap(c, X86_FEATURE_PSE); + } + } + #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #else @@ -70,8 +99,8 @@ static void __cpuinit early_init_intel(s if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); - sched_clock_stable = 1; + if (!check_tsc_unstable()) + sched_clock_stable = 1; } /* @@ -97,8 +126,6 @@ static void __cpuinit early_init_intel(s * (model 2) with the same problem. */ if (c->x86 == 15) { - u64 misc_enable; - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { @@ -109,6 +136,19 @@ static void __cpuinit early_init_intel(s } } #endif + + /* + * If fast string is not enabled in IA32_MISC_ENABLE for any reason, + * clear the fast string and enhanced fast string CPU capabilities. + */ + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { + rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) { + printk(KERN_INFO "Disabled fast string operations\n"); + setup_clear_cpu_cap(X86_FEATURE_REP_GOOD); + setup_clear_cpu_cap(X86_FEATURE_ERMS); + } + } } #ifdef CONFIG_X86_32 @@ -266,8 +306,6 @@ static void __cpuinit srat_detect_node(s if (node == NUMA_NO_NODE || !node_online(node)) node = first_node(node_online_map); numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); #endif } @@ -327,6 +365,31 @@ static void __cpuinit detect_vmx_virtcap } } +static void intel_set_cpuid_faulting(bool enable) +{ + unsigned int l1, l2; + + rdmsr(MSR_MISC_FEATURES_ENABLES, l1, l2); + l1 &= ~1; + if (enable) + l1 |= 1; + wrmsr(MSR_MISC_FEATURES_ENABLES, l1, l2); +} + +static void __cpuinit intel_cpuid_faulting_init(struct cpuinfo_x86 *c) +{ + unsigned int l1, l2; + + if (rdmsr_safe(MSR_PLATFORM_INFO, &l1, &l2) != 0 || + !(l1 & (1 << 31))) + return; + + set_cpu_cap(c, X86_FEATURE_CPUID_FAULTING); + set_cpuid_faulting_cb = intel_set_cpuid_faulting; + + intel_set_cpuid_faulting(false); +} + static void __cpuinit init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; @@ -350,12 +413,6 @@ static void __cpuinit init_intel(struct set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } - if (c->cpuid_level > 6) { - unsigned ecx = cpuid_ecx(6); - if (ecx & 0x01) - set_cpu_cap(c, X86_FEATURE_APERFMPERF); - } - if (cpu_has_xmm2) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); if (cpu_has_ds) { @@ -365,7 +422,6 @@ static void __cpuinit init_intel(struct set_cpu_cap(c, X86_FEATURE_BTS); if (!(l1 & (1<<12))) set_cpu_cap(c, X86_FEATURE_PEBS); - ds_init_intel(c); } if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) @@ -434,6 +490,8 @@ static void __cpuinit init_intel(struct if (cpu_has(c, X86_FEATURE_VMX)) detect_vmx_virtcap(c); + + intel_cpuid_faulting_init(c); } #ifdef CONFIG_X86_32 diff -uprN linux-2.6.32/arch/x86/kernel/cpu/intel_cacheinfo.c linux-2.6.32.ovz/arch/x86/kernel/cpu/intel_cacheinfo.c --- linux-2.6.32/arch/x86/kernel/cpu/intel_cacheinfo.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/intel_cacheinfo.c 2015-03-10 13:23:10.000000000 -0700 @@ -17,7 +17,8 @@ #include #include -#include +#include +#include #define LVL_1_INST 1 #define LVL_1_DATA 2 @@ -94,7 +95,7 @@ static const struct _cache_table __cpuin { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ - { 0xd7, LVL_3, 2038 }, /* 8-way set assoc, 64 byte line size */ + { 0xd7, LVL_3, 2048 }, /* 8-way set assoc, 64 byte line size */ { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ @@ -102,6 +103,9 @@ static const struct _cache_table __cpuin { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ + { 0xea, LVL_3, 12288 }, /* 24-way set assoc, 64 byte line size */ + { 0xeb, LVL_3, 18432 }, /* 24-way set assoc, 64 byte line size */ + { 0xec, LVL_3, 24576 }, /* 24-way set assoc, 64 byte line size */ { 0x00, 0, 0} }; @@ -142,12 +146,18 @@ union _cpuid4_leaf_ecx { u32 full; }; +struct amd_l3_cache { + struct amd_northbridge *nb; + unsigned indices; + u8 subcaches[4]; +}; + struct _cpuid4_info { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - unsigned long can_disable; + struct amd_l3_cache *l3; DECLARE_BITMAP(shared_cpu_map, NR_CPUS); }; @@ -157,7 +167,7 @@ struct _cpuid4_info_regs { union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - unsigned long can_disable; + struct amd_l3_cache *l3; }; unsigned short num_cache_leaves; @@ -287,35 +297,273 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_ (ebx->split.ways_of_associativity + 1) - 1; } -static void __cpuinit -amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +struct _cache_attr { + struct attribute attr; + ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int); + ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count, + unsigned int); +}; + +#ifdef CONFIG_AMD_NB + +/* + * L3 cache descriptors + */ +static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) { - if (index < 3) - return; + unsigned int sc0, sc1, sc2, sc3; + u32 val = 0; - if (boot_cpu_data.x86 == 0x11) - return; + pci_read_config_dword(l3->nb->misc, 0x1C4, &val); + + /* calculate subcache sizes */ + l3->subcaches[0] = sc0 = !(val & BIT(0)); + l3->subcaches[1] = sc1 = !(val & BIT(4)); + + if (boot_cpu_data.x86 == 0x15) { + l3->subcaches[0] = sc0 += !(val & BIT(1)); + l3->subcaches[1] = sc1 += !(val & BIT(5)); + } + + l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); + l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); - /* see erratum #382 */ - if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) + l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; +} + +static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) +{ + static struct amd_l3_cache *__cpuinitdata l3_caches; + int node; + + /* only for L3, and not in virtualized environments */ + if (index < 3 || amd_nb_num() == 0) return; - this_leaf->can_disable = 1; + /* + * Strictly speaking, the amount in @size below is leaked since it is + * never freed but this is done only on shutdown so it doesn't matter. + */ + if (!l3_caches) { + int size = amd_nb_num() * sizeof(struct amd_l3_cache); + + l3_caches = kzalloc(size, GFP_ATOMIC); + if (!l3_caches) + return; + } + + node = amd_get_nb_id(smp_processor_id()); + + if (!l3_caches[node].nb) { + l3_caches[node].nb = node_to_amd_nb(node); + amd_calc_l3_indices(&l3_caches[node]); + } + + this_leaf->l3 = &l3_caches[node]; +} + +/* + * check whether a slot used for disabling an L3 index is occupied. + * @l3: L3 cache descriptor + * @slot: slot number (0..1) + * + * @returns: the disabled index if used or negative value if slot free. + */ +int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) +{ + unsigned int reg = 0; + + pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, ®); + + /* check whether this slot is activated already */ + if (reg & (3UL << 30)) + return reg & 0xfff; + + return -1; +} + +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, + unsigned int slot) +{ + int index; + + if (!this_leaf->l3 || + !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + return -EINVAL; + + index = amd_get_l3_disable_slot(this_leaf->l3, slot); + if (index >= 0) + return sprintf(buf, "%d\n", index); + + return sprintf(buf, "FREE\n"); +} + +#define SHOW_CACHE_DISABLE(slot) \ +static ssize_t \ +show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \ + unsigned int cpu) \ +{ \ + return show_cache_disable(this_leaf, buf, slot); \ +} +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, + unsigned slot, unsigned long idx) +{ + int i; + + idx |= BIT(30); + + /* + * disable index in all 4 subcaches + */ + for (i = 0; i < 4; i++) { + u32 reg = idx | (i << 20); + + if (!l3->subcaches[i]) + continue; + + pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); + + /* + * We need to WBINVD on a core on the node containing the L3 + * cache which indices we disable therefore a simple wbinvd() + * is not sufficient. + */ + wbinvd_on_cpu(cpu); + + reg |= BIT(31); + pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); + } +} + +/* + * disable a L3 cache index by using a disable-slot + * + * @l3: L3 cache descriptor + * @cpu: A CPU on the node containing the L3 cache + * @slot: slot number (0..1) + * @index: index to disable + * + * @return: 0 on success, error status on failure + */ +int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, + unsigned long index) +{ + int ret = 0; + + /* check if @slot is already used or the index is already disabled */ + ret = amd_get_l3_disable_slot(l3, slot); + if (ret >= 0) + return -EINVAL; + + if (index > l3->indices) + return -EINVAL; + + /* check whether the other slot has disabled the same index already */ + if (index == amd_get_l3_disable_slot(l3, !slot)) + return -EINVAL; + + amd_l3_disable_index(l3, cpu, slot, index); + + return 0; +} + +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, + unsigned int slot) +{ + unsigned long val = 0; + int cpu, err = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!this_leaf->l3 || + !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + return -EINVAL; + + cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + + if (strict_strtoul(buf, 10, &val) < 0) + return -EINVAL; + + err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val); + if (err) { + if (err == -EEXIST) + printk(KERN_WARNING "L3 disable slot %d in use!\n", + slot); + return err; + } + return count; +} + +#define STORE_CACHE_DISABLE(slot) \ +static ssize_t \ +store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ + const char *buf, size_t count, \ + unsigned int cpu) \ +{ \ + return store_cache_disable(this_leaf, buf, count, slot); \ +} +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + +static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, + show_cache_disable_0, store_cache_disable_0); +static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, + show_cache_disable_1, store_cache_disable_1); + +static ssize_t +show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) +{ + if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return -EINVAL; + + return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); +} + +static ssize_t +store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, + unsigned int cpu) +{ + unsigned long val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return -EINVAL; + + if (strict_strtoul(buf, 16, &val) < 0) + return -EINVAL; + + if (amd_set_subcaches(cpu, val)) + return -EINVAL; + + return count; } +static struct _cache_attr subcaches = + __ATTR(subcaches, 0644, show_subcaches, store_subcaches); + +#else /* CONFIG_AMD_NB */ +#define amd_init_l3_cache(x, y) +#endif /* CONFIG_AMD_NB */ + static int __cpuinit cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) { - union _cpuid4_leaf_eax eax; - union _cpuid4_leaf_ebx ebx; - union _cpuid4_leaf_ecx ecx; + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; unsigned edx; if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { amd_cpuid4(index, &eax, &ebx, &ecx); - if (boot_cpu_data.x86 >= 0x10) - amd_check_l3_disable(index, this_leaf); + amd_init_l3_cache(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); } @@ -488,22 +736,6 @@ unsigned int __cpuinit init_intel_cachei #endif } - if (trace) - printk(KERN_INFO "CPU: Trace cache: %dK uops", trace); - else if (l1i) - printk(KERN_INFO "CPU: L1 I cache: %dK", l1i); - - if (l1d) - printk(KERN_CONT ", L1 D cache: %dK\n", l1d); - else - printk(KERN_CONT "\n"); - - if (l2) - printk(KERN_INFO "CPU: L2 cache: %dK\n", l2); - - if (l3) - printk(KERN_INFO "CPU: L3 cache: %dK\n", l3); - c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); return l2; @@ -516,25 +748,55 @@ static DEFINE_PER_CPU(struct _cpuid4_inf #define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) #ifdef CONFIG_SMP -static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) + +static int __cpuinit cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) { - struct _cpuid4_info *this_leaf, *sibling_leaf; - unsigned long num_threads_sharing; - int index_msb, i; + struct _cpuid4_info *this_leaf; + int ret, i, sibling; struct cpuinfo_x86 *c = &cpu_data(cpu); - if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { - struct cpuinfo_x86 *d; - for_each_online_cpu(i) { + ret = 0; + if (index == 3) { + ret = 1; + for_each_cpu(i, c->llc_shared_map) { if (!per_cpu(cpuid4_info, i)) continue; - d = &cpu_data(i); this_leaf = CPUID4_INFO_IDX(i, index); - cpumask_copy(to_cpumask(this_leaf->shared_cpu_map), - d->llc_shared_map); + for_each_cpu(sibling, c->llc_shared_map) { + if (!cpu_online(sibling)) + continue; + set_bit(sibling, this_leaf->shared_cpu_map); + } } - return; + } else if ((c->x86 == 0x15) && ((index == 1) || (index == 2))) { + ret = 1; + for_each_cpu(i, cpu_sibling_mask(cpu)) { + if (!per_cpu(cpuid4_info, i)) + continue; + this_leaf = CPUID4_INFO_IDX(i, index); + for_each_cpu(sibling, cpu_sibling_mask(cpu)) { + if (!cpu_online(sibling)) + continue; + set_bit(sibling, this_leaf->shared_cpu_map); + } + } + } + + return ret; +} + +static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) +{ + struct _cpuid4_info *this_leaf, *sibling_leaf; + unsigned long num_threads_sharing; + int index_msb, i; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86_vendor == X86_VENDOR_AMD) { + if (cache_shared_amd_cpu_map_setup(cpu, index)) + return; } + this_leaf = CPUID4_INFO_IDX(cpu, index); num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; @@ -660,8 +922,8 @@ static DEFINE_PER_CPU(struct _index_kobj #define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) #define show_one_plus(file_name, object, val) \ -static ssize_t show_##file_name \ - (struct _cpuid4_info *this_leaf, char *buf) \ +static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \ + unsigned int cpu) \ { \ return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ } @@ -672,7 +934,8 @@ show_one_plus(physical_line_partition, e show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); -static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, + unsigned int cpu) { return sprintf(buf, "%luK\n", this_leaf->size / 1024); } @@ -696,17 +959,20 @@ static ssize_t show_shared_cpu_map_func( return n; } -static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf) +static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf, + unsigned int cpu) { return show_shared_cpu_map_func(leaf, 0, buf); } -static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf) +static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf, + unsigned int cpu) { return show_shared_cpu_map_func(leaf, 1, buf); } -static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf, + unsigned int cpu) { switch (this_leaf->eax.split.type) { case CACHE_TYPE_DATA: @@ -723,82 +989,6 @@ static ssize_t show_type(struct _cpuid4_ #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, - unsigned int index) -{ - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = cpu_to_node(cpu); - struct pci_dev *dev = node_to_k8_nb_misc(node); - unsigned int reg = 0; - - if (!this_leaf->can_disable) - return -EINVAL; - - if (!dev) - return -EINVAL; - - pci_read_config_dword(dev, 0x1BC + index * 4, ®); - return sprintf(buf, "%x\n", reg); -} - -#define SHOW_CACHE_DISABLE(index) \ -static ssize_t \ -show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ -{ \ - return show_cache_disable(this_leaf, buf, index); \ -} -SHOW_CACHE_DISABLE(0) -SHOW_CACHE_DISABLE(1) - -static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, - const char *buf, size_t count, unsigned int index) -{ - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = cpu_to_node(cpu); - struct pci_dev *dev = node_to_k8_nb_misc(node); - unsigned long val = 0; - unsigned int scrubber = 0; - - if (!this_leaf->can_disable) - return -EINVAL; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (!dev) - return -EINVAL; - - if (strict_strtoul(buf, 10, &val) < 0) - return -EINVAL; - - val |= 0xc0000000; - - pci_read_config_dword(dev, 0x58, &scrubber); - scrubber &= ~0x1f000000; - pci_write_config_dword(dev, 0x58, scrubber); - - pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); - wbinvd(); - pci_write_config_dword(dev, 0x1BC + index * 4, val); - return count; -} - -#define STORE_CACHE_DISABLE(index) \ -static ssize_t \ -store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count) \ -{ \ - return store_cache_disable(this_leaf, buf, count, index); \ -} -STORE_CACHE_DISABLE(0) -STORE_CACHE_DISABLE(1) - -struct _cache_attr { - struct attribute attr; - ssize_t (*show)(struct _cpuid4_info *, char *); - ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); -}; - #define define_one_ro(_name) \ static struct _cache_attr _name = \ __ATTR(_name, 0444, show_##_name, NULL) @@ -813,11 +1003,6 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, - show_cache_disable_0, store_cache_disable_0); -static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, - show_cache_disable_1, store_cache_disable_1); - static struct attribute *default_attrs[] = { &type.attr, &level.attr, @@ -828,11 +1013,45 @@ static struct attribute *default_attrs[] &size.attr, &shared_cpu_map.attr, &shared_cpu_list.attr, - &cache_disable_0.attr, - &cache_disable_1.attr, NULL }; +#ifdef CONFIG_AMD_NB +static struct attribute ** __cpuinit amd_l3_attrs(void) +{ + static struct attribute **attrs; + int n; + + if (attrs) + return attrs; + + n = sizeof (default_attrs) / sizeof (struct attribute *); + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + n += 1; + + attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); + if (attrs == NULL) + return attrs = default_attrs; + + for (n = 0; default_attrs[n]; n++) + attrs[n] = default_attrs[n]; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + attrs[n++] = &cache_disable_0.attr; + attrs[n++] = &cache_disable_1.attr; + } + + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + attrs[n++] = &subcaches.attr; + + return attrs; +} +#endif + static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) { struct _cache_attr *fattr = to_attr(attr); @@ -841,7 +1060,7 @@ static ssize_t show(struct kobject *kobj ret = fattr->show ? fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), - buf) : + buf, this_leaf->cpu) : 0; return ret; } @@ -855,12 +1074,12 @@ static ssize_t store(struct kobject *kob ret = fattr->store ? fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), - buf, count) : + buf, count, this_leaf->cpu) : 0; return ret; } -static struct sysfs_ops sysfs_ops = { +static const struct sysfs_ops sysfs_ops = { .show = show, .store = store, }; @@ -920,6 +1139,7 @@ static int __cpuinit cache_add_dev(struc unsigned int cpu = sys_dev->id; unsigned long i, j; struct _index_kobject *this_object; + struct _cpuid4_info *this_leaf; int retval; retval = cpuid4_cache_sysfs_init(cpu); @@ -938,6 +1158,14 @@ static int __cpuinit cache_add_dev(struc this_object = INDEX_KOBJECT_PTR(cpu, i); this_object->cpu = cpu; this_object->index = i; + + this_leaf = CPUID4_INFO_IDX(cpu, i); + + ktype_cache.default_attrs = default_attrs; +#ifdef CONFIG_AMD_NB + if (this_leaf->l3) + ktype_cache.default_attrs = amd_l3_attrs(); +#endif retval = kobject_init_and_add(&(this_object->kobj), &ktype_cache, per_cpu(cache_kobject, cpu), diff -uprN linux-2.6.32/arch/x86/kernel/cpu/Makefile linux-2.6.32.ovz/arch/x86/kernel/cpu/Makefile --- linux-2.6.32/arch/x86/kernel/cpu/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/Makefile 2015-03-10 13:24:14.000000000 -0700 @@ -13,7 +13,9 @@ CFLAGS_common.o := $(nostackp) obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o capflags.o powerflags.o common.o -obj-y += vmware.o hypervisor.o sched.o +obj-y += vmware.o hypervisor.o sched.o mshyperv.o +obj-y += rdrand.o +obj-y += match.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o @@ -29,6 +31,13 @@ obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o +ifdef CONFIG_PERF_EVENTS +obj-$(CONFIG_CPU_SUP_AMD) += perf_event_amd.o perf_event_amd_uncore.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_p6.o perf_event_p4.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o +obj-$(CONFIG_CPU_SUP_INTEL) += perf_event_intel_uncore.o +endif + obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ diff -uprN linux-2.6.32/arch/x86/kernel/cpu/match.c linux-2.6.32.ovz/arch/x86/kernel/cpu/match.c --- linux-2.6.32/arch/x86/kernel/cpu/match.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/match.c 2015-03-10 13:24:00.000000000 -0700 @@ -0,0 +1,48 @@ +#include +#include +#include +#include + +/** + * x86_match_cpu - match current CPU again an array of x86_cpu_ids + * @match: Pointer to array of x86_cpu_ids. Last entry terminated with + * {}. + * + * Return the entry if the current CPU matches the entries in the + * passed x86_cpu_id match table. Otherwise NULL. The match table + * contains vendor (X86_VENDOR_*), family, model and feature bits or + * respective wildcard entries. + * + * A typical table entry would be to match a specific CPU + * { X86_VENDOR_INTEL, 6, 0x12 } + * or to match a specific CPU feature + * { X86_FEATURE_MATCH(X86_FEATURE_FOOBAR) } + * + * Fields can be wildcarded with %X86_VENDOR_ANY, %X86_FAMILY_ANY, + * %X86_MODEL_ANY, %X86_FEATURE_ANY or 0 (except for vendor) + * + * Arrays used to match for this should also be declared using + * MODULE_DEVICE_TABLE(x86_cpu, ...) + * + * This always matches against the boot cpu, assuming models and features are + * consistent over all CPUs. + */ +const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) +{ + const struct x86_cpu_id *m; + struct cpuinfo_x86 *c = &boot_cpu_data; + + for (m = match; m->vendor | m->family | m->model | m->feature; m++) { + if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) + continue; + if (m->family != X86_FAMILY_ANY && c->x86 != m->family) + continue; + if (m->model != X86_MODEL_ANY && c->x86_model != m->model) + continue; + if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) + continue; + return m; + } + return NULL; +} +EXPORT_SYMBOL(x86_match_cpu); diff -uprN linux-2.6.32/arch/x86/kernel/cpu/mcheck/Makefile linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/Makefile --- linux-2.6.32/arch/x86/kernel/cpu/mcheck/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/Makefile 2015-03-10 13:22:01.000000000 -0700 @@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += thres obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o + +obj-$(CONFIG_ACPI_APEI) += mce-apei.o diff -uprN linux-2.6.32/arch/x86/kernel/cpu/mcheck/mce_amd.c linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/mce_amd.c --- linux-2.6.32/arch/x86/kernel/cpu/mcheck/mce_amd.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/mce_amd.c 2015-03-10 13:23:22.000000000 -0700 @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -30,8 +31,6 @@ #include #include -#define PFX "mce_threshold: " -#define VERSION "version 1.1.1" #define NR_BANKS 6 #define NR_BLOCKS 9 #define THRESHOLD_MAX 0xFFF @@ -53,17 +52,12 @@ struct threshold_block { unsigned int cpu; u32 address; u16 interrupt_enable; + bool interrupt_capable; u16 threshold_limit; struct kobject kobj; struct list_head miscj; }; -/* defaults used early on boot */ -static struct threshold_block threshold_defaults = { - .interrupt_enable = 0, - .threshold_limit = THRESHOLD_MAX, -}; - struct threshold_bank { struct kobject *kobj; struct threshold_block *blocks; @@ -88,49 +82,125 @@ static void amd_threshold_interrupt(void struct thresh_restart { struct threshold_block *b; int reset; + int set_lvt_off; + int lvt_off; u16 old_limit; }; -/* must be called with correct cpu affinity */ -/* Called via smp_call_function_single() */ +static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits) +{ + /* + * bank 4 supports APIC LVT interrupts implicitly since forever. + */ + if (bank == 4) + return true; + + /* + * IntP: interrupt present; if this bit is set, the thresholding + * bank can generate APIC LVT interrupts + */ + return msr_high_bits & BIT(28); +} + +static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi) +{ + int msr = (hi & MASK_LVTOFF_HI) >> 20; + + if (apic < 0) { + pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu, + b->bank, b->block, b->address, hi, lo); + return 0; + } + + if (apic != msr) { + pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d " + "for bank %d, block %d (MSR%08X=0x%x%08x)\n", + b->cpu, apic, b->bank, b->block, b->address, hi, lo); + return 0; + } + + return 1; +}; + +/* + * Called via smp_call_function_single(), must be called with correct + * cpu affinity. + */ static void threshold_restart_bank(void *_tr) { struct thresh_restart *tr = _tr; - u32 mci_misc_hi, mci_misc_lo; + u32 hi, lo; - rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + rdmsr(tr->b->address, lo, hi); - if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) + if (tr->b->threshold_limit < (hi & THRESHOLD_MAX)) tr->reset = 1; /* limit cannot be lower than err count */ if (tr->reset) { /* reset err count and overflow bit */ - mci_misc_hi = - (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | + hi = + (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | (THRESHOLD_MAX - tr->b->threshold_limit); } else if (tr->old_limit) { /* change limit w/o reset */ - int new_count = (mci_misc_hi & THRESHOLD_MAX) + + int new_count = (hi & THRESHOLD_MAX) + (tr->old_limit - tr->b->threshold_limit); - mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | + hi = (hi & ~MASK_ERR_COUNT_HI) | (new_count & THRESHOLD_MAX); } - tr->b->interrupt_enable ? - (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : - (mci_misc_hi &= ~MASK_INT_TYPE_HI); + /* clear IntType */ + hi &= ~MASK_INT_TYPE_HI; + + if (!tr->b->interrupt_capable) + goto done; - mci_misc_hi |= MASK_COUNT_EN_HI; - wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); + if (tr->set_lvt_off) { + if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) { + /* set new lvt offset */ + hi &= ~MASK_LVTOFF_HI; + hi |= tr->lvt_off << 20; + } + } + + if (tr->b->interrupt_enable) + hi |= INT_TYPE_APIC; + + done: + + hi |= MASK_COUNT_EN_HI; + wrmsr(tr->b->address, lo, hi); +} + +static void mce_threshold_block_init(struct threshold_block *b, int offset) +{ + struct thresh_restart tr = { + .b = b, + .set_lvt_off = 1, + .lvt_off = offset, + }; + + b->threshold_limit = THRESHOLD_MAX; + threshold_restart_bank(&tr); +}; + +static int setup_APIC_mce(int reserved, int new) +{ + if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR, + APIC_EILVT_MSG_FIX, 0)) + return new; + + return reserved; } /* cpu init entry point, called from mce.c with preempt off */ void mce_amd_feature_init(struct cpuinfo_x86 *c) { + struct threshold_block b; unsigned int cpu = smp_processor_id(); u32 low = 0, high = 0, address = 0; unsigned int bank, block; - struct thresh_restart tr; - u8 lvt_off; + int offset = -1; for (bank = 0; bank < NR_BANKS; ++bank) { for (block = 0; block < NR_BLOCKS; ++block) { @@ -140,6 +210,7 @@ void mce_amd_feature_init(struct cpuinfo address = (low & MASK_BLKPTR_LO) >> 21; if (!address) break; + address += MCG_XBLK_ADDR; } else ++address; @@ -147,12 +218,8 @@ void mce_amd_feature_init(struct cpuinfo if (rdmsr_safe(address, &low, &high)) break; - if (!(high & MASK_VALID_HI)) { - if (block) - continue; - else - break; - } + if (!(high & MASK_VALID_HI)) + continue; if (!(high & MASK_CNTP_HI) || (high & MASK_LOCKED_HI)) @@ -164,19 +231,19 @@ void mce_amd_feature_init(struct cpuinfo if (shared_bank[bank] && c->cpu_core_id) break; #endif - lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR, - APIC_EILVT_MSG_FIX, 0); - - high &= ~MASK_LVTOFF_HI; - high |= lvt_off << 20; - wrmsr(address, low, high); - - threshold_defaults.address = address; - tr.b = &threshold_defaults; - tr.reset = 0; - tr.old_limit = 0; - threshold_restart_bank(&tr); + memset(&b, 0, sizeof(b)); + b.cpu = cpu; + b.bank = bank; + b.block = block; + b.address = address; + b.interrupt_capable = lvt_interrupt_supported(bank, high); + + if (b.interrupt_capable) { + int new = (high & MASK_LVTOFF_HI) >> 20; + offset = setup_APIC_mce(offset, new); + } + mce_threshold_block_init(&b, offset); mce_threshold_vector = amd_threshold_interrupt; } } @@ -274,14 +341,16 @@ store_interrupt_enable(struct threshold_ struct thresh_restart tr; unsigned long new; + if (!b->interrupt_capable) + return -EINVAL; + if (strict_strtoul(buf, 0, &new) < 0) return -EINVAL; b->interrupt_enable = !!new; + memset(&tr, 0, sizeof(tr)); tr.b = b; - tr.reset = 0; - tr.old_limit = 0; smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); @@ -302,10 +371,10 @@ store_threshold_limit(struct threshold_b if (new < 1) new = 1; + memset(&tr, 0, sizeof(tr)); tr.old_limit = b->threshold_limit; b->threshold_limit = new; tr.b = b; - tr.reset = 0; smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); @@ -356,10 +425,10 @@ RW_ATTR(threshold_limit); RW_ATTR(error_count); static struct attribute *default_attrs[] = { - &interrupt_enable.attr, &threshold_limit.attr, &error_count.attr, - NULL + NULL, /* possibly interrupt_enable if supported, see below */ + NULL, }; #define to_block(k) container_of(k, struct threshold_block, kobj) @@ -388,7 +457,7 @@ static ssize_t store(struct kobject *kob return ret; } -static struct sysfs_ops threshold_ops = { +static const struct sysfs_ops threshold_ops = { .show = show, .store = store, }; @@ -433,8 +502,14 @@ static __cpuinit int allocate_threshold_ b->cpu = cpu; b->address = address; b->interrupt_enable = 0; + b->interrupt_capable = lvt_interrupt_supported(bank, high); b->threshold_limit = THRESHOLD_MAX; + if (b->interrupt_capable) + threshold_ktype.default_attrs[2] = &interrupt_enable.attr; + else + threshold_ktype.default_attrs[2] = NULL; + INIT_LIST_HEAD(&b->miscj); if (per_cpu(threshold_banks, cpu)[bank]->blocks) { @@ -529,7 +604,7 @@ static __cpuinit int threshold_create_ba err = -ENOMEM; goto out; } - if (!alloc_cpumask_var(&b->cpus, GFP_KERNEL)) { + if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) { kfree(b); err = -ENOMEM; goto out; @@ -542,7 +617,7 @@ static __cpuinit int threshold_create_ba #ifndef CONFIG_SMP cpumask_setall(b->cpus); #else - cpumask_copy(b->cpus, c->llc_shared_map); + cpumask_set_cpu(cpu, b->cpus); #endif per_cpu(threshold_banks, cpu)[bank] = b; @@ -584,9 +659,9 @@ static __cpuinit int threshold_create_de continue; err = threshold_create_bank(cpu, bank); if (err) - goto out; + return err; } -out: + return err; } diff -uprN linux-2.6.32/arch/x86/kernel/cpu/mcheck/mce-apei.c linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/mce-apei.c --- linux-2.6.32/arch/x86/kernel/cpu/mcheck/mce-apei.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/mce-apei.c 2015-03-10 13:24:13.000000000 -0700 @@ -0,0 +1,148 @@ +/* + * Bridge between MCE and APEI + * + * On some machine, corrected memory errors are reported via APEI + * generic hardware error source (GHES) instead of corrected Machine + * Check. These corrected memory errors can be reported to user space + * through /dev/mcelog via faking a corrected Machine Check, so that + * the error memory page can be offlined by /sbin/mcelog if the error + * count for one page is beyond the threshold. + * + * For fatal MCE, save MCE record into persistent storage via ERST, so + * that the MCE record can be logged after reboot via ERST. + * + * Copyright 2010 Intel Corp. + * Author: Huang Ying + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include + +#include "mce-internal.h" + +void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) +{ + struct mce m; + + /* Only corrected MC is reported */ + if (!corrected || !(mem_err->validation_bits & CPER_MEM_VALID_PA)) + return; + + mce_setup(&m); + m.bank = 1; + /* Fake a memory read corrected error with unknown channel */ + m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; + m.addr = mem_err->physical_addr; + mce_log(&m); + mce_notify_irq(); +} +EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); + +#define CPER_CREATOR_MCE \ + UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ + 0x64, 0x90, 0xb8, 0x9d) +#define CPER_SECTION_TYPE_MCE \ + UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \ + 0x04, 0x4a, 0x38, 0xfc) + +/* + * CPER specification (in UEFI specification 2.3 appendix N) requires + * byte-packed. + */ +struct cper_mce_record { + struct cper_record_header hdr; + struct cper_section_descriptor sec_hdr; + struct mce mce; +} __packed; + +int apei_write_mce(struct mce *m) +{ + struct cper_mce_record rcd; + + memset(&rcd, 0, sizeof(rcd)); + memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE); + rcd.hdr.revision = CPER_RECORD_REV; + rcd.hdr.signature_end = CPER_SIG_END; + rcd.hdr.section_count = 1; + rcd.hdr.error_severity = CPER_SEV_FATAL; + /* timestamp, platform_id, partition_id are all invalid */ + rcd.hdr.validation_bits = 0; + rcd.hdr.record_length = sizeof(rcd); + rcd.hdr.creator_id = CPER_CREATOR_MCE; + rcd.hdr.notification_type = CPER_NOTIFY_MCE; + rcd.hdr.record_id = cper_next_record_id(); + rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR; + + rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd; + rcd.sec_hdr.section_length = sizeof(rcd.mce); + rcd.sec_hdr.revision = CPER_SEC_REV; + /* fru_id and fru_text is invalid */ + rcd.sec_hdr.validation_bits = 0; + rcd.sec_hdr.flags = CPER_SEC_PRIMARY; + rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE; + rcd.sec_hdr.section_severity = CPER_SEV_FATAL; + + memcpy(&rcd.mce, m, sizeof(*m)); + + return erst_write(&rcd.hdr); +} + +ssize_t apei_read_mce(struct mce *m, u64 *record_id) +{ + struct cper_mce_record rcd; + int rc, pos; + + rc = erst_get_record_id_begin(&pos); + if (rc) + return rc; +retry: + rc = erst_get_record_id_next(&pos, record_id); + if (rc) + goto out; + /* no more record */ + if (*record_id == APEI_ERST_INVALID_RECORD_ID) + goto out; + rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd)); + /* someone else has cleared the record, try next one */ + if (rc == -ENOENT) + goto retry; + else if (rc < 0) + goto out; + /* try to skip other type records in storage */ + else if (rc != sizeof(rcd) || + uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) + goto retry; + memcpy(m, &rcd.mce, sizeof(*m)); + rc = sizeof(*m); +out: + erst_get_record_id_end(); + + return rc; +} + +/* Check whether there is record in ERST */ +int apei_check_mce(void) +{ + return erst_get_record_count(); +} + +int apei_clear_mce(u64 record_id) +{ + return erst_clear(record_id); +} diff -uprN linux-2.6.32/arch/x86/kernel/cpu/mcheck/mce.c linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/mce.c --- linux-2.6.32/arch/x86/kernel/cpu/mcheck/mce.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/mce.c 2015-03-10 13:24:10.000000000 -0700 @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -46,6 +47,11 @@ #include "mce-internal.h" +static DEFINE_MUTEX(mce_read_mutex); + +#define CREATE_TRACE_POINTS +#include + int mce_disabled __read_mostly; #define MISC_MCELOG_MINOR 227 @@ -56,26 +62,28 @@ atomic_t mce_entry; DEFINE_PER_CPU(unsigned, mce_exception_count); -/* - * Tolerant levels: - * 0: always panic on uncorrected errors, log corrected errors - * 1: panic or SIGBUS on uncorrected errors, log corrected errors - * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors - * 3: never panic or SIGBUS, log all errors (for testing only) - */ -static int tolerant __read_mostly = 1; -static int banks __read_mostly; static int rip_msr __read_mostly; static int mce_bootlog __read_mostly = -1; static int monarch_timeout __read_mostly = -1; static int mce_panic_timeout __read_mostly; -static int mce_dont_log_ce __read_mostly; int mce_cmci_disabled __read_mostly; int mce_ignore_ce __read_mostly; int mce_ser __read_mostly; +int mce_bios_cmci_threshold __read_mostly; struct mce_bank *mce_banks __read_mostly; +struct mca_config mca_cfg __read_mostly = { + /* + * Tolerant levels: + * 0: always panic on uncorrected errors, log corrected errors + * 1: panic or SIGBUS on uncorrected errors, log corrected errors + * 2: SIGBUS or log uncorrected errors (if possible), log corr. errors + * 3: never panic or SIGBUS, log all errors (for testing only) + */ + .tolerant = 1 +}; + /* User mode helper program triggered by machine check event */ static unsigned long mce_need_notify; static char mce_helper[128]; @@ -85,26 +93,33 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait) static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; -static void default_decode_mce(struct mce *m) -{ - pr_emerg("No human readable MCE decoding support on this CPU type.\n"); - pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); -} - -/* - * CPU/chipset specific EDAC code can register a callback here to print - * MCE errors in a human-readable form: - */ -void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce; -EXPORT_SYMBOL(x86_mce_decode_callback); +/* CMCI storm detection filter */ +static DEFINE_PER_CPU(unsigned long, mce_polled_error); /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL }; +/* + * MCA banks controlled through firmware first for corrected errors. + * This is a global list of banks for which we won't enable CMCI and we + * won't poll. Firmware controls these banks and is responsible for + * reporting corrected errors through GHES. Uncorrected/recoverable + * errors are still notified through a machine check. + */ +mce_banks_t mce_banks_ce_disabled; + static DEFINE_PER_CPU(struct work_struct, mce_work); +static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs); + +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. + */ +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -140,12 +155,21 @@ static struct mce_log mcelog = { void mce_log(struct mce *mce) { unsigned next, entry; + int ret = 0; + + /* Emit the trace record: */ + trace_mce_record(mce); + + ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + if (ret == NOTIFY_STOP) + return; mce->finished = 0; wmb(); for (;;) { entry = rcu_dereference(mcelog.next); for (;;) { + /* * When the buffer fills up discard new entries. * Assume that the earlier errors are the more @@ -177,13 +201,66 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } +static void drain_mcelog_buffer(void) +{ + unsigned int next, i, prev = 0; + + next = rcu_dereference(mcelog.next); + + do { + struct mce *m; + + /* drain what was logged during boot */ + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + unsigned retries = 1; + + m = &mcelog.entry[i]; + + while (!m->finished) { + if (time_after_eq(jiffies, start + 2*retries)) + retries++; + + cpu_relax(); + + if (!m->finished && retries >= 4) { + pr_err("MCE: skipping error being logged currently!\n"); + break; + } + } + smp_rmb(); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + } + + memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); +} + + +void mce_register_decode_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); + drain_mcelog_buffer(); +} +EXPORT_SYMBOL_GPL(mce_register_decode_chain); + +void mce_unregister_decode_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); + static void print_mce(struct mce *m) { - pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + int ret = 0; + + pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); if (m->ip) { - pr_emerg("RIP%s %02x:<%016Lx> ", + pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", m->cs, m->ip); @@ -192,31 +269,29 @@ static void print_mce(struct mce *m) pr_cont("\n"); } - pr_emerg("TSC %llx ", m->tsc); + pr_emerg(HW_ERR "TSC %llx ", m->tsc); if (m->addr) pr_cont("ADDR %llx ", m->addr); if (m->misc) pr_cont("MISC %llx ", m->misc); pr_cont("\n"); - pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", - m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); + /* + * Note this output is parsed by external tools and old fields + * should not be changed. + */ + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %u\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, + cpu_data(m->extcpu).microcode); /* * Print out human-readable details about the MCE error, - * (if the CPU has an implementation for that): + * (if the CPU has an implementation for that) */ - x86_mce_decode_callback(m); -} - -static void print_mce_head(void) -{ - pr_emerg("\nHARDWARE ERROR\n"); -} - -static void print_mce_tail(void) -{ - pr_emerg("This is not a software problem!\n"); + ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + if (ret != NOTIFY_STOP) + printk_ratelimited(KERN_EMERG HW_ERR + "Run the above through 'mcelog --ascii'\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -242,7 +317,7 @@ static void wait_for_panic(void) static void mce_panic(char *msg, struct mce *final, char *exp) { - int i; + int i, apei_err = 0; if (!fake_panic) { /* @@ -259,14 +334,16 @@ static void mce_panic(char *msg, struct if (atomic_inc_return(&mce_fake_paniced) > 1) return; } - print_mce_head(); /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { struct mce *m = &mcelog.entry[i]; if (!(m->status & MCI_STATUS_VAL)) continue; - if (!(m->status & MCI_STATUS_UC)) + if (!(m->status & MCI_STATUS_UC)) { print_mce(m); + if (!apei_err) + apei_err = apei_write_mce(m); + } } /* Now print uncorrected but with the final one last */ for (i = 0; i < MCE_LOG_LEN; i++) { @@ -275,22 +352,27 @@ static void mce_panic(char *msg, struct continue; if (!(m->status & MCI_STATUS_UC)) continue; - if (!final || memcmp(m, final, sizeof(struct mce))) + if (!final || memcmp(m, final, sizeof(struct mce))) { print_mce(m); + if (!apei_err) + apei_err = apei_write_mce(m); + } } - if (final) + if (final) { print_mce(final); + if (!apei_err) + apei_err = apei_write_mce(final); + } if (cpu_missing) - printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); - print_mce_tail(); + pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); if (exp) - printk(KERN_EMERG "Machine check: %s\n", exp); + pr_emerg(HW_ERR "Machine check: %s\n", exp); if (!fake_panic) { if (panic_timeout == 0) panic_timeout = mce_panic_timeout; panic(msg); } else - printk(KERN_EMERG "Fake kernel panic: %s\n", msg); + pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); } /* Support code for software error injection */ @@ -351,6 +433,31 @@ static void mce_wrmsrl(u32 msr, u64 v) } /* + * Collect all global (w.r.t. this processor) status about this machine + * check into our "mce" struct so that we can use it later to assess + * the severity of the problem as we read per-bank specific details. + */ +static inline void mce_gather_info(struct mce *m, struct pt_regs *regs) +{ + mce_setup(m); + + m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + if (regs) { + /* + * Get the address of the instruction at the time of + * the machine check error. + */ + if (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV)) { + m->ip = regs->ip; + m->cs = regs->cs; + } + /* Use accurate RIP reporting if available. */ + if (rip_msr) + m->ip = mce_rdmsrl(rip_msr); + } +} + +/* * Simple lockless ring to communicate PFNs from the exception handler with the * process context work function. This is vastly simplified because there's * only a single reader and a single writer. @@ -421,24 +528,6 @@ static void mce_schedule_work(void) } } -/* - * Get the address of the instruction at the time of the machine check - * error. - */ -static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) -{ - - if (regs && (m->mcgstatus & (MCG_STATUS_RIPV|MCG_STATUS_EIPV))) { - m->ip = regs->ip; - m->cs = regs->cs; - } else { - m->ip = 0; - m->cs = 0; - } - if (rip_msr) - m->ip = mce_rdmsrl(rip_msr); -} - #ifdef CONFIG_X86_LOCAL_APIC /* * Called after interrupts have been reenabled again @@ -495,6 +584,39 @@ static void mce_report_event(struct pt_r #endif } +static int westmere; + +static int +mce_quirk(int bank, struct mce *m) +{ + if (westmere && bank == 6 && ((m->status >> 16) & 0xffff) == 0x2000) { + mce_wrmsrl(MSR_IA32_MCx_STATUS(6), 0); + return 1; + } + return 0; +} + +/* + * Read ADDR and MISC registers. + */ +static void mce_read_aux(struct mce *m, int i) +{ + if (m->status & MCI_STATUS_MISCV) + m->misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); + if (m->status & MCI_STATUS_ADDRV) { + m->addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + + /* + * Mask the reported address by the reported granularity. + */ + if (mce_ser && (m->status & MCI_STATUS_MISCV)) { + u8 shift = MCI_MISC_ADDR_LSB(m->misc); + m->addr >>= shift; + m->addr <<= shift; + } + } +} + DEFINE_PER_CPU(unsigned, mce_poll_count); /* @@ -516,13 +638,13 @@ void machine_check_poll(enum mcp_flags f { struct mce m; int i; + unsigned long *v; - __get_cpu_var(mce_poll_count)++; + percpu_inc(mce_poll_count); - mce_setup(&m); + mce_gather_info(&m, NULL); - m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; @@ -536,6 +658,12 @@ void machine_check_poll(enum mcp_flags f if (!(m.status & MCI_STATUS_VAL)) continue; + if (mce_quirk(i, &m)) + continue; + + v = &get_cpu_var(mce_polled_error); + set_bit(0, v); + put_cpu_var(mce_polled_error); /* * Uncorrected or signalled events are handled by the exception * handler when it is enabled, so don't process those here. @@ -546,10 +674,7 @@ void machine_check_poll(enum mcp_flags f (m.status & (mce_ser ? MCI_STATUS_S : MCI_STATUS_UC))) continue; - if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); - if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + mce_read_aux(&m, i); if (!(flags & MCP_TIMESTAMP)) m.tsc = 0; @@ -557,10 +682,8 @@ void machine_check_poll(enum mcp_flags f * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { + if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) mce_log(&m); - add_taint(TAINT_MACHINE_CHECK); - } /* * Clear state for this bank. @@ -581,16 +704,22 @@ EXPORT_SYMBOL_GPL(machine_check_poll); * Do a quick check if any of the events requires a panic. * This decides if we keep the events around or clear them. */ -static int mce_no_way_out(struct mce *m, char **msg) +static int mce_no_way_out(struct mce *m, char **msg, unsigned long *validp, + struct pt_regs *regs) { - int i; + int i, ret = 0; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { m->status = mce_rdmsrl(MSR_IA32_MCx_STATUS(i)); - if (mce_severity(m, tolerant, msg) >= MCE_PANIC_SEVERITY) - return 1; + if (m->status & MCI_STATUS_VAL) { + __set_bit(i, validp); + if (quirk_no_way_out) + quirk_no_way_out(i, m, regs); + } + if (mce_severity(m, mca_cfg.tolerant, msg) >= MCE_PANIC_SEVERITY) + ret = 1; } - return 0; + return ret; } /* @@ -622,7 +751,7 @@ static int mce_timed_out(u64 *t) goto out; if ((s64)*t < SPINUNIT) { /* CHECKME: Make panic default for 1 too? */ - if (tolerant < 1) + if (mca_cfg.tolerant < 1) mce_panic("Timeout synchronizing machine check over CPUs", NULL, NULL); cpu_missing = 1; @@ -672,7 +801,8 @@ static void mce_reign(void) * Grade the severity of the errors of all the CPUs. */ for_each_possible_cpu(cpu) { - int severity = mce_severity(&per_cpu(mces_seen, cpu), tolerant, + int severity = mce_severity(&per_cpu(mces_seen, cpu), + mca_cfg.tolerant, &nmsg); if (severity > global_worst) { msg = nmsg; @@ -686,7 +816,7 @@ static void mce_reign(void) * This dumps all the mces in the log buffer and stops the * other CPUs. */ - if (m && global_worst >= MCE_PANIC_SEVERITY && tolerant < 3) + if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) mce_panic("Fatal Machine check", m, msg); /* @@ -699,7 +829,7 @@ static void mce_reign(void) * No machine check event found. Must be some external * source or one CPU is hung. Panic. */ - if (global_worst <= MCE_KEEP_SEVERITY && tolerant < 3) + if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) mce_panic("Machine check from unknown source", NULL, NULL); /* @@ -857,9 +987,9 @@ static int mce_usable_address(struct mce { if (!(m->status & MCI_STATUS_MISCV) || !(m->status & MCI_STATUS_ADDRV)) return 0; - if ((m->misc & 0x3f) > PAGE_SHIFT) + if (MCI_MISC_ADDR_LSB(m->misc) > PAGE_SHIFT) return 0; - if (((m->misc >> 6) & 7) != MCM_ADDR_PHYS) + if (MCI_MISC_ADDR_MODE(m->misc) != MCI_MISC_ADDR_PHYS) return 0; return 1; } @@ -868,13 +998,58 @@ static void mce_clear_state(unsigned lon { int i; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { if (test_bit(i, toclear)) mce_wrmsrl(MSR_IA32_MCx_STATUS(i), 0); } } /* + * Need to save faulting physical address associated with a process + * in the machine check handler some place where we can grab it back + * later in mce_notify_process() + */ +#define MCE_INFO_MAX 16 + +struct mce_info { + atomic_t inuse; + struct task_struct *t; + __u64 paddr; + int restartable; +} mce_info[MCE_INFO_MAX]; + +static void mce_save_info(__u64 addr, int c) +{ + struct mce_info *mi; + + for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) { + if (atomic_cmpxchg(&mi->inuse, 0, 1) == 0) { + mi->t = current; + mi->paddr = addr; + mi->restartable = c; + return; + } + } + + mce_panic("Too many concurrent recoverable errors", NULL, NULL); +} + +static struct mce_info *mce_find_info(void) +{ + struct mce_info *mi; + + for (mi = mce_info; mi < &mce_info[MCE_INFO_MAX]; mi++) + if (atomic_read(&mi->inuse) && mi->t == current) + return mi; + return NULL; +} + +static void mce_clear_info(struct mce_info *mi) +{ + atomic_set(&mi->inuse, 0); +} + +/* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. * @@ -899,7 +1074,7 @@ void do_machine_check(struct pt_regs *re int order; /* * If no_way_out gets set, there is no safe way to recover from this - * MCE. If tolerant is cranked up, we'll try anyway. + * MCE. If mca_cfg.tolerant is cranked up, we'll try anyway. */ int no_way_out = 0; /* @@ -908,30 +1083,30 @@ void do_machine_check(struct pt_regs *re */ int kill_it = 0; DECLARE_BITMAP(toclear, MAX_NR_BANKS); + DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; atomic_inc(&mce_entry); - __get_cpu_var(mce_exception_count)++; + percpu_inc(mce_exception_count); - if (notify_die(DIE_NMI, "machine check", regs, error_code, - 18, SIGKILL) == NOTIFY_STOP) - goto out; - if (!banks) + if (!mca_cfg.banks) goto out; - mce_setup(&m); + mce_gather_info(&m, regs); - m.mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); final = &__get_cpu_var(mces_seen); *final = m; - no_way_out = mce_no_way_out(&m, &msg); + memset(valid_banks, 0, sizeof(valid_banks)); + no_way_out = mce_no_way_out(&m, &msg, valid_banks, regs); barrier(); /* - * When no restart IP must always kill or panic. + * When no restart IP might need to kill or panic. + * Assume the worst for now, but if we find the + * severity is MCE_AR_SEVERITY we have other options. */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) kill_it = 1; @@ -942,8 +1117,10 @@ void do_machine_check(struct pt_regs *re * because the first one to see it will clear it. */ order = mce_start(&no_way_out); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { __clear_bit(i, toclear); + if (!test_bit(i, valid_banks)) + continue; if (!mce_banks[i].ctl) continue; @@ -968,7 +1145,7 @@ void do_machine_check(struct pt_regs *re */ add_taint(TAINT_MACHINE_CHECK); - severity = mce_severity(&m, tolerant, NULL); + severity = mce_severity(&m, mca_cfg.tolerant, NULL); /* * When machine check was for corrected handler don't touch, @@ -985,28 +1162,18 @@ void do_machine_check(struct pt_regs *re continue; } - /* - * Kill on action required. - */ - if (severity == MCE_AR_SEVERITY) - kill_it = 1; - - if (m.status & MCI_STATUS_MISCV) - m.misc = mce_rdmsrl(MSR_IA32_MCx_MISC(i)); - if (m.status & MCI_STATUS_ADDRV) - m.addr = mce_rdmsrl(MSR_IA32_MCx_ADDR(i)); + mce_read_aux(&m, i); /* * Action optional error. Queue address for later processing. * When the ring overflows we just ignore the AO error. * RED-PEN add some logging mechanism when * usable_address or mce_add_ring fails. - * RED-PEN don't ignore overflow for tolerant == 0 + * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0 */ if (severity == MCE_AO_SEVERITY && mce_usable_address(&m)) mce_ring_add(m.addr >> PAGE_SHIFT); - mce_get_rip(&m, regs); mce_log(&m); if (severity > worst) { @@ -1015,6 +1182,9 @@ void do_machine_check(struct pt_regs *re } } + /* mce_clear_state will clear *final, save locally for use later */ + m = *final; + if (!no_way_out) mce_clear_state(toclear); @@ -1026,27 +1196,22 @@ void do_machine_check(struct pt_regs *re no_way_out = worst >= MCE_PANIC_SEVERITY; /* - * If we have decided that we just CAN'T continue, and the user - * has not set tolerant to an insane level, give up and die. - * - * This is mainly used in the case when the system doesn't - * support MCE broadcasting or it has been disabled. - */ - if (no_way_out && tolerant < 3) - mce_panic("Fatal machine check on current CPU", final, msg); - - /* - * If the error seems to be unrecoverable, something should be - * done. Try to kill as little as possible. If we can kill just - * one task, do that. If the user has set the tolerance very - * high, don't try to do anything at all. - */ - - if (kill_it && tolerant < 3) - force_sig(SIGBUS, current); - - /* notify userspace ASAP */ - set_thread_flag(TIF_MCE_NOTIFY); + * At insane "tolerant" levels we take no action. Otherwise + * we only die if we have no other choice. For less serious + * issues we try to recover, or limit damage to the current + * process. + */ + if (mca_cfg.tolerant < 3) { + if (no_way_out) + mce_panic("Fatal machine check on current CPU", &m, msg); + if (worst == MCE_AR_SEVERITY) { + /* schedule action before return to userland */ + mce_save_info(m.addr, m.mcgstatus & MCG_STATUS_RIPV); + set_thread_flag(TIF_MCE_NOTIFY); + } else if (kill_it) { + force_sig(SIGBUS, current); + } + } if (worst > 0) mce_report_event(regs); @@ -1057,34 +1222,65 @@ out: } EXPORT_SYMBOL_GPL(do_machine_check); -/* dummy to break dependency. actual code is in mm/memory-failure.c */ -void __attribute__((weak)) memory_failure(unsigned long pfn, int vector) +#ifndef CONFIG_MEMORY_FAILURE +int memory_failure(unsigned long pfn, int vector, int flags) { - printk(KERN_ERR "Action optional memory failure at %lx ignored\n", pfn); + /* mce_severity() should not hand us an ACTION_REQUIRED error */ + BUG_ON(flags & MF_ACTION_REQUIRED); + printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n" + "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn); + + return 0; } +#endif /* - * Called after mce notification in process context. This code - * is allowed to sleep. Call the high level VM handler to process - * any corrupted pages. - * Assume that the work queue code only calls this one at a time - * per CPU. - * Note we don't disable preemption, so this code might run on the wrong - * CPU. In this case the event is picked up by the scheduled work queue. - * This is merely a fast path to expedite processing in some common - * cases. + * Called in process context that interrupted by MCE and marked with + * TIF_MCE_NOTIFY, just before returning to erroneous userland. + * This code is allowed to sleep. + * Attempt possible recovery such as calling the high level VM handler to + * process any corrupted pages, and kill/signal current process if required. + * Action required errors are handled here. */ void mce_notify_process(void) { unsigned long pfn; - mce_notify_irq(); - while (mce_ring_get(&pfn)) - memory_failure(pfn, MCE_VECTOR); + struct mce_info *mi = mce_find_info(); + int flags = MF_ACTION_REQUIRED; + + if (!mi) + mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL); + pfn = mi->paddr >> PAGE_SHIFT; + + clear_thread_flag(TIF_MCE_NOTIFY); + + pr_err("Uncorrected hardware memory error in user-access at %llx", + mi->paddr); + /* + * We must call memory_failure() here even if the current process is + * doomed. We still need to mark the page as poisoned and alert any + * other users of the page. + */ + if (!mi->restartable) + flags |= MF_MUST_KILL; + if (memory_failure(pfn, MCE_VECTOR, flags) < 0) { + pr_err("Memory error not recovered"); + force_sig(SIGBUS, current); + } + mce_clear_info(mi); } +/* + * Action optional processing happens here (picking up + * from the list of faulting pages that do_machine_check() + * placed into the "ring"). + */ static void mce_process_work(struct work_struct *dummy) { - mce_notify_process(); + unsigned long pfn; + + while (mce_ring_get(&pfn)) + memory_failure(pfn, MCE_VECTOR, 0); } #ifdef CONFIG_X86_MCE_INTEL @@ -1117,35 +1313,78 @@ void mce_log_therm_throt_event(__u64 sta * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). */ -static int check_interval = 5 * 60; /* 5 minutes */ +static unsigned long check_interval = 5 * 60; /* 5 minutes */ -static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ +static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); -static void mcheck_timer(unsigned long data) +static unsigned long mce_adjust_timer_default(unsigned long interval) { - struct timer_list *t = &per_cpu(mce_timer, data); - int *n; + return interval; +} + +static unsigned long (*mce_adjust_timer)(unsigned long interval) = + mce_adjust_timer_default; + +static int cmc_error_seen(void) +{ + unsigned long *v = &__get_cpu_var(mce_polled_error); + + return test_and_clear_bit(0, v); +} + +static void mce_timer_fn(unsigned long data) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned long *iv; + int notify; WARN_ON(smp_processor_id() != data); if (mce_available(¤t_cpu_data)) { machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_poll_banks)); + mce_intel_cmci_poll(); } /* * Alert userspace if needed. If we logged an MCE, reduce the * polling interval, otherwise increase the polling interval. */ - n = &__get_cpu_var(mce_next_interval); - if (mce_notify_irq()) - *n = max(*n/2, HZ/100); - else - *n = min(*n*2, (int)round_jiffies_relative(check_interval*HZ)); + iv = &__get_cpu_var(mce_next_interval); + notify = mce_notify_irq(); + notify |= cmc_error_seen(); + if (notify) { + *iv = max(*iv / 2, (unsigned long) HZ/100); + } else { + *iv = min(*iv * 2, round_jiffies_relative(check_interval * HZ)); + *iv = mce_adjust_timer(*iv); + } + /* Might have become 0 after CMCI storm subsided */ + if (*iv) { + t->expires = jiffies + *iv; + add_timer_on(t, smp_processor_id()); + } +} + +/* + * Ensure that the timer is firing in @interval from now. + */ +void mce_timer_kick(unsigned long interval) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned long when = jiffies + interval; + unsigned long *iv = &__get_cpu_var(mce_next_interval); - t->expires = jiffies + *n; - add_timer_on(t, smp_processor_id()); + if (timer_pending(t)) { + if (time_before(when, t->expires)) + mod_timer_pinned(t, when); + } else { + t->expires = round_jiffies(when); + add_timer_on(t, smp_processor_id()); + } + if (interval < *iv) + *iv = interval; } static void mce_do_trigger(struct work_struct *work) @@ -1165,8 +1404,6 @@ int mce_notify_irq(void) /* Not more than two messages every minute */ static DEFINE_RATELIMIT_STATE(ratelimit, 60*HZ, 2); - clear_thread_flag(TIF_MCE_NOTIFY); - if (test_and_clear_bit(0, &mce_need_notify)) { wake_up_interruptible(&mce_wait); @@ -1179,7 +1416,7 @@ int mce_notify_irq(void) schedule_work(&mce_trigger_work); if (__ratelimit(&ratelimit)) - printk(KERN_INFO "Machine check events logged\n"); + pr_info(HW_ERR "Machine check events logged\n"); return 1; } @@ -1187,14 +1424,16 @@ int mce_notify_irq(void) } EXPORT_SYMBOL_GPL(mce_notify_irq); -static int mce_banks_init(void) +static int __cpuinit __mcheck_cpu_mce_banks_init(void) { int i; + u8 num_banks = mca_cfg.banks; - mce_banks = kzalloc(banks * sizeof(struct mce_bank), GFP_KERNEL); + mce_banks = kzalloc(num_banks * sizeof(struct mce_bank), GFP_KERNEL); if (!mce_banks) return -ENOMEM; - for (i = 0; i < banks; i++) { + + for (i = 0; i < num_banks; i++) { struct mce_bank *b = &mce_banks[i]; b->ctl = -1ULL; @@ -1206,7 +1445,7 @@ static int mce_banks_init(void) /* * Initialize Machine Checks for a CPU. */ -static int __cpuinit mce_cap_init(void) +static int __cpuinit __mcheck_cpu_cap_init(void) { unsigned b; u64 cap; @@ -1214,7 +1453,7 @@ static int __cpuinit mce_cap_init(void) rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; - if (!banks) + if (!mca_cfg.banks) printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); if (b > MAX_NR_BANKS) { @@ -1225,10 +1464,11 @@ static int __cpuinit mce_cap_init(void) } /* Don't support asymmetric configurations today */ - WARN_ON(banks != 0 && b != banks); - banks = b; + WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks); + mca_cfg.banks = b; + if (!mce_banks) { - int err = mce_banks_init(); + int err = __mcheck_cpu_mce_banks_init(); if (err) return err; @@ -1244,7 +1484,7 @@ static int __cpuinit mce_cap_init(void) return 0; } -static void mce_init(void) +static void __mcheck_cpu_init_generic(void) { mce_banks_t all_banks; u64 cap; @@ -1262,7 +1502,7 @@ static void mce_init(void) if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (!b->init) @@ -1272,9 +1512,39 @@ static void mce_init(void) } } +/* + * During IFU recovery Sandy Bridge -EP4S processors set the RIPV and + * EIPV bits in MCG_STATUS to zero on the affected logical processor (SDM + * Vol 3B Table 15-20). But this confuses both the code that determines + * whether the machine check occurred in kernel or user mode, and also + * the severity assessment code. Pretend that EIPV was set, and take the + * ip/cs values from the pt_regs that mce_gather_info() ignored earlier. + */ +static void quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) +{ + if (bank != 0) + return; + if ((m->mcgstatus & (MCG_STATUS_EIPV|MCG_STATUS_RIPV)) != 0) + return; + if ((m->status & (MCI_STATUS_OVER|MCI_STATUS_UC| + MCI_STATUS_EN|MCI_STATUS_MISCV|MCI_STATUS_ADDRV| + MCI_STATUS_PCC|MCI_STATUS_S|MCI_STATUS_AR| + MCACOD)) != + (MCI_STATUS_UC|MCI_STATUS_EN| + MCI_STATUS_MISCV|MCI_STATUS_ADDRV|MCI_STATUS_S| + MCI_STATUS_AR|MCACOD_INSTR)) + return; + + m->mcgstatus |= MCG_STATUS_EIPV; + m->ip = regs->ip; + m->cs = regs->cs; +} + /* Add per CPU specific workarounds here */ -static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) +static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) { + struct mca_config *cfg = &mca_cfg; + if (c->x86_vendor == X86_VENDOR_UNKNOWN) { pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); return -EOPNOTSUPP; @@ -1282,7 +1552,7 @@ static int __cpuinit mce_cpu_quirks(stru /* This should be disabled by the BIOS, but isn't always */ if (c->x86_vendor == X86_VENDOR_AMD) { - if (c->x86 == 15 && banks > 4) { + if (c->x86 == 15 && cfg->banks > 4) { /* * disable GART TBL walk error reporting, which * trips off incorrectly with the IOMMU & 3ware @@ -1301,8 +1571,45 @@ static int __cpuinit mce_cpu_quirks(stru * Various K7s with broken bank 0 around. Always disable * by default. */ - if (c->x86 == 6 && banks > 0) + if (c->x86 == 6 && cfg->banks > 0) mce_banks[0].ctl = 0; + + /* + * Turn off MC4_MISC thresholding banks on those models since + * they're not supported there. + */ + if (c->x86 == 0x15 && + (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { + int i; + u64 val, hwcr; + bool need_toggle; + u32 msrs[] = { + 0x00000413, /* MC4_MISC0 */ + 0xc0000408, /* MC4_MISC1 */ + }; + + rdmsrl(MSR_K7_HWCR, hwcr); + + /* McStatusWrEn has to be set */ + need_toggle = !(hwcr & BIT(18)); + + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + + for (i = 0; i < ARRAY_SIZE(msrs); i++) { + rdmsrl(msrs[i], val); + + /* CntP bit set? */ + if (val & BIT_64(62)) { + val &= ~BIT_64(62); + wrmsrl(msrs[i], val); + } + } + + /* restore old settings */ + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr); + } } if (c->x86_vendor == X86_VENDOR_INTEL) { @@ -1315,7 +1622,7 @@ static int __cpuinit mce_cpu_quirks(stru * valid event later, merely don't write CTL0. */ - if (c->x86 == 6 && c->x86_model < 0x1A && banks > 0) + if (c->x86 == 6 && c->x86_model < 0x1A && cfg->banks > 0) mce_banks[0].init = 0; /* @@ -1332,6 +1639,14 @@ static int __cpuinit mce_cpu_quirks(stru */ if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0) mce_bootlog = 0; + + if (c->x86 == 6 && c->x86_model == 45) + quirk_no_way_out = quirk_sandybridge_ifu; + /* + * Westmere-EX systems can report spurious corrected errors + */ + if (c->x86 == 6 && c->x86_model == 47) + westmere = 1; } if (monarch_timeout < 0) monarch_timeout = 0; @@ -1341,7 +1656,7 @@ static int __cpuinit mce_cpu_quirks(stru return 0; } -static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) +static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c) { if (c->x86 != 5) return; @@ -1355,11 +1670,12 @@ static void __cpuinit mce_ancient_init(s } } -static void mce_cpu_features(struct cpuinfo_x86 *c) +static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) { switch (c->x86_vendor) { case X86_VENDOR_INTEL: mce_intel_feature_init(c); + mce_adjust_timer = mce_intel_adjust_timer; break; case X86_VENDOR_AMD: mce_amd_feature_init(c); @@ -1369,20 +1685,26 @@ static void mce_cpu_features(struct cpui } } -static void mce_init_timer(void) +static void mce_start_timer(unsigned int cpu, struct timer_list *t) { - struct timer_list *t = &__get_cpu_var(mce_timer); - int *n = &__get_cpu_var(mce_next_interval); + unsigned long iv = check_interval * HZ; - if (mce_ignore_ce) + if (mce_ignore_ce || !iv) return; - *n = check_interval * HZ; - if (!*n) - return; - setup_timer(t, mcheck_timer, smp_processor_id()); - t->expires = round_jiffies(jiffies + *n); - add_timer_on(t, smp_processor_id()); + per_cpu(mce_next_interval, cpu) = iv; + + t->expires = round_jiffies(jiffies + iv); + add_timer_on(t, cpu); +} + +static void __mcheck_cpu_init_timer(void) +{ + struct timer_list *t = &__get_cpu_var(mce_timer); + unsigned int cpu = smp_processor_id(); + + setup_timer(t, mce_timer_fn, cpu); + mce_start_timer(cpu, t); } /* Handle unconfigured int18 (should never happen) */ @@ -1400,27 +1722,28 @@ void (*machine_check_vector)(struct pt_r * Called for each booted CPU to set up machine checks. * Must be called with preempt off: */ -void __cpuinit mcheck_init(struct cpuinfo_x86 *c) +void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c) { if (mce_disabled) return; - mce_ancient_init(c); + __mcheck_cpu_ancient_init(c); if (!mce_available(c)) return; - if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { + if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) { mce_disabled = 1; return; } machine_check_vector = do_machine_check; - mce_init(); - mce_cpu_features(c); - mce_init_timer(); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(c); + __mcheck_cpu_init_timer(); INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); + } /* @@ -1469,7 +1792,48 @@ static void collect_tscs(void *data) rdtscll(cpu_tsc[smp_processor_id()]); } -static DEFINE_MUTEX(mce_read_mutex); +static int mce_apei_read_done; + +/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ +static int __mce_read_apei(char __user **ubuf, size_t usize) +{ + int rc; + u64 record_id; + struct mce m; + + if (usize < sizeof(struct mce)) + return -EINVAL; + + rc = apei_read_mce(&m, &record_id); + /* Error or no more MCE record */ + if (rc <= 0) { + mce_apei_read_done = 1; + /* + * When ERST is disabled, mce_chrdev_read() should return + * "no record" instead of "no device." + */ + if (rc == -ENODEV) + return 0; + return rc; + } + rc = -EFAULT; + if (copy_to_user(*ubuf, &m, sizeof(struct mce))) + return rc; + /* + * In fact, we should have cleared the record after that has + * been flushed to the disk or sent to network in + * /sbin/mcelog, but we have no interface to support that now, + * so just clear it to avoid duplication. + */ + rc = apei_clear_mce(record_id); + if (rc) { + mce_apei_read_done = 1; + return rc; + } + *ubuf += sizeof(struct mce); + + return 0; +} static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) @@ -1484,15 +1848,19 @@ static ssize_t mce_read(struct file *fil return -ENOMEM; mutex_lock(&mce_read_mutex); + + if (!mce_apei_read_done) { + err = __mce_read_apei(&buf, usize); + if (err || buf != ubuf) + goto out; + } + next = rcu_dereference(mcelog.next); /* Only supports full reads right now */ - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { - mutex_unlock(&mce_read_mutex); - kfree(cpu_tsc); - - return -EINVAL; - } + err = -EINVAL; + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) + goto out; err = 0; prev = 0; @@ -1540,10 +1908,15 @@ timeout: memset(&mcelog.entry[i], 0, sizeof(struct mce)); } } + + if (err) + err = -EFAULT; + +out: mutex_unlock(&mce_read_mutex); kfree(cpu_tsc); - return err ? -EFAULT : buf - ubuf; + return err ? err : buf - ubuf; } static unsigned int mce_poll(struct file *file, poll_table *wait) @@ -1551,6 +1924,8 @@ static unsigned int mce_poll(struct file poll_wait(file, &mce_wait, wait); if (rcu_dereference(mcelog.next)) return POLLIN | POLLRDNORM; + if (!mce_apei_read_done && apei_check_mce()) + return POLLIN | POLLRDNORM; return 0; } @@ -1596,6 +1971,25 @@ static struct miscdevice mce_log_device &mce_chrdev_ops, }; +static void __mce_disable_bank(void *arg) +{ + int bank = *((int *)arg); + __clear_bit(bank, __get_cpu_var(mce_poll_banks)); + cmci_disable_bank(bank); +} + +void mce_disable_bank(int bank) +{ + if (bank >= mca_cfg.banks) { + pr_warn(FW_BUG + "Ignoring request to disable invalid MCA bank %d.\n", + bank); + return; + } + set_bit(bank, mce_banks_ce_disabled); + on_each_cpu(__mce_disable_bank, &bank, 1); +} + /* * mce=off Disables machine check * mce=no_cmci Disables CMCI @@ -1606,9 +2000,12 @@ static struct miscdevice mce_log_device * check, or 0 to not wait * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. * mce=nobootlog Don't log MCEs from before booting. + * mce=bios_cmci_threshold Don't program the CMCI threshold */ static int __init mcheck_enable(char *str) { + struct mca_config *cfg = &mca_cfg; + if (*str == 0) { enable_p5_mce(); return 1; @@ -1620,13 +2017,15 @@ static int __init mcheck_enable(char *st else if (!strcmp(str, "no_cmci")) mce_cmci_disabled = 1; else if (!strcmp(str, "dont_log_ce")) - mce_dont_log_ce = 1; + cfg->dont_log_ce = true; else if (!strcmp(str, "ignore_ce")) mce_ignore_ce = 1; else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog")) mce_bootlog = (str[0] == 'b'); + else if (!strcmp(str, "bios_cmci_threshold")) + mce_bios_cmci_threshold = 1; else if (isdigit(str[0])) { - get_option(&str, &tolerant); + get_option(&str, &(cfg->tolerant)); if (*str == ',') { ++str; get_option(&str, &monarch_timeout); @@ -1640,6 +2039,13 @@ static int __init mcheck_enable(char *st } __setup("mce", mcheck_enable); +int __init mcheck_init(void) +{ + mcheck_intel_therm_init(); + + return 0; +} + /* * Sysfs support */ @@ -1648,11 +2054,11 @@ __setup("mce", mcheck_enable); * Disable machine checks on suspend and shutdown. We can't really handle * them later. */ -static int mce_disable(void) +static int mce_disable_error_reporting(void) { int i; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -1663,12 +2069,12 @@ static int mce_disable(void) static int mce_suspend(struct sys_device *dev, pm_message_t state) { - return mce_disable(); + return mce_disable_error_reporting(); } static int mce_shutdown(struct sys_device *dev) { - return mce_disable(); + return mce_disable_error_reporting(); } /* @@ -1678,8 +2084,8 @@ static int mce_shutdown(struct sys_devic */ static int mce_resume(struct sys_device *dev) { - mce_init(); - mce_cpu_features(¤t_cpu_data); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_vendor(¤t_cpu_data); return 0; } @@ -1689,8 +2095,8 @@ static void mce_cpu_restart(void *data) del_timer_sync(&__get_cpu_var(mce_timer)); if (!mce_available(¤t_cpu_data)) return; - mce_init(); - mce_init_timer(); + __mcheck_cpu_init_generic(); + __mcheck_cpu_init_timer(); } /* Reinit MCEs after user configuration changes */ @@ -1716,7 +2122,7 @@ static void mce_enable_ce(void *all) cmci_reenable(); cmci_recheck(); if (all) - mce_init_timer(); + __mcheck_cpu_init_timer(); } static struct sysdev_class mce_sysclass = { @@ -1835,9 +2241,9 @@ static ssize_t store_int_with_restart(st } static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); +static SYSDEV_INT_ATTR(tolerant, 0644, mca_cfg.tolerant); static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); -static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); +static SYSDEV_INT_ATTR(dont_log_ce, 0644, mca_cfg.dont_log_ce); static struct sysdev_ext_attribute attr_check_interval = { _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, @@ -1890,7 +2296,7 @@ static __cpuinit int mce_create_device(u if (err) goto error; } - for (j = 0; j < banks; j++) { + for (j = 0; j < mca_cfg.banks; j++) { err = sysdev_create_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); if (err) @@ -1904,7 +2310,7 @@ error2: sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); + sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); sysdev_unregister(&per_cpu(mce_dev, cpu)); @@ -1921,7 +2327,7 @@ static __cpuinit void mce_remove_device( for (i = 0; mce_attrs[i]; i++) sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]); - for (i = 0; i < banks; i++) + for (i = 0; i < mca_cfg.banks; i++) sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); sysdev_unregister(&per_cpu(mce_dev, cpu)); @@ -1929,16 +2335,17 @@ static __cpuinit void mce_remove_device( } /* Make sure there are no machine checks on offlined CPUs. */ -static void mce_disable_cpu(void *h) +static void __cpuinit mce_disable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; if (!mce_available(¤t_cpu_data)) return; + if (!(action & CPU_TASKS_FROZEN)) cmci_clear(); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -1946,7 +2353,7 @@ static void mce_disable_cpu(void *h) } } -static void mce_reenable_cpu(void *h) +static void __cpuinit mce_reenable_cpu(void *h) { unsigned long action = *(unsigned long *)h; int i; @@ -1956,7 +2363,7 @@ static void mce_reenable_cpu(void *h) if (!(action & CPU_TASKS_FROZEN)) cmci_reenable(); - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; if (b->init) @@ -1971,36 +2378,33 @@ mce_cpu_callback(struct notifier_block * unsigned int cpu = (unsigned long)hcpu; struct timer_list *t = &per_cpu(mce_timer, cpu); - switch (action) { + switch (action & ~CPU_TASKS_FROZEN) { case CPU_ONLINE: - case CPU_ONLINE_FROZEN: mce_create_device(cpu); if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); break; case CPU_DEAD: - case CPU_DEAD_FROZEN: if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); mce_remove_device(cpu); + mce_intel_hcpu_update(cpu); break; case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - del_timer_sync(t); smp_call_function_single(cpu, mce_disable_cpu, &action, 1); + del_timer_sync(t); break; case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - t->expires = round_jiffies(jiffies + - __get_cpu_var(mce_next_interval)); - add_timer_on(t, cpu); smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); + mce_start_timer(cpu, t); break; - case CPU_POST_DEAD: + } + + if (action == CPU_POST_DEAD) { /* intentionally ignoring frozen here */ cmci_rediscover(cpu); - break; } + return NOTIFY_OK; } @@ -2012,7 +2416,7 @@ static __init void mce_init_banks(void) { int i; - for (i = 0; i < banks; i++) { + for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; struct sysdev_attribute *a = &b->attr; @@ -2025,7 +2429,7 @@ static __init void mce_init_banks(void) } } -static __init int mce_init_device(void) +static __init int mcheck_init_device(void) { int err; int i = 0; @@ -2053,7 +2457,7 @@ static __init int mce_init_device(void) return err; } -device_initcall(mce_init_device); +device_initcall(mcheck_init_device); /* * Old style boot options parsing. Only for compatibility. @@ -2101,7 +2505,7 @@ static int fake_panic_set(void *data, u6 DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, fake_panic_set, "%llu\n"); -static int __init mce_debugfs_init(void) +static int __init mcheck_debugfs_init(void) { struct dentry *dmce, *ffake_panic; @@ -2115,5 +2519,5 @@ static int __init mce_debugfs_init(void) return 0; } -late_initcall(mce_debugfs_init); +late_initcall(mcheck_debugfs_init); #endif diff -uprN linux-2.6.32/arch/x86/kernel/cpu/mcheck/mce-inject.c linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/mce-inject.c --- linux-2.6.32/arch/x86/kernel/cpu/mcheck/mce-inject.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/mce-inject.c 2015-03-10 13:22:31.000000000 -0700 @@ -22,8 +22,10 @@ #include #include #include +#include #include #include +#include /* Update fake mce registers on current CPU. */ static void inject_mce(struct mce *m) @@ -74,7 +76,7 @@ static void raise_exception(struct mce * m->finished = 0; } -static cpumask_t mce_inject_cpumask; +static cpumask_var_t mce_inject_cpumask; static int mce_raise_notify(struct notifier_block *self, unsigned long val, void *data) @@ -82,9 +84,9 @@ static int mce_raise_notify(struct notif struct die_args *args = (struct die_args *)data; int cpu = smp_processor_id(); struct mce *m = &__get_cpu_var(injectm); - if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) + if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) return NOTIFY_DONE; - cpu_clear(cpu, mce_inject_cpumask); + cpumask_clear_cpu(cpu, mce_inject_cpumask); if (m->inject_flags & MCJ_EXCEPTION) raise_exception(m, args->regs); else if (m->status) @@ -94,7 +96,7 @@ static int mce_raise_notify(struct notif static struct notifier_block mce_raise_nb = { .notifier_call = mce_raise_notify, - .priority = 1000, + .priority = NMI_LOCAL_NORMAL_PRIOR, }; /* Inject mce on current CPU */ @@ -148,22 +150,22 @@ static void raise_mce(struct mce *m) unsigned long start; int cpu; get_online_cpus(); - mce_inject_cpumask = cpu_online_map; - cpu_clear(get_cpu(), mce_inject_cpumask); + cpumask_copy(mce_inject_cpumask, cpu_online_mask); + cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); for_each_online_cpu(cpu) { struct mce *mcpu = &per_cpu(injectm, cpu); if (!mcpu->finished || MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) - cpu_clear(cpu, mce_inject_cpumask); + cpumask_clear_cpu(cpu, mce_inject_cpumask); } - if (!cpus_empty(mce_inject_cpumask)) - apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); + if (!cpumask_empty(mce_inject_cpumask)) + apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); start = jiffies; - while (!cpus_empty(mce_inject_cpumask)) { + while (!cpumask_empty(mce_inject_cpumask)) { if (!time_before(jiffies, start + 2*HZ)) { printk(KERN_ERR "Timeout waiting for mce inject NMI %lx\n", - *cpus_addr(mce_inject_cpumask)); + *cpumask_bits(mce_inject_cpumask)); break; } cpu_relax(); @@ -210,6 +212,8 @@ static ssize_t mce_write(struct file *fi static int inject_init(void) { + if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL)) + return -ENOMEM; printk(KERN_INFO "Machine check injector initialized\n"); mce_chrdev_ops.write = mce_write; register_die_notifier(&mce_raise_nb); diff -uprN linux-2.6.32/arch/x86/kernel/cpu/mcheck/mce_intel.c linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/mce_intel.c --- linux-2.6.32/arch/x86/kernel/cpu/mcheck/mce_intel.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/mce_intel.c 2015-03-10 13:24:10.000000000 -0700 @@ -5,15 +5,19 @@ * Author: Andi Kleen */ +#include #include #include #include #include +#include #include #include #include #include +#include "mce-internal.h" + /* * Support for Intel Correct Machine Check Interrupts. This allows * the CPU to raise an interrupt when a corrected machine check happened. @@ -29,7 +33,22 @@ static DEFINE_PER_CPU(mce_banks_t, mce_b */ static DEFINE_SPINLOCK(cmci_discover_lock); -#define CMCI_THRESHOLD 1 +#define CMCI_THRESHOLD 1 +#define CMCI_POLL_INTERVAL (30 * HZ) +#define CMCI_STORM_INTERVAL (1 * HZ) +#define CMCI_STORM_THRESHOLD 15 + +static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); +static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); +static DEFINE_PER_CPU(unsigned int, cmci_storm_state); + +enum { + CMCI_STORM_NONE, + CMCI_STORM_ACTIVE, + CMCI_STORM_SUBSIDED, +}; + +static atomic_t cmci_storm_on_cpus; static int cmci_supported(int *banks) { @@ -52,6 +71,110 @@ static int cmci_supported(int *banks) return !!(cap & MCG_CMCI_P); } +void mce_intel_cmci_poll(void) +{ + if (__get_cpu_var(cmci_storm_state) == CMCI_STORM_NONE) + return; + machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); +} + +void mce_intel_hcpu_update(unsigned long cpu) +{ + if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) + atomic_dec(&cmci_storm_on_cpus); + + per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; +} + +unsigned long mce_intel_adjust_timer(unsigned long interval) +{ + int r; + unsigned int *state = &__get_cpu_var(cmci_storm_state); + + if (interval < CMCI_POLL_INTERVAL) + return interval; + + switch (*state) { + case CMCI_STORM_ACTIVE: + /* + * We switch back to interrupt mode once the poll timer has + * silenced itself. That means no events recorded and the + * timer interval is back to our poll interval. + */ + *state = CMCI_STORM_SUBSIDED; + r = atomic_sub_return(1, &cmci_storm_on_cpus); + if (r == 0) + pr_notice("CMCI storm subsided: switching to interrupt mode\n"); + /* FALLTHROUGH */ + + case CMCI_STORM_SUBSIDED: + /* + * We wait for all cpus to go back to SUBSIDED + * state. When that happens we switch back to + * interrupt mode. + */ + if (!atomic_read(&cmci_storm_on_cpus)) { + *state = CMCI_STORM_NONE; + cmci_reenable(); + cmci_recheck(); + } + return CMCI_POLL_INTERVAL; + default: + /* + * We have shiny weather. Let the poll do whatever it + * thinks. + */ + return interval; + } +} + +static void cmci_storm_disable_banks(void) +{ + unsigned long flags, *owned; + int bank; + u64 val; + + spin_lock_irqsave(&cmci_discover_lock, flags); + owned = __get_cpu_var(mce_banks_owned); + for_each_set_bit(bank, owned, MAX_NR_BANKS) { + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + } + spin_unlock_irqrestore(&cmci_discover_lock, flags); +} + +static bool cmci_storm_detect(void) +{ + unsigned int *state = &__get_cpu_var(cmci_storm_state); + unsigned int *cnt = &__get_cpu_var(cmci_storm_cnt); + unsigned long *ts = &__get_cpu_var(cmci_time_stamp); + unsigned long now = jiffies; + int r; + + if (*state != CMCI_STORM_NONE) + return true; + + if (time_before_eq(now, *ts + CMCI_STORM_INTERVAL)) { + (*cnt)++; + } else { + *cnt = 1; + *ts = now; + } + + if (*cnt <= CMCI_STORM_THRESHOLD) + return false; + + cmci_storm_disable_banks(); + *state = CMCI_STORM_ACTIVE; + r = atomic_add_return(1, &cmci_storm_on_cpus); + mce_timer_kick(CMCI_POLL_INTERVAL); + + if (r == 1) + pr_notice("CMCI storm detected: switching to poll mode\n"); + return true; +} + /* * The interrupt handler. This is called on every event. * Just call the poller directly to log any events. @@ -60,63 +183,86 @@ static int cmci_supported(int *banks) */ static void intel_threshold_interrupt(void) { + if (cmci_storm_detect()) + return; machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); mce_notify_irq(); } -static void print_update(char *type, int *hdr, int num) -{ - if (*hdr == 0) - printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); - *hdr = 1; - printk(KERN_CONT " %s:%d", type, num); -} - /* * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks * on this CPU. Use the algorithm recommended in the SDM to discover shared * banks. */ -static void cmci_discover(int banks, int boot) +static void cmci_discover(int banks) { unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned); unsigned long flags; - int hdr = 0; int i; + int bios_wrong_thresh = 0; spin_lock_irqsave(&cmci_discover_lock, flags); for (i = 0; i < banks; i++) { u64 val; + int bios_zero_thresh = 0; if (test_bit(i, owned)) continue; + /* Skip banks in firmware first mode */ + if (test_bit(i, mce_banks_ce_disabled)) + continue; + rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ - if (val & CMCI_EN) { - if (test_and_clear_bit(i, owned) || boot) - print_update("SHD", &hdr, i); + if (val & MCI_CTL2_CMCI_EN) { + clear_bit(i, owned); __clear_bit(i, __get_cpu_var(mce_poll_banks)); continue; } - val |= CMCI_EN | CMCI_THRESHOLD; + if (!mce_bios_cmci_threshold) { + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + val |= CMCI_THRESHOLD; + } else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { + /* + * If bios_cmci_threshold boot option was specified + * but the threshold is zero, we'll try to initialize + * it to 1. + */ + bios_zero_thresh = 1; + val |= CMCI_THRESHOLD; + } + + val |= MCI_CTL2_CMCI_EN; wrmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? -- the bank supports CMCI */ - if (val & CMCI_EN) { - if (!test_and_set_bit(i, owned) || boot) - print_update("CMCI", &hdr, i); + if (val & MCI_CTL2_CMCI_EN) { + set_bit(i, owned); __clear_bit(i, __get_cpu_var(mce_poll_banks)); + /* + * We are able to set thresholds for some banks that + * had a threshold of 0. This means the BIOS has not + * set the thresholds properly or does not work with + * this boot option. Note down now and report later. + */ + if (mce_bios_cmci_threshold && bios_zero_thresh && + (val & MCI_CTL2_CMCI_THRESHOLD_MASK)) + bios_wrong_thresh = 1; } else { WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks))); } } spin_unlock_irqrestore(&cmci_discover_lock, flags); - if (hdr) - printk(KERN_CONT "\n"); + if (mce_bios_cmci_threshold && bios_wrong_thresh) { + pr_info_once( + "bios_cmci_threshold: Some banks do not have valid thresholds set\n"); + pr_info_once( + "bios_cmci_threshold: Make sure your BIOS supports this boot option\n"); + } } /* @@ -135,6 +281,19 @@ void cmci_recheck(void) local_irq_restore(flags); } +/* Caller must hold the lock on cmci_discover_lock */ +static void __cmci_disable_bank(int bank) +{ + u64 val; + + if (!test_bit(bank, __get_cpu_var(mce_banks_owned))) + return; + rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + val &= ~MCI_CTL2_CMCI_EN; + wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + __clear_bit(bank, __get_cpu_var(mce_banks_owned)); +} + /* * Disable CMCI on this CPU for all banks it owns when it goes down. * This allows other CPUs to claim the banks on rediscovery. @@ -144,20 +303,12 @@ void cmci_clear(void) unsigned long flags; int i; int banks; - u64 val; if (!cmci_supported(&banks)) return; spin_lock_irqsave(&cmci_discover_lock, flags); - for (i = 0; i < banks; i++) { - if (!test_bit(i, __get_cpu_var(mce_banks_owned))) - continue; - /* Disable CMCI */ - rdmsrl(MSR_IA32_MCx_CTL2(i), val); - val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); - wrmsrl(MSR_IA32_MCx_CTL2(i), val); - __clear_bit(i, __get_cpu_var(mce_banks_owned)); - } + for (i = 0; i < banks; i++) + __cmci_disable_bank(i); spin_unlock_irqrestore(&cmci_discover_lock, flags); } @@ -184,7 +335,7 @@ void cmci_rediscover(int dying) continue; /* Recheck banks in case CPUs don't all have the same */ if (cmci_supported(&banks)) - cmci_discover(banks, 0); + cmci_discover(banks); } set_cpus_allowed_ptr(current, old); @@ -198,7 +349,20 @@ void cmci_reenable(void) { int banks; if (cmci_supported(&banks)) - cmci_discover(banks, 0); + cmci_discover(banks); +} + +void cmci_disable_bank(int bank) +{ + int banks; + unsigned long flags; + + if (!cmci_supported(&banks)) + return; + + spin_lock_irqsave(&cmci_discover_lock, flags); + __cmci_disable_bank(bank); + spin_unlock_irqrestore(&cmci_discover_lock, flags); } static void intel_init_cmci(void) @@ -209,7 +373,7 @@ static void intel_init_cmci(void) return; mce_threshold_vector = intel_threshold_interrupt; - cmci_discover(banks, 1); + cmci_discover(banks); /* * For CPU #0 this runs with still disabled APIC, but that's * ok because only the vector is set up. We still do another diff -uprN linux-2.6.32/arch/x86/kernel/cpu/mcheck/mce-internal.h linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/mce-internal.h --- linux-2.6.32/arch/x86/kernel/cpu/mcheck/mce-internal.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/mce-internal.h 2015-03-10 13:24:10.000000000 -0700 @@ -27,4 +27,42 @@ struct dentry *mce_get_debugfs_dir(void) extern int mce_ser; extern struct mce_bank *mce_banks; +extern mce_banks_t mce_banks_ce_disabled; +#ifdef CONFIG_X86_MCE_INTEL +unsigned long mce_intel_adjust_timer(unsigned long interval); +void mce_intel_cmci_poll(void); +void mce_intel_hcpu_update(unsigned long cpu); +void cmci_disable_bank(int bank); +#else +# define mce_intel_adjust_timer mce_adjust_timer_default +static inline void mce_intel_cmci_poll(void) { } +static inline void mce_intel_hcpu_update(unsigned long cpu) { } +static inline void cmci_disable_bank(int bank) { } +#endif + +void mce_timer_kick(unsigned long interval); + +#ifdef CONFIG_ACPI_APEI +int apei_write_mce(struct mce *m); +ssize_t apei_read_mce(struct mce *m, u64 *record_id); +int apei_check_mce(void); +int apei_clear_mce(u64 record_id); +#else +static inline int apei_write_mce(struct mce *m) +{ + return -EINVAL; +} +static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id) +{ + return 0; +} +static inline int apei_check_mce(void) +{ + return 0; +} +static inline int apei_clear_mce(u64 record_id) +{ + return -EINVAL; +} +#endif diff -uprN linux-2.6.32/arch/x86/kernel/cpu/mcheck/mce-severity.c linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/mce-severity.c --- linux-2.6.32/arch/x86/kernel/cpu/mcheck/mce-severity.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/mce-severity.c 2015-03-10 13:24:00.000000000 -0700 @@ -43,84 +43,155 @@ static struct severity { unsigned char covered; char *msg; } severities[] = { -#define KERNEL .context = IN_KERNEL -#define USER .context = IN_USER -#define SER .ser = SER_REQUIRED -#define NOSER .ser = NO_SER -#define SEV(s) .sev = MCE_ ## s ## _SEVERITY -#define BITCLR(x, s, m, r...) { .mask = x, .result = 0, SEV(s), .msg = m, ## r } -#define BITSET(x, s, m, r...) { .mask = x, .result = x, SEV(s), .msg = m, ## r } -#define MCGMASK(x, res, s, m, r...) \ - { .mcgmask = x, .mcgres = res, SEV(s), .msg = m, ## r } -#define MASK(x, y, s, m, r...) \ - { .mask = x, .result = y, SEV(s), .msg = m, ## r } +#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c } +#define KERNEL .context = IN_KERNEL +#define USER .context = IN_USER +#define SER .ser = SER_REQUIRED +#define NOSER .ser = NO_SER +#define BITCLR(x) .mask = x, .result = 0 +#define BITSET(x) .mask = x, .result = x +#define MCGMASK(x, y) .mcgmask = x, .mcgres = y +#define MASK(x, y) .mask = x, .result = y #define MCI_UC_S (MCI_STATUS_UC|MCI_STATUS_S) #define MCI_UC_SAR (MCI_STATUS_UC|MCI_STATUS_S|MCI_STATUS_AR) -#define MCACOD 0xffff +#define MCI_ADDR (MCI_STATUS_ADDRV|MCI_STATUS_MISCV) - BITCLR(MCI_STATUS_VAL, NO, "Invalid"), - BITCLR(MCI_STATUS_EN, NO, "Not enabled"), - BITSET(MCI_STATUS_PCC, PANIC, "Processor context corrupt"), + MCESEV( + NO, "Invalid", + BITCLR(MCI_STATUS_VAL) + ), + MCESEV( + NO, "Not enabled", + BITCLR(MCI_STATUS_EN) + ), + MCESEV( + PANIC, "Processor context corrupt", + BITSET(MCI_STATUS_PCC) + ), /* When MCIP is not set something is very confused */ - MCGMASK(MCG_STATUS_MCIP, 0, PANIC, "MCIP not set in MCA handler"), + MCESEV( + PANIC, "MCIP not set in MCA handler", + MCGMASK(MCG_STATUS_MCIP, 0) + ), /* Neither return not error IP -- no chance to recover -> PANIC */ - MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0, PANIC, - "Neither restart nor error IP"), - MCGMASK(MCG_STATUS_RIPV, 0, PANIC, "In kernel and no restart IP", - KERNEL), - BITCLR(MCI_STATUS_UC, KEEP, "Corrected error", NOSER), - MASK(MCI_STATUS_OVER|MCI_STATUS_UC|MCI_STATUS_EN, MCI_STATUS_UC, SOME, - "Spurious not enabled", SER), + MCESEV( + PANIC, "Neither restart nor error IP", + MCGMASK(MCG_STATUS_RIPV|MCG_STATUS_EIPV, 0) + ), + MCESEV( + PANIC, "In kernel and no restart IP", + KERNEL, MCGMASK(MCG_STATUS_RIPV, 0) + ), + MCESEV( + KEEP, "Corrected error", + NOSER, BITCLR(MCI_STATUS_UC) + ), /* ignore OVER for UCNA */ - MASK(MCI_UC_SAR, MCI_STATUS_UC, KEEP, - "Uncorrected no action required", SER), - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR, PANIC, - "Illegal combination (UCNA with AR=1)", SER), - MASK(MCI_STATUS_S, 0, KEEP, "Non signalled machine check", SER), - - /* AR add known MCACODs here */ - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_SAR, PANIC, - "Action required with lost events", SER), - MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_SAR, PANIC, - "Action required; unknown MCACOD", SER), + MCESEV( + KEEP, "Uncorrected no action required", + SER, MASK(MCI_UC_SAR, MCI_STATUS_UC) + ), + MCESEV( + PANIC, "Illegal combination (UCNA with AR=1)", + SER, + MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_UC|MCI_STATUS_AR) + ), + MCESEV( + KEEP, "Non signalled machine check", + SER, BITCLR(MCI_STATUS_S) + ), + + MCESEV( + PANIC, "Action required with lost events", + SER, BITSET(MCI_STATUS_OVER|MCI_UC_SAR) + ), + + /* known AR MCACODs: */ +#ifdef CONFIG_MEMORY_FAILURE + MCESEV( + KEEP, "HT thread notices Action required: data load error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + MCGMASK(MCG_STATUS_EIPV, 0) + ), + MCESEV( + AR, "Action required: data load error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_DATA), + USER + ), + MCESEV( + KEEP, "HT thread notices Action required: instruction fetch error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + MCGMASK(MCG_STATUS_EIPV, 0) + ), + MCESEV( + AR, "Action required: instruction fetch error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCI_ADDR|MCACOD, MCI_UC_SAR|MCI_ADDR|MCACOD_INSTR), + USER + ), +#endif + MCESEV( + PANIC, "Action required: unknown MCACOD", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_SAR) + ), /* known AO MCACODs: */ - MASK(MCI_UC_SAR|MCI_STATUS_OVER|0xfff0, MCI_UC_S|0xc0, AO, - "Action optional: memory scrubbing error", SER), - MASK(MCI_UC_SAR|MCI_STATUS_OVER|MCACOD, MCI_UC_S|0x17a, AO, - "Action optional: last level cache writeback error", SER), - - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S, SOME, - "Action optional unknown MCACOD", SER), - MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S|MCI_STATUS_OVER, SOME, - "Action optional with lost events", SER), - BITSET(MCI_STATUS_UC|MCI_STATUS_OVER, PANIC, "Overflowed uncorrected"), - BITSET(MCI_STATUS_UC, UC, "Uncorrected"), - BITSET(0, SOME, "No match") /* always matches. keep at end */ + MCESEV( + AO, "Action optional: memory scrubbing error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD_SCRUBMSK, MCI_UC_S|MCACOD_SCRUB) + ), + MCESEV( + AO, "Action optional: last level cache writeback error", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR|MCACOD, MCI_UC_S|MCACOD_L3WB) + ), + MCESEV( + SOME, "Action optional: unknown MCACOD", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_UC_S) + ), + MCESEV( + SOME, "Action optional with lost events", + SER, MASK(MCI_STATUS_OVER|MCI_UC_SAR, MCI_STATUS_OVER|MCI_UC_S) + ), + + MCESEV( + PANIC, "Overflowed uncorrected", + BITSET(MCI_STATUS_OVER|MCI_STATUS_UC) + ), + MCESEV( + UC, "Uncorrected", + BITSET(MCI_STATUS_UC) + ), + MCESEV( + SOME, "No match", + BITSET(0) + ) /* always matches. keep at end */ }; /* - * If the EIPV bit is set, it means the saved IP is the - * instruction which caused the MCE. + * If mcgstatus indicated that ip/cs on the stack were + * no good, then "m->cs" will be zero and we will have + * to assume the worst case (IN_KERNEL) as we actually + * have no idea what we were executing when the machine + * check hit. + * If we do have a good "m->cs" (or a faked one in the + * case we were executing in VM86 mode) we can use it to + * distinguish an exception taken in user from from one + * taken in the kernel. */ static int error_context(struct mce *m) { - if (m->mcgstatus & MCG_STATUS_EIPV) - return (m->ip && (m->cs & 3) == 3) ? IN_USER : IN_KERNEL; - /* Unknown, assume kernel */ - return IN_KERNEL; + return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; } -int mce_severity(struct mce *a, int tolerant, char **msg) +int mce_severity(struct mce *m, int tolerant, char **msg) { - enum context ctx = error_context(a); + enum context ctx = error_context(m); struct severity *s; for (s = severities;; s++) { - if ((a->status & s->mask) != s->result) + if ((m->status & s->mask) != s->result) continue; - if ((a->mcgstatus & s->mcgmask) != s->mcgres) + if ((m->mcgstatus & s->mcgmask) != s->mcgres) continue; if (s->ser == SER_REQUIRED && !mce_ser) continue; @@ -196,15 +267,15 @@ static const struct file_operations seve static int __init severities_debugfs_init(void) { - struct dentry *dmce = NULL, *fseverities_coverage = NULL; + struct dentry *dmce, *fsev; dmce = mce_get_debugfs_dir(); - if (dmce == NULL) + if (!dmce) goto err_out; - fseverities_coverage = debugfs_create_file("severities-coverage", - 0444, dmce, NULL, - &severities_coverage_fops); - if (fseverities_coverage == NULL) + + fsev = debugfs_create_file("severities-coverage", 0444, dmce, NULL, + &severities_coverage_fops); + if (!fsev) goto err_out; return 0; @@ -213,4 +284,4 @@ err_out: return -ENOMEM; } late_initcall(severities_debugfs_init); -#endif +#endif /* CONFIG_DEBUG_FS */ diff -uprN linux-2.6.32/arch/x86/kernel/cpu/mcheck/therm_throt.c linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/therm_throt.c --- linux-2.6.32/arch/x86/kernel/cpu/mcheck/therm_throt.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/therm_throt.c 2015-03-10 13:24:02.000000000 -0700 @@ -30,32 +30,47 @@ #include #include #include +#include /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) +#define THERMAL_THROTTLING_EVENT 0 +#define POWER_LIMIT_EVENT 1 + /* - * Current thermal throttling state: + * Current thermal event state: */ -struct thermal_state { - bool is_throttled; - +struct _thermal_state { + bool new_event; + int event; u64 next_check; - unsigned long throttle_count; - unsigned long last_throttle_count; + unsigned long count; + unsigned long last_count; +}; + +struct thermal_state { + struct _thermal_state core_throttle; + struct _thermal_state core_power_limit; + struct _thermal_state package_throttle; + struct _thermal_state package_power_limit; }; static DEFINE_PER_CPU(struct thermal_state, thermal_state); static atomic_t therm_throt_en = ATOMIC_INIT(0); +static u32 lvtthmr_init __read_mostly; + #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ - static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) + static SYSDEV_ATTR(_name, 0444, \ + therm_throt_sysdev_show_##_name, \ + NULL) \ -#define define_therm_throt_sysdev_show_func(name) \ +#define define_therm_throt_sysdev_show_func(event, name) \ \ -static ssize_t therm_throt_sysdev_show_##name( \ +static ssize_t therm_throt_sysdev_show_##event##_##name( \ struct sys_device *dev, \ struct sysdev_attribute *attr, \ char *buf) \ @@ -64,30 +79,42 @@ static ssize_t therm_throt_sysdev_show_# ssize_t ret; \ \ preempt_disable(); /* CPU hotplug */ \ - if (cpu_online(cpu)) \ + if (cpu_online(cpu)) { \ ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_state, cpu).name); \ - else \ + per_cpu(thermal_state, cpu).event.name); \ + } else \ ret = 0; \ preempt_enable(); \ \ return ret; \ } -define_therm_throt_sysdev_show_func(throttle_count); -define_therm_throt_sysdev_one_ro(throttle_count); +define_therm_throt_sysdev_show_func(core_throttle, count); +define_therm_throt_sysdev_one_ro(core_throttle_count); + +define_therm_throt_sysdev_show_func(core_power_limit, count); +define_therm_throt_sysdev_one_ro(core_power_limit_count); + +define_therm_throt_sysdev_show_func(package_throttle, count); +define_therm_throt_sysdev_one_ro(package_throttle_count); + +define_therm_throt_sysdev_show_func(package_power_limit, count); +define_therm_throt_sysdev_one_ro(package_power_limit_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_throttle_count.attr, + &attr_core_throttle_count.attr, NULL }; -static struct attribute_group thermal_throttle_attr_group = { +static struct attribute_group thermal_attr_group = { .attrs = thermal_throttle_attrs, .name = "thermal_throttle" }; #endif /* CONFIG_SYSFS */ +#define CORE_LEVEL 0 +#define PACKAGE_LEVEL 1 + /*** * therm_throt_process - Process thermal throttling event from interrupt * @curr: Whether the condition is current or not (boolean), since the @@ -104,56 +131,106 @@ static struct attribute_group thermal_th * 1 : Event should be logged further, and a message has been * printed to the syslog. */ -static int therm_throt_process(bool is_throttled) +static int therm_throt_process(bool new_event, int event, int level) { - struct thermal_state *state; - unsigned int this_cpu; - bool was_throttled; + struct _thermal_state *state; + unsigned int this_cpu = smp_processor_id(); + bool old_event; u64 now; + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); - this_cpu = smp_processor_id(); now = get_jiffies_64(); - state = &per_cpu(thermal_state, this_cpu); + if (level == CORE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->core_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->core_power_limit; + else + return 0; + } else if (level == PACKAGE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->package_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->package_power_limit; + else + return 0; + } else + return 0; - was_throttled = state->is_throttled; - state->is_throttled = is_throttled; + old_event = state->new_event; + state->new_event = new_event; - if (is_throttled) - state->throttle_count++; + if (new_event) + state->count++; if (time_before64(now, state->next_check) && - state->throttle_count != state->last_throttle_count) + state->count != state->last_count) return 0; state->next_check = now + CHECK_INTERVAL; - state->last_throttle_count = state->throttle_count; + state->last_count = state->count; /* if we just entered the thermal event */ - if (is_throttled) { - printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); - - add_taint(TAINT_MACHINE_CHECK); + if (new_event) { + if (event == THERMAL_THROTTLING_EVENT) + printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package", + state->count); return 1; } - if (was_throttled) { - printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); + if (old_event) { + if (event == THERMAL_THROTTLING_EVENT) + printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package"); return 1; } return 0; } +static bool int_pln_enable; +static int __init int_pln_enable_setup(char *s) +{ + int_pln_enable = true; + + return 1; +} +__setup("int_pln_enable", int_pln_enable_setup); + #ifdef CONFIG_SYSFS /* Add/Remove thermal_throttle interface for CPU device: */ -static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) +static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, + unsigned int cpu) { - return sysfs_create_group(&sys_dev->kobj, - &thermal_throttle_attr_group); + int err; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); + if (err) + return err; + + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_core_power_limit_count.attr, + thermal_attr_group.name); + if (cpu_has(c, X86_FEATURE_PTS)) { + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_package_throttle_count.attr, + thermal_attr_group.name); + if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_package_power_limit_count.attr, + thermal_attr_group.name); + } + + return err; } static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) { - sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); + sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); } /* Mutex protecting device creation against CPU hotplug: */ @@ -175,7 +252,7 @@ thermal_throttle_cpu_callback(struct not case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: mutex_lock(&therm_cpu_lock); - err = thermal_throttle_add_dev(sys_dev); + err = thermal_throttle_add_dev(sys_dev, cpu); mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; @@ -188,7 +265,7 @@ thermal_throttle_cpu_callback(struct not mutex_unlock(&therm_cpu_lock); break; } - return err ? NOTIFY_BAD : NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = @@ -211,7 +288,7 @@ static __init int thermal_throttle_init_ #endif /* connect live CPUs to sysfs */ for_each_online_cpu(cpu) { - err = thermal_throttle_add_dev(get_cpu_sysdev(cpu)); + err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu); WARN_ON(err); } #ifdef CONFIG_HOTPLUG_CPU @@ -230,28 +307,79 @@ static void intel_thermal_interrupt(void __u64 msr_val; rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) + + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, + CORE_LEVEL) != 0) mce_log_therm_throt_event(msr_val); + + if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) + therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + CORE_LEVEL); + + if (this_cpu_has(X86_FEATURE_PTS)) { + rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); + therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, + PACKAGE_LEVEL); + if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable) + therm_throt_process(msr_val & + PACKAGE_THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + PACKAGE_LEVEL); + } } static void unexpected_thermal_interrupt(void) { - printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", + printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", smp_processor_id()); - add_taint(TAINT_MACHINE_CHECK); } static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; -asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +static inline void __smp_thermal_interrupt(void) { - exit_idle(); - irq_enter(); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); - irq_exit(); - /* Ack only at the end to avoid potential reentry */ - ack_APIC_irq(); +} + +asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) +{ + entering_irq(); + __smp_thermal_interrupt(); + exiting_ack_irq(); +} + +asmlinkage void smp_trace_thermal_interrupt(struct pt_regs *regs) +{ + entering_irq(); + trace_thermal_apic_entry(THERMAL_APIC_VECTOR); + __smp_thermal_interrupt(); + trace_thermal_apic_exit(THERMAL_APIC_VECTOR); + exiting_ack_irq(); +} + +/* Thermal monitoring depends on APIC, ACPI and clock modulation */ +static int intel_thermal_supported(struct cpuinfo_x86 *c) +{ + if (!cpu_has_apic) + return 0; + if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + return 0; + return 1; +} + +void __init mcheck_intel_therm_init(void) +{ + /* + * This function is only called on boot CPU. Save the init thermal + * LVT value on BSP and use that value to restore APs' thermal LVT + * entry BIOS programmed later + */ + if (intel_thermal_supported(&boot_cpu_data)) + lvtthmr_init = apic_read(APIC_LVTTHMR); } void intel_init_thermal(struct cpuinfo_x86 *c) @@ -260,8 +388,7 @@ void intel_init_thermal(struct cpuinfo_x int tm2 = 0; u32 l, h; - /* Thermal monitoring depends on ACPI and clock modulation*/ - if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC)) + if (!intel_thermal_supported(c)) return; /* @@ -270,7 +397,22 @@ void intel_init_thermal(struct cpuinfo_x * since it might be delivered via SMI already: */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); - h = apic_read(APIC_LVTTHMR); + + h = lvtthmr_init; + /* + * The initial value of thermal LVT entries on all APs always reads + * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI + * sequence to them and LVT registers are reset to 0s except for + * the mask bits which are set to 1s when APs receive INIT IPI. + * If BIOS takes over the thermal interrupt and sets its interrupt + * delivery mode to SMI (not fixed), it restores the value that the + * BIOS has programmed on AP based on BSP's info we saved since BIOS + * is always setting the same value for all threads/cores. + */ + if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED) + apic_write(APIC_LVTTHMR, lvtthmr_init); + + if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { printk(KERN_DEBUG "CPU%d: Thermal monitoring handled by SMI\n", cpu); @@ -300,8 +442,35 @@ void intel_init_thermal(struct cpuinfo_x apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) + wrmsr(MSR_IA32_THERM_INTERRUPT, + (l | (THERM_INT_LOW_ENABLE + | THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h); + else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE + | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + + if (cpu_has(c, X86_FEATURE_PTS)) { + rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); + if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable) + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + (l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE)) + & ~PACKAGE_THERM_INT_PLN_ENABLE, h); + else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE + | PACKAGE_THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE), h); + } smp_thermal_vector = intel_thermal_interrupt; @@ -312,8 +481,8 @@ void intel_init_thermal(struct cpuinfo_x l = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); - printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", - cpu, tm2 ? "TM2" : "TM1"); + printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n", + tm2 ? "TM2" : "TM1"); /* enable thermal throttle processing */ atomic_set(&therm_throt_en, 1); diff -uprN linux-2.6.32/arch/x86/kernel/cpu/mcheck/threshold.c linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/threshold.c --- linux-2.6.32/arch/x86/kernel/cpu/mcheck/threshold.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/mcheck/threshold.c 2015-03-10 13:24:02.000000000 -0700 @@ -8,6 +8,7 @@ #include #include #include +#include static void default_threshold_interrupt(void) { @@ -17,13 +18,24 @@ static void default_threshold_interrupt( void (*mce_threshold_vector)(void) = default_threshold_interrupt; -asmlinkage void smp_threshold_interrupt(void) +static inline void __smp_threshold_interrupt(void) { - exit_idle(); - irq_enter(); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); - irq_exit(); - /* Ack only at the end to avoid potential reentry */ - ack_APIC_irq(); +} + +asmlinkage void smp_threshold_interrupt(void) +{ + entering_irq(); + __smp_threshold_interrupt(); + exiting_ack_irq(); +} + +asmlinkage void smp_trace_threshold_interrupt(void) +{ + entering_irq(); + trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); + __smp_threshold_interrupt(); + trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); + exiting_ack_irq(); } diff -uprN linux-2.6.32/arch/x86/kernel/cpu/mkcapflags.pl linux-2.6.32.ovz/arch/x86/kernel/cpu/mkcapflags.pl --- linux-2.6.32/arch/x86/kernel/cpu/mkcapflags.pl 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/mkcapflags.pl 2015-03-10 13:22:35.000000000 -0700 @@ -9,7 +9,7 @@ open(IN, "< $in\0") or die "$0: cannot open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n"; print OUT "#include \n\n"; -print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; +print OUT "const char * const x86_cap_flags[RHNCAPINTS*32] = {\n"; while (defined($line = )) { if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) { diff -uprN linux-2.6.32/arch/x86/kernel/cpu/mshyperv.c linux-2.6.32.ovz/arch/x86/kernel/cpu/mshyperv.c --- linux-2.6.32/arch/x86/kernel/cpu/mshyperv.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/mshyperv.c 2015-03-10 13:24:12.000000000 -0700 @@ -0,0 +1,156 @@ +/* + * HyperV Detection code. + * + * Copyright (C) 2010, Novell, Inc. + * Author : K. Y. Srinivasan + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct ms_hyperv_info ms_hyperv; +EXPORT_SYMBOL_GPL(ms_hyperv); + +static bool __init ms_hyperv_platform(void) +{ + u32 eax; + u32 hyp_signature[3]; + + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return false; + + /* + * Xen and KVM emulates Hyper-V to support enlightened Windows. + * Check to see first if we are on a Xen or KVM Hypervisor. + */ + if (xen_cpuid_base() || kvm_para_available()) + return false; + + cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, + &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); + + return eax >= HYPERV_CPUID_MIN && + eax <= HYPERV_CPUID_MAX && + !memcmp("Microsoft Hv", hyp_signature, 12); +} + +static cycle_t read_hv_clock(struct clocksource *arg) +{ + cycle_t current_tick; + /* + * Read the partition counter to get the current tick count. This count + * is set to 0 when the partition is created and is incremented in + * 100 nanosecond units. + */ + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); + return current_tick; +} + +static struct clocksource hyperv_cs = { + .name = "hyperv_clocksource", + .rating = 400, /* use this when running on Hyperv*/ + .read = read_hv_clock, + .mask = CLOCKSOURCE_MASK(64), +}; + +static void __init ms_hyperv_init_platform(void) +{ + u64 hv_lapic_frequency; + + /* + * Extract the features and hints + */ + ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); + ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); + + printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", + ms_hyperv.features, ms_hyperv.hints); + +#ifdef CONFIG_X86_LOCAL_APIC + if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) { + /* + * Get the APIC frequency. + */ + rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); + hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); + lapic_timer_frequency = hv_lapic_frequency; + printk(KERN_INFO "HyperV: LAPIC Timer Frequency: %#x\n", + lapic_timer_frequency); + + } +#endif + + if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) + clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); + +#ifdef CONFIG_X86_IO_APIC + no_timer_check = 1; +#endif +} + +const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { + .name = "Microsoft HyperV", + .detect = ms_hyperv_platform, + .init_platform = ms_hyperv_init_platform, +}; +EXPORT_SYMBOL(x86_hyper_ms_hyperv); + +#if defined(CONFIG_HYPERV) || defined(CONFIG_HYPERV_MODULE) +static int vmbus_irq = -1; +static irq_handler_t vmbus_isr; + +void hv_register_vmbus_handler(int irq, irq_handler_t handler) +{ + /* + * Setup the IDT for hypervisor callback. + */ + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); + + vmbus_irq = irq; + vmbus_isr = handler; +} + +void hyperv_vector_handler(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + struct irq_desc *desc; + + irq_enter(); + exit_idle(); + + desc = irq_to_desc(vmbus_irq); + + if (desc) + generic_handle_irq_desc(vmbus_irq, desc); + + irq_exit(); + set_irq_regs(old_regs); +} +#else +void hv_register_vmbus_handler(int irq, irq_handler_t handler) +{ +} +#endif +EXPORT_SYMBOL_GPL(hv_register_vmbus_handler); diff -uprN linux-2.6.32/arch/x86/kernel/cpu/mtrr/cleanup.c linux-2.6.32.ovz/arch/x86/kernel/cpu/mtrr/cleanup.c --- linux-2.6.32/arch/x86/kernel/cpu/mtrr/cleanup.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/mtrr/cleanup.c 2015-03-10 13:23:52.000000000 -0700 @@ -22,10 +22,10 @@ #include #include #include -#include #include #include #include +#include #include #include @@ -34,11 +34,6 @@ #include "mtrr.h" -struct res_range { - unsigned long start; - unsigned long end; -}; - struct var_mtrr_range_state { unsigned long base_pfn; unsigned long size_pfn; @@ -56,7 +51,7 @@ struct var_mtrr_state { /* Should be related to MTRR_VAR_RANGES nums */ #define RANGE_NUM 256 -static struct res_range __initdata range[RANGE_NUM]; +static struct range __initdata range[RANGE_NUM]; static int __initdata nr_range; static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; @@ -64,117 +59,11 @@ static struct var_mtrr_range_state __ini static int __initdata debug_print; #define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) - -static int __init -add_range(struct res_range *range, int nr_range, - unsigned long start, unsigned long end) -{ - /* Out of slots: */ - if (nr_range >= RANGE_NUM) - return nr_range; - - range[nr_range].start = start; - range[nr_range].end = end; - - nr_range++; - - return nr_range; -} - -static int __init -add_range_with_merge(struct res_range *range, int nr_range, - unsigned long start, unsigned long end) -{ - int i; - - /* Try to merge it with old one: */ - for (i = 0; i < nr_range; i++) { - unsigned long final_start, final_end; - unsigned long common_start, common_end; - - if (!range[i].end) - continue; - - common_start = max(range[i].start, start); - common_end = min(range[i].end, end); - if (common_start > common_end + 1) - continue; - - final_start = min(range[i].start, start); - final_end = max(range[i].end, end); - - range[i].start = final_start; - range[i].end = final_end; - return nr_range; - } - - /* Need to add it: */ - return add_range(range, nr_range, start, end); -} - -static void __init -subtract_range(struct res_range *range, unsigned long start, unsigned long end) -{ - int i, j; - - for (j = 0; j < RANGE_NUM; j++) { - if (!range[j].end) - continue; - - if (start <= range[j].start && end >= range[j].end) { - range[j].start = 0; - range[j].end = 0; - continue; - } - - if (start <= range[j].start && end < range[j].end && - range[j].start < end + 1) { - range[j].start = end + 1; - continue; - } - - - if (start > range[j].start && end >= range[j].end && - range[j].end > start - 1) { - range[j].end = start - 1; - continue; - } - - if (start > range[j].start && end < range[j].end) { - /* Find the new spare: */ - for (i = 0; i < RANGE_NUM; i++) { - if (range[i].end == 0) - break; - } - if (i < RANGE_NUM) { - range[i].end = range[j].end; - range[i].start = end + 1; - } else { - printk(KERN_ERR "run of slot in ranges\n"); - } - range[j].end = start - 1; - continue; - } - } -} - -static int __init cmp_range(const void *x1, const void *x2) -{ - const struct res_range *r1 = x1; - const struct res_range *r2 = x2; - long start1, start2; - - start1 = r1->start; - start2 = r2->start; - - return start1 - start2; -} - #define BIOS_BUG_MSG KERN_WARNING \ "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" static int __init -x86_get_mtrr_mem_range(struct res_range *range, int nr_range, +x86_get_mtrr_mem_range(struct range *range, int nr_range, unsigned long extra_remove_base, unsigned long extra_remove_size) { @@ -188,13 +77,13 @@ x86_get_mtrr_mem_range(struct res_range continue; base = range_state[i].base_pfn; size = range_state[i].size_pfn; - nr_range = add_range_with_merge(range, nr_range, base, - base + size - 1); + nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, + base, base + size - 1); } if (debug_print) { printk(KERN_DEBUG "After WB checking\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end + 1); } @@ -217,45 +106,37 @@ x86_get_mtrr_mem_range(struct res_range size -= (1<<(20-PAGE_SHIFT)) - base; base = 1<<(20-PAGE_SHIFT); } - subtract_range(range, base, base + size - 1); + subtract_range(range, RANGE_NUM, base, base + size - 1); } if (extra_remove_size) - subtract_range(range, extra_remove_base, + subtract_range(range, RANGE_NUM, extra_remove_base, extra_remove_base + extra_remove_size - 1); - /* get new range num */ - nr_range = 0; - for (i = 0; i < RANGE_NUM; i++) { - if (!range[i].end) - continue; - nr_range++; - } if (debug_print) { printk(KERN_DEBUG "After UC checking\n"); - for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + for (i = 0; i < RANGE_NUM; i++) { + if (!range[i].end) + continue; + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end + 1); + } } /* sort the ranges */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + nr_range = clean_sort_range(range, RANGE_NUM); if (debug_print) { printk(KERN_DEBUG "After sorting\n"); for (i = 0; i < nr_range; i++) - printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", + printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n", range[i].start, range[i].end + 1); } - /* clear those is not used */ - for (i = nr_range; i < RANGE_NUM; i++) - memset(&range[i], 0, sizeof(range[i])); - return nr_range; } #ifdef CONFIG_MTRR_SANITIZER -static unsigned long __init sum_ranges(struct res_range *range, int nr_range) +static unsigned long __init sum_ranges(struct range *range, int nr_range) { unsigned long sum = 0; int i; @@ -590,7 +471,7 @@ static int __init parse_mtrr_spare_reg(c early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); static int __init -x86_setup_var_mtrrs(struct res_range *range, int nr_range, +x86_setup_var_mtrrs(struct range *range, int nr_range, u64 chunk_size, u64 gran_size) { struct var_mtrr_state var_state; @@ -689,8 +570,6 @@ static int __init mtrr_need_cleanup(void continue; if (!size) type = MTRR_NUM_TYPES; - if (type == MTRR_TYPE_WRPROT) - type = MTRR_TYPE_UNCACHABLE; num[type]++; } @@ -713,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u6 unsigned long x_remove_base, unsigned long x_remove_size, int i) { - static struct res_range range_new[RANGE_NUM]; + static struct range range_new[RANGE_NUM]; unsigned long range_sums_new; static int nr_range_new; int num_reg; @@ -840,10 +719,10 @@ int __init mtrr_cleanup(unsigned address * [0, 1M) should always be covered by var mtrr with WB * and fixed mtrrs should take effect before var mtrr for it: */ - nr_range = add_range_with_merge(range, nr_range, 0, + nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0, (1ULL<<(20 - PAGE_SHIFT)) - 1); /* Sort the ranges: */ - sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + sort_range(range, nr_range); range_sums = sum_ranges(range, nr_range); printk(KERN_INFO "total RAM covered: %ldM\n", @@ -948,7 +827,7 @@ int __init amd_special_default_mtrr(void if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return 0; - if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) + if (boot_cpu_data.x86 < 0xf) return 0; /* In case some hypervisor doesn't pass SYSCFG through: */ if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) diff -uprN linux-2.6.32/arch/x86/kernel/cpu/mtrr/generic.c linux-2.6.32.ovz/arch/x86/kernel/cpu/mtrr/generic.c --- linux-2.6.32/arch/x86/kernel/cpu/mtrr/generic.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/mtrr/generic.c 2015-03-10 13:22:51.000000000 -0700 @@ -464,7 +464,8 @@ static void generic_get_mtrr(unsigned in tmp |= ~((1<<(hi - 1)) - 1); if (tmp != mask_lo) { - WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); + printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); + add_taint(TAINT_FIRMWARE_WORKAROUND); mask_lo = tmp; } } diff -uprN linux-2.6.32/arch/x86/kernel/cpu/mtrr/if.c linux-2.6.32.ovz/arch/x86/kernel/cpu/mtrr/if.c --- linux-2.6.32/arch/x86/kernel/cpu/mtrr/if.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/mtrr/if.c 2015-03-10 13:21:24.000000000 -0700 @@ -4,6 +4,7 @@ #include #include #include +#include #include #define LINE_SIZE 80 @@ -133,8 +134,7 @@ mtrr_write(struct file *file, const char return -EINVAL; base = simple_strtoull(line + 5, &ptr, 0); - while (isspace(*ptr)) - ptr++; + ptr = skip_spaces(ptr); if (strncmp(ptr, "size=", 5)) return -EINVAL; @@ -142,14 +142,11 @@ mtrr_write(struct file *file, const char size = simple_strtoull(ptr + 5, &ptr, 0); if ((base & 0xfff) || (size & 0xfff)) return -EINVAL; - while (isspace(*ptr)) - ptr++; + ptr = skip_spaces(ptr); if (strncmp(ptr, "type=", 5)) return -EINVAL; - ptr += 5; - while (isspace(*ptr)) - ptr++; + ptr = skip_spaces(ptr + 5); for (i = 0; i < MTRR_NUM_TYPES; ++i) { if (strcmp(ptr, mtrr_strings[i])) diff -uprN linux-2.6.32/arch/x86/kernel/cpu/mtrr/main.c linux-2.6.32.ovz/arch/x86/kernel/cpu/mtrr/main.c --- linux-2.6.32/arch/x86/kernel/cpu/mtrr/main.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/mtrr/main.c 2015-03-10 13:24:10.000000000 -0700 @@ -35,6 +35,7 @@ #include /* FIXME: kvm_para.h needs this */ +#include #include #include #include @@ -49,9 +50,13 @@ #include #include #include +#include #include "mtrr.h" +/* arch_phys_wc_add returns an MTRR register index plus this offset. */ +#define MTRR_TO_PHYS_WC_OFFSET 1000 + u32 num_var_ranges; unsigned int mtrr_usage_table[MTRR_MAX_VAR_RANGES]; @@ -65,7 +70,7 @@ static struct mtrr_ops *mtrr_ops[X86_VEN struct mtrr_ops *mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type); + unsigned long size, mtrr_type type, bool force); void set_mtrr_ops(struct mtrr_ops *ops) { @@ -135,8 +140,6 @@ static void __init init_table(void) } struct set_mtrr_data { - atomic_t count; - atomic_t gate; unsigned long smp_base; unsigned long smp_size; unsigned int smp_reg; @@ -144,40 +147,38 @@ struct set_mtrr_data { }; /** - * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * mtrr_rendezvous_handler - Work done in the synchronization handler. Executed + * by all the CPUs. + * @info: pointer to mtrr configuration data * * Returns nothing. */ -static void ipi_handler(void *info) +static int mtrr_rendezvous_handler(void *info) { #ifdef CONFIG_SMP struct set_mtrr_data *data = info; - unsigned long flags; - - local_irq_save(flags); - atomic_dec(&data->count); - while (!atomic_read(&data->gate)) - cpu_relax(); - - /* The master has cleared me to execute */ + /* + * We use this same function to initialize the mtrrs during boot, + * resume, runtime cpu online and on an explicit request to set a + * specific MTRR. + * + * During boot or suspend, the state of the boot cpu's mtrrs has been + * saved, and we want to replicate that across all the cpus that come + * online (either at the end of boot or resume or during a runtime cpu + * online). If we're doing that, @reg is set to something special and on + * all the cpu's we do mtrr_if->set_all() (On the logical cpu that + * started the boot/resume sequence, this might be a duplicate + * set_all()). + */ if (data->smp_reg != ~0U) { mtrr_if->set(data->smp_reg, data->smp_base, data->smp_size, data->smp_type); - } else if (mtrr_aps_delayed_init) { - /* - * Initialize the MTRRs inaddition to the synchronisation. - */ + } else if (mtrr_aps_delayed_init || !cpu_online(smp_processor_id())) { mtrr_if->set_all(); } - - atomic_dec(&data->count); - while (atomic_read(&data->gate)) - cpu_relax(); - - atomic_dec(&data->count); - local_irq_restore(flags); #endif + return 0; } static inline int types_compatible(mtrr_type type1, mtrr_type type2) @@ -197,7 +198,7 @@ static inline int types_compatible(mtrr_ * * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: * - * 1. Send IPI to do the following: + * 1. Queue work to do the following on all processors: * 2. Disable Interrupts * 3. Wait for all procs to do so * 4. Enter no-fill cache mode @@ -213,81 +214,37 @@ static inline int types_compatible(mtrr_ * 14. Wait for buddies to catch up * 15. Enable interrupts. * - * What does that mean for us? Well, first we set data.count to the number - * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait - * until it hits 0 and proceed. We set the data.gate flag and reset data.count. - * Meanwhile, they are waiting for that flag to be set. Once it's set, each - * CPU goes through the transition of updating MTRRs. - * The CPU vendors may each do it differently, - * so we call mtrr_if->set() callback and let them take care of it. - * When they're done, they again decrement data->count and wait for data.gate - * to be reset. - * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag - * Everyone then enables interrupts and we all continue on. + * What does that mean for us? Well, stop_machine() will ensure that + * the rendezvous handler is started on each CPU. And in lockstep they + * do the state transition of disabling interrupts, updating MTRR's + * (the CPU vendors may each do it differently, so we call mtrr_if->set() + * callback and let them take care of it.) and enabling interrupts. * * Note that the mechanism is the same for UP systems, too; all the SMP stuff * becomes nops. */ static void -set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type) +set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type, bool force) { - struct set_mtrr_data data; - unsigned long flags; - - data.smp_reg = reg; - data.smp_base = base; - data.smp_size = size; - data.smp_type = type; - atomic_set(&data.count, num_booting_cpus() - 1); - - /* Make sure data.count is visible before unleashing other CPUs */ - smp_wmb(); - atomic_set(&data.gate, 0); - - /* Start the ball rolling on other CPUs */ - if (smp_call_function(ipi_handler, &data, 0) != 0) - panic("mtrr: timed out waiting for other CPUs\n"); + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; + stop_machine(mtrr_rendezvous_handler, &data, cpu_online_mask); +} + +static void set_mtrr_from_inactive_cpu(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + struct set_mtrr_data data = { .smp_reg = reg, + .smp_base = base, + .smp_size = size, + .smp_type = type + }; - local_irq_save(flags); - - while (atomic_read(&data.count)) - cpu_relax(); - - /* Ok, reset count and toggle gate */ - atomic_set(&data.count, num_booting_cpus() - 1); - smp_wmb(); - atomic_set(&data.gate, 1); - - /* Do our MTRR business */ - - /* - * HACK! - * We use this same function to initialize the mtrrs on boot. - * The state of the boot cpu's mtrrs has been saved, and we want - * to replicate across all the APs. - * If we're doing that @reg is set to something special... - */ - if (reg != ~0U) - mtrr_if->set(reg, base, size, type); - else if (!mtrr_aps_delayed_init) - mtrr_if->set_all(); - - /* Wait for the others */ - while (atomic_read(&data.count)) - cpu_relax(); - - atomic_set(&data.count, num_booting_cpus() - 1); - smp_wmb(); - atomic_set(&data.gate, 0); - - /* - * Wait here for everyone to have seen the gate change - * So we're the last ones to touch 'data' - */ - while (atomic_read(&data.count)) - cpu_relax(); - - local_irq_restore(flags); + stop_machine_from_inactive_cpu(mtrr_rendezvous_handler, &data, + cpu_callout_mask); } /** @@ -409,7 +366,7 @@ int mtrr_add_page(unsigned long base, un /* Search for an empty MTRR */ i = mtrr_if->get_free_region(base, size, replace); if (i >= 0) { - set_mtrr(i, base, size, type); + set_mtrr(i, base, size, type, false); if (likely(replace < 0)) { mtrr_usage_table[i] = 1; } else { @@ -417,7 +374,7 @@ int mtrr_add_page(unsigned long base, un if (increment) mtrr_usage_table[i]++; if (unlikely(replace != i)) { - set_mtrr(replace, 0, 0, 0); + set_mtrr(replace, 0, 0, 0, false); mtrr_usage_table[replace] = 0; } } @@ -544,7 +501,7 @@ int mtrr_del_page(int reg, unsigned long goto out; } if (--mtrr_usage_table[reg] < 1) - set_mtrr(reg, 0, 0, 0); + set_mtrr(reg, 0, 0, 0, false); error = reg; out: mutex_unlock(&mtrr_mutex); @@ -574,6 +531,73 @@ int mtrr_del(int reg, unsigned long base } EXPORT_SYMBOL(mtrr_del); +/** + * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable + * @base: Physical base address + * @size: Size of region + * + * If PAT is available, this does nothing. If PAT is unavailable, it + * attempts to add a WC MTRR covering size bytes starting at base and + * logs an error if this fails. + * + * Drivers must store the return value to pass to mtrr_del_wc_if_needed, + * but drivers should not try to interpret that return value. + */ +int arch_phys_wc_add(unsigned long base, unsigned long size) +{ + int ret; + + if (pat_enabled) + return 0; /* Success! (We don't need to do anything.) */ + + ret = mtrr_add(base, size, MTRR_TYPE_WRCOMB, true); + if (ret < 0) { + pr_warn("Failed to add WC MTRR for [%p-%p]; performance may suffer.", + (void *)base, (void *)(base + size - 1)); + return ret; + } + return ret + MTRR_TO_PHYS_WC_OFFSET; +} +EXPORT_SYMBOL(arch_phys_wc_add); + +/* + * arch_phys_wc_del - undoes arch_phys_wc_add + * @handle: Return value from arch_phys_wc_add + * + * This cleans up after mtrr_add_wc_if_needed. + * + * The API guarantees that mtrr_del_wc_if_needed(error code) and + * mtrr_del_wc_if_needed(0) do nothing. + */ +void arch_phys_wc_del(int handle) +{ + if (handle >= 1) { + WARN_ON(handle < MTRR_TO_PHYS_WC_OFFSET); + mtrr_del(handle - MTRR_TO_PHYS_WC_OFFSET, 0, 0); + } +} +EXPORT_SYMBOL(arch_phys_wc_del); + +/* + * phys_wc_to_mtrr_index - translates arch_phys_wc_add's return value + * @handle: Return value from arch_phys_wc_add + * + * This will turn the return value from arch_phys_wc_add into an mtrr + * index suitable for debugging. + * + * Note: There is no legitimate use for this function, except possibly + * in printk line. Alas there is an illegitimate use in some ancient + * drm ioctls. + */ +int phys_wc_to_mtrr_index(int handle) +{ + if (handle < MTRR_TO_PHYS_WC_OFFSET) + return -1; + else + return handle - MTRR_TO_PHYS_WC_OFFSET; +} +EXPORT_SYMBOL_GPL(phys_wc_to_mtrr_index); + /* * HACK ALERT! * These should be called implicitly, but we can't yet until all the initcall @@ -619,7 +643,8 @@ static int mtrr_restore(struct sys_devic if (mtrr_value[i].lsize) { set_mtrr(i, mtrr_value[i].lbase, mtrr_value[i].lsize, - mtrr_value[i].ltype); + mtrr_value[i].ltype, + true); } } return 0; @@ -742,7 +767,7 @@ void mtrr_ap_init(void) * 2. cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug * lock to prevent mtrr entry changes */ - set_mtrr(~0U, 0, 0, 0); + set_mtrr_from_inactive_cpu(~0U, 0, 0, 0); } /** @@ -769,7 +794,7 @@ void mtrr_aps_init(void) if (!use_intel()) return; - set_mtrr(~0U, 0, 0, 0); + set_mtrr(~0U, 0, 0, 0, false); mtrr_aps_delayed_init = false; } diff -uprN linux-2.6.32/arch/x86/kernel/cpu/perfctr-watchdog.c linux-2.6.32.ovz/arch/x86/kernel/cpu/perfctr-watchdog.c --- linux-2.6.32/arch/x86/kernel/cpu/perfctr-watchdog.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/perfctr-watchdog.c 2015-03-10 13:22:38.000000000 -0700 @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -68,6 +69,8 @@ static inline unsigned int nmi_perfctr_m /* returns the bit offset of the performance counter register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: + if (msr >= MSR_F15H_PERF_CTR) + return (msr - MSR_F15H_PERF_CTR) >> 1; return msr - MSR_K7_PERFCTR0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) @@ -92,6 +95,8 @@ static inline unsigned int nmi_evntsel_m /* returns the bit offset of the event selection register */ switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: + if (msr >= MSR_F15H_PERF_CTL) + return (msr - MSR_F15H_PERF_CTL) >> 1; return msr - MSR_K7_EVNTSEL0; case X86_VENDOR_INTEL: if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) @@ -115,17 +120,6 @@ int avail_to_resrv_perfctr_nmi_bit(unsig return !test_bit(counter, perfctr_nmi_owner); } - -/* checks the an msr for availability */ -int avail_to_resrv_perfctr_nmi(unsigned int msr) -{ - unsigned int counter; - - counter = nmi_perfctr_msr_to_bit(msr); - BUG_ON(counter > NMI_MAX_COUNTER_BITS); - - return !test_bit(counter, perfctr_nmi_owner); -} EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); int reserve_perfctr_nmi(unsigned int msr) @@ -691,7 +685,7 @@ static int setup_intel_arch_watchdog(uns cpu_nmi_set_wd_enabled(); apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsr(evntsel_msr, evntsel, 0); intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); return 1; @@ -711,11 +705,10 @@ static void probe_nmi_watchdog(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: - if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && - boot_cpu_data.x86 != 16) - return; - wd_ops = &k7_wd_ops; - break; + if (boot_cpu_data.x86 == 6 || + (boot_cpu_data.x86 >= 0xf && boot_cpu_data.x86 <= 0x15)) + wd_ops = &k7_wd_ops; + return; case X86_VENDOR_INTEL: /* Work around where perfctr1 doesn't have a working enable * bit as described in the following errata: diff -uprN linux-2.6.32/arch/x86/kernel/cpu/perf_event_amd.c linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_amd.c --- linux-2.6.32/arch/x86/kernel/cpu/perf_event_amd.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_amd.c 2015-03-10 13:24:14.000000000 -0700 @@ -0,0 +1,768 @@ +#include +#include +#include +#include +#include + +#include "perf_event.h" + +u32 get_ibs_caps(void) +{ + u32 ibs_caps; + unsigned int max_level; + + if (!boot_cpu_has(X86_FEATURE_IBS)) + return 0; + + /* check IBS cpuid feature flags */ + max_level = cpuid_eax(0x80000000); + if (max_level < IBS_CPUID_FEATURES) + return IBS_CAPS_DEFAULT; + + ibs_caps = cpuid_eax(IBS_CPUID_FEATURES); + if (!(ibs_caps & IBS_CAPS_AVAIL)) + /* cpuid flags not valid */ + return IBS_CAPS_DEFAULT; + + return ibs_caps; +} +EXPORT_SYMBOL(get_ibs_caps); + +static __initconst const u64 amd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ + [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ + [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ + [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ + [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ + [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xb8e9, /* CPU Request to Memory, l+r */ + [ C(RESULT_MISS) ] = 0x98e9, /* CPU Request to Memory, r */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +/* + * AMD Performance Monitor K7 and later. + */ +static const u64 amd_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ +}; + +static u64 amd_pmu_event_map(int hw_event) +{ + return amd_perfmon_event_map[hw_event]; +} + +/* + * Previously calculated offsets + */ +static unsigned int event_offsets[X86_PMC_IDX_MAX] __read_mostly; +static unsigned int count_offsets[X86_PMC_IDX_MAX] __read_mostly; + +/* + * Legacy CPUs: + * 4 counters starting at 0xc0010000 each offset by 1 + * + * CPUs with core performance counter extensions: + * 6 counters starting at 0xc0010200 each offset by 2 + */ +static inline int amd_pmu_addr_offset(int index, bool eventsel) +{ + int offset; + + if (!index) + return index; + + if (eventsel) + offset = event_offsets[index]; + else + offset = count_offsets[index]; + + if (offset) + return offset; + + if (!cpu_has_perfctr_core) + offset = index; + else + offset = index << 1; + + if (eventsel) + event_offsets[index] = offset; + else + count_offsets[index] = offset; + + return offset; +} + +static int amd_core_hw_config(struct perf_event *event) +{ + if (event->attr.exclude_host && event->attr.exclude_guest) + /* + * When HO == GO == 1 the hardware treats that as GO == HO == 0 + * and will count in both modes. We don't want to count in that + * case so we emulate no-counting by setting US = OS = 0. + */ + event->hw.config &= ~(ARCH_PERFMON_EVENTSEL_USR | + ARCH_PERFMON_EVENTSEL_OS); + else if (event->attr.exclude_host) + event->hw.config |= AMD_PERFMON_EVENTSEL_GUESTONLY; + else if (event->attr.exclude_guest) + event->hw.config |= AMD_PERFMON_EVENTSEL_HOSTONLY; + + return 0; +} + +/* + * AMD64 events are detected based on their event codes. + */ +static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc) +{ + return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff); +} + +static inline int amd_is_nb_event(struct hw_perf_event *hwc) +{ + return (hwc->config & 0xe0) == 0xe0; +} + +static inline int amd_has_nb(struct cpu_hw_events *cpuc) +{ + struct amd_nb *nb = cpuc->amd_nb; + + return nb && nb->nb_id != -1; +} + +static int amd_pmu_hw_config(struct perf_event *event) +{ + int ret; + + /* pass precise event sampling to ibs: */ + if (event->attr.precise_ip && get_ibs_caps()) + return -ENOENT; + + if (has_branch_stack(event)) + return -EOPNOTSUPP; + + ret = x86_pmu_hw_config(event); + if (ret) + return ret; + + if (event->attr.type == PERF_TYPE_RAW) + event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK; + + return amd_core_hw_config(event); +} + +static void __amd_put_nb_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct amd_nb *nb = cpuc->amd_nb; + int i; + + /* + * need to scan whole list because event may not have + * been assigned during scheduling + * + * no race condition possible because event can only + * be removed on one CPU at a time AND PMU is disabled + * when we come here + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + if (nb->owners[i] == event) { + (void)cmpxchg(nb->owners+i, event, NULL); + break; + } + } +} + + /* + * AMD64 NorthBridge events need special treatment because + * counter access needs to be synchronized across all cores + * of a package. Refer to BKDG section 3.12 + * + * NB events are events measuring L3 cache, Hypertransport + * traffic. They are identified by an event code >= 0xe00. + * They measure events on the NorthBride which is shared + * by all cores on a package. NB events are counted on a + * shared set of counters. When a NB event is programmed + * in a counter, the data actually comes from a shared + * counter. Thus, access to those counters needs to be + * synchronized. + * + * We implement the synchronization such that no two cores + * can be measuring NB events using the same counters. Thus, + * we maintain a per-NB allocation table. The available slot + * is propagated using the event_constraint structure. + * + * We provide only one choice for each NB event based on + * the fact that only NB events have restrictions. Consequently, + * if a counter is available, there is a guarantee the NB event + * will be assigned to it. If no slot is available, an empty + * constraint is returned and scheduling will eventually fail + * for this event. + * + * Note that all cores attached the same NB compete for the same + * counters to host NB events, this is why we use atomic ops. Some + * multi-chip CPUs may have more than one NB. + * + * Given that resources are allocated (cmpxchg), they must be + * eventually freed for others to use. This is accomplished by + * calling __amd_put_nb_event_constraints() + * + * Non NB events are not impacted by this restriction. + */ +static struct event_constraint * +__amd_get_nb_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, + struct event_constraint *c) +{ + struct hw_perf_event *hwc = &event->hw; + struct amd_nb *nb = cpuc->amd_nb; + struct perf_event *old = NULL; + int max = x86_pmu.num_counters; + int i, j, k = -1; + + if (!c) + c = &unconstrained; + + if (cpuc->is_fake) + return c; + + /* + * detect if already present, if so reuse + * + * cannot merge with actual allocation + * because of possible holes + * + * event can already be present yet not assigned (in hwc->idx) + * because of successive calls to x86_schedule_events() from + * hw_perf_group_sched_in() without hw_perf_enable() + */ + for (i = 0; i < max; i++) { + /* + * keep track of first free slot + */ + if (k == -1 && !nb->owners[i]) + k = i; + + /* already present, reuse */ + if (nb->owners[i] == event) + goto done; + } + /* + * not present, so grab a new slot + * starting either at: + */ + if (hwc->idx != -1) { + /* previous assignment */ + i = hwc->idx; + } else if (k != -1) { + /* start from free slot found */ + i = k; + } else { + /* + * event not found, no slot found in + * first pass, try again from the + * beginning + */ + i = 0; + } + j = i; + do { + old = cmpxchg(nb->owners+i, NULL, event); + if (!old) + break; + if (++i == max) + i = 0; + } while (i != j); +done: + if (!old) + return &nb->event_constraints[i]; + + return &emptyconstraint; +} + +static struct amd_nb *amd_alloc_nb(int cpu) +{ + struct amd_nb *nb; + int i; + + nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO, + cpu_to_node(cpu)); + if (!nb) + return NULL; + + nb->nb_id = -1; + + /* + * initialize all possible NB constraints + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + __set_bit(i, nb->event_constraints[i].idxmsk); + nb->event_constraints[i].weight = 1; + } + return nb; +} + +static int amd_pmu_cpu_prepare(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + WARN_ON_ONCE(cpuc->amd_nb); + + if (boot_cpu_data.x86_max_cores < 2) + return NOTIFY_OK; + + cpuc->amd_nb = amd_alloc_nb(cpu); + if (!cpuc->amd_nb) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +static void amd_pmu_cpu_starting(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct amd_nb *nb; + int i, nb_id; + + cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + nb_id = amd_get_nb_id(cpu); + WARN_ON_ONCE(nb_id == BAD_APICID); + + for_each_online_cpu(i) { + nb = per_cpu(cpu_hw_events, i).amd_nb; + if (WARN_ON_ONCE(!nb)) + continue; + + if (nb->nb_id == nb_id) { + cpuc->kfree_on_online = cpuc->amd_nb; + cpuc->amd_nb = nb; + break; + } + } + + cpuc->amd_nb->nb_id = nb_id; + cpuc->amd_nb->refcnt++; +} + +static void amd_pmu_cpu_dead(int cpu) +{ + struct cpu_hw_events *cpuhw; + + if (boot_cpu_data.x86_max_cores < 2) + return; + + cpuhw = &per_cpu(cpu_hw_events, cpu); + + if (cpuhw->amd_nb) { + struct amd_nb *nb = cpuhw->amd_nb; + + if (nb->nb_id == -1 || --nb->refcnt == 0) + kfree(nb); + + cpuhw->amd_nb = NULL; + } +} + +static struct event_constraint * +amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + /* + * if not NB event or no NB, then no constraints + */ + if (!(amd_has_nb(cpuc) && amd_is_nb_event(&event->hw))) + return &unconstrained; + + return __amd_get_nb_event_constraints(cpuc, event, NULL); +} + +static void amd_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + if (amd_has_nb(cpuc) && amd_is_nb_event(&event->hw)) + __amd_put_nb_event_constraints(cpuc, event); +} + +PMU_FORMAT_ATTR(event, "config:0-7,32-35"); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *amd_format_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +/* AMD Family 15h */ + +#define AMD_EVENT_TYPE_MASK 0x000000F0ULL + +#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL +#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL +#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL +#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL +#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL +#define AMD_EVENT_EX_LS 0x000000C0ULL +#define AMD_EVENT_DE 0x000000D0ULL +#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL + +/* + * AMD family 15h event code/PMC mappings: + * + * type = event_code & 0x0F0: + * + * 0x000 FP PERF_CTL[5:3] + * 0x010 FP PERF_CTL[5:3] + * 0x020 LS PERF_CTL[5:0] + * 0x030 LS PERF_CTL[5:0] + * 0x040 DC PERF_CTL[5:0] + * 0x050 DC PERF_CTL[5:0] + * 0x060 CU PERF_CTL[2:0] + * 0x070 CU PERF_CTL[2:0] + * 0x080 IC/DE PERF_CTL[2:0] + * 0x090 IC/DE PERF_CTL[2:0] + * 0x0A0 --- + * 0x0B0 --- + * 0x0C0 EX/LS PERF_CTL[5:0] + * 0x0D0 DE PERF_CTL[2:0] + * 0x0E0 NB NB_PERF_CTL[3:0] + * 0x0F0 NB NB_PERF_CTL[3:0] + * + * Exceptions: + * + * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*) + * 0x003 FP PERF_CTL[3] + * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*) + * 0x00B FP PERF_CTL[3] + * 0x00D FP PERF_CTL[3] + * 0x023 DE PERF_CTL[2:0] + * 0x02D LS PERF_CTL[3] + * 0x02E LS PERF_CTL[3,0] + * 0x031 LS PERF_CTL[2:0] (**) + * 0x043 CU PERF_CTL[2:0] + * 0x045 CU PERF_CTL[2:0] + * 0x046 CU PERF_CTL[2:0] + * 0x054 CU PERF_CTL[2:0] + * 0x055 CU PERF_CTL[2:0] + * 0x08F IC PERF_CTL[0] + * 0x187 DE PERF_CTL[0] + * 0x188 DE PERF_CTL[0] + * 0x0DB EX PERF_CTL[5:0] + * 0x0DC LS PERF_CTL[5:0] + * 0x0DD LS PERF_CTL[5:0] + * 0x0DE LS PERF_CTL[5:0] + * 0x0DF LS PERF_CTL[5:0] + * 0x1C0 EX PERF_CTL[5:3] + * 0x1D6 EX PERF_CTL[5:0] + * 0x1D8 EX PERF_CTL[5:0] + * + * (*) depending on the umask all FPU counters may be used + * (**) only one unitmask enabled at a time + */ + +static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); +static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); +static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); +static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); +static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); +static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); + +static struct event_constraint * +amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned int event_code = amd_get_event_code(hwc); + + switch (event_code & AMD_EVENT_TYPE_MASK) { + case AMD_EVENT_FP: + switch (event_code) { + case 0x000: + if (!(hwc->config & 0x0000F000ULL)) + break; + if (!(hwc->config & 0x00000F00ULL)) + break; + return &amd_f15_PMC3; + case 0x004: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + break; + return &amd_f15_PMC3; + case 0x003: + case 0x00B: + case 0x00D: + return &amd_f15_PMC3; + } + return &amd_f15_PMC53; + case AMD_EVENT_LS: + case AMD_EVENT_DC: + case AMD_EVENT_EX_LS: + switch (event_code) { + case 0x023: + case 0x043: + case 0x045: + case 0x046: + case 0x054: + case 0x055: + return &amd_f15_PMC20; + case 0x02D: + return &amd_f15_PMC3; + case 0x02E: + return &amd_f15_PMC30; + case 0x031: + if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1) + return &amd_f15_PMC20; + return &emptyconstraint; + case 0x1C0: + return &amd_f15_PMC53; + default: + return &amd_f15_PMC50; + } + case AMD_EVENT_CU: + case AMD_EVENT_IC_DE: + case AMD_EVENT_DE: + switch (event_code) { + case 0x08F: + case 0x187: + case 0x188: + return &amd_f15_PMC0; + case 0x0DB ... 0x0DF: + case 0x1D6: + case 0x1D8: + return &amd_f15_PMC50; + default: + return &amd_f15_PMC20; + } + case AMD_EVENT_NB: + /* moved to perf_event_amd_uncore.c */ + return &emptyconstraint; + default: + return &emptyconstraint; + } +} + +static ssize_t amd_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT) | + (config & AMD64_EVENTSEL_EVENT) >> 24; + + return x86_event_sysfs_show(page, config, event); +} + +static __initconst const struct x86_pmu amd_pmu = { + .name = "AMD", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = x86_pmu_enable_all, + .enable = x86_pmu_enable_event, + .disable = x86_pmu_disable_event, + .hw_config = amd_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .addr_offset = amd_pmu_addr_offset, + .event_map = amd_pmu_event_map, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_counters = AMD64_NUM_COUNTERS, + .cntval_bits = 48, + .cntval_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, + .get_event_constraints = amd_get_event_constraints, + .put_event_constraints = amd_put_event_constraints, + + .format_attrs = amd_format_attr, + + .cpu_prepare = amd_pmu_cpu_prepare, + .cpu_starting = amd_pmu_cpu_starting, + .events_sysfs_show = amd_event_sysfs_show, + .cpu_dead = amd_pmu_cpu_dead, +}; + +static int setup_event_constraints(void) +{ + if (boot_cpu_data.x86 >= 0x15) + x86_pmu.get_event_constraints = amd_get_event_constraints_f15h; + return 0; +} + +static int setup_perfctr_core(void) +{ + if (!cpu_has_perfctr_core) { + WARN(x86_pmu.get_event_constraints == amd_get_event_constraints_f15h, + KERN_ERR "Odd, counter constraints enabled but no core perfctrs detected!"); + return -ENODEV; + } + + WARN(x86_pmu.get_event_constraints == amd_get_event_constraints, + KERN_ERR "hw perf events core counters need constraints handler!"); + + /* + * If core performance counter extensions exists, we must use + * MSR_F15H_PERF_CTL/MSR_F15H_PERF_CTR msrs. See also + * x86_pmu_addr_offset(). + */ + x86_pmu.eventsel = MSR_F15H_PERF_CTL; + x86_pmu.perfctr = MSR_F15H_PERF_CTR; + x86_pmu.num_counters = AMD64_NUM_COUNTERS_CORE; + + printk(KERN_INFO "perf: AMD core performance counters detected\n"); + + return 0; +} + +__init int amd_pmu_init(void) +{ + /* Performance-monitoring supported from K7 and later: */ + if (boot_cpu_data.x86 < 6) + return -ENODEV; + + x86_pmu = amd_pmu; + + setup_event_constraints(); + setup_perfctr_core(); + + /* Events are common for all AMDs */ + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; +} + +void amd_pmu_enable_virt(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + cpuc->perf_ctr_virt_mask = 0; + + /* Reload all events */ + x86_pmu_disable_all(); + x86_pmu_enable_all(0); +} +EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); + +void amd_pmu_disable_virt(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + /* + * We only mask out the Host-only bit so that host-only counting works + * when SVM is disabled. If someone sets up a guest-only counter when + * SVM is disabled the Guest-only bits still gets set and the counter + * will not count anything. + */ + cpuc->perf_ctr_virt_mask = AMD_PERFMON_EVENTSEL_HOSTONLY; + + /* Reload all events */ + x86_pmu_disable_all(); + x86_pmu_enable_all(0); +} +EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff -uprN linux-2.6.32/arch/x86/kernel/cpu/perf_event_amd_uncore.c linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_amd_uncore.c --- linux-2.6.32/arch/x86/kernel/cpu/perf_event_amd_uncore.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_amd_uncore.c 2015-03-10 13:24:14.000000000 -0700 @@ -0,0 +1,547 @@ +/* + * Copyright (C) 2013 Advanced Micro Devices, Inc. + * + * Author: Jacob Shin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define NUM_COUNTERS_NB 4 +#define NUM_COUNTERS_L2 4 +#define MAX_COUNTERS NUM_COUNTERS_NB + +#define RDPMC_BASE_NB 6 +#define RDPMC_BASE_L2 10 + +#define COUNTER_SHIFT 16 + +struct amd_uncore { + int id; + int refcnt; + int cpu; + int num_counters; + int rdpmc_base; + u32 msr_base; + cpumask_t *active_mask; + struct pmu *pmu; + struct perf_event *events[MAX_COUNTERS]; + struct amd_uncore *free_when_cpu_online; +}; + +static struct amd_uncore * __percpu *amd_uncore_nb; +static struct amd_uncore * __percpu *amd_uncore_l2; + +static struct pmu amd_nb_pmu; +static struct pmu amd_l2_pmu; + +static cpumask_t amd_nb_active_mask; +static cpumask_t amd_l2_active_mask; + +static bool is_nb_event(struct perf_event *event) +{ + return event->pmu->type == amd_nb_pmu.type; +} + +static bool is_l2_event(struct perf_event *event) +{ + return event->pmu->type == amd_l2_pmu.type; +} + +static struct amd_uncore *event_to_amd_uncore(struct perf_event *event) +{ + if (is_nb_event(event) && amd_uncore_nb) + return *per_cpu_ptr(amd_uncore_nb, event->cpu); + else if (is_l2_event(event) && amd_uncore_l2) + return *per_cpu_ptr(amd_uncore_l2, event->cpu); + + return NULL; +} + +static void amd_uncore_read(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 prev, new; + s64 delta; + + /* + * since we do not enable counter overflow interrupts, + * we do not have to worry about prev_count changing on us + */ + + prev = local64_read(&hwc->prev_count); + rdpmcl(hwc->event_base_rdpmc, new); + local64_set(&hwc->prev_count, new); + delta = (new << COUNTER_SHIFT) - (prev << COUNTER_SHIFT); + delta >>= COUNTER_SHIFT; + local64_add(delta, &event->count); +} + +static void amd_uncore_start(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + if (flags & PERF_EF_RELOAD) + wrmsrl(hwc->event_base, (u64)local64_read(&hwc->prev_count)); + + hwc->state = 0; + wrmsrl(hwc->config_base, (hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE)); + perf_event_update_userpage(event); +} + +static void amd_uncore_stop(struct perf_event *event, int flags) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); + hwc->state |= PERF_HES_STOPPED; + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + amd_uncore_read(event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int amd_uncore_add(struct perf_event *event, int flags) +{ + int i; + struct amd_uncore *uncore = event_to_amd_uncore(event); + struct hw_perf_event *hwc = &event->hw; + + /* are we already assigned? */ + if (hwc->idx != -1 && uncore->events[hwc->idx] == event) + goto out; + + for (i = 0; i < uncore->num_counters; i++) { + if (uncore->events[i] == event) { + hwc->idx = i; + goto out; + } + } + + /* if not, take the first available counter */ + hwc->idx = -1; + for (i = 0; i < uncore->num_counters; i++) { + if (cmpxchg(&uncore->events[i], NULL, event) == NULL) { + hwc->idx = i; + break; + } + } + +out: + if (hwc->idx == -1) + return -EBUSY; + + hwc->config_base = uncore->msr_base + (2 * hwc->idx); + hwc->event_base = uncore->msr_base + 1 + (2 * hwc->idx); + hwc->event_base_rdpmc = uncore->rdpmc_base + hwc->idx; + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (flags & PERF_EF_START) + amd_uncore_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void amd_uncore_del(struct perf_event *event, int flags) +{ + int i; + struct amd_uncore *uncore = event_to_amd_uncore(event); + struct hw_perf_event *hwc = &event->hw; + + amd_uncore_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < uncore->num_counters; i++) { + if (cmpxchg(&uncore->events[i], event, NULL) == event) + break; + } + + hwc->idx = -1; +} + +static int amd_uncore_event_init(struct perf_event *event) +{ + struct amd_uncore *uncore; + struct hw_perf_event *hwc = &event->hw; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* + * NB and L2 counters (MSRs) are shared across all cores that share the + * same NB / L2 cache. Interrupts can be directed to a single target + * core, however, event counts generated by processes running on other + * cores cannot be masked out. So we do not support sampling and + * per-thread events. + */ + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) + return -EINVAL; + + /* NB and L2 counters do not have usr/os/guest/host bits */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_host || event->attr.exclude_guest) + return -EINVAL; + + /* and we do not enable counter overflow interrupts */ + hwc->config = event->attr.config & AMD64_RAW_EVENT_MASK_NB; + hwc->idx = -1; + + if (event->cpu < 0) + return -EINVAL; + + uncore = event_to_amd_uncore(event); + if (!uncore) + return -ENODEV; + + /* + * since request can come in to any of the shared cores, we will remap + * to a single common cpu. + */ + event->cpu = uncore->cpu; + + return 0; +} + +static ssize_t amd_uncore_attr_show_cpumask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int n; + cpumask_t *active_mask; + struct pmu *pmu = dev_get_drvdata(dev); + + if (pmu->type == amd_nb_pmu.type) + active_mask = &amd_nb_active_mask; + else if (pmu->type == amd_l2_pmu.type) + active_mask = &amd_l2_active_mask; + else + return 0; + + n = cpulist_scnprintf(buf, PAGE_SIZE - 2, active_mask); + buf[n++] = '\n'; + buf[n] = '\0'; + return n; +} +static DEVICE_ATTR(cpumask, S_IRUGO, amd_uncore_attr_show_cpumask, NULL); + +static struct attribute *amd_uncore_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group amd_uncore_attr_group = { + .attrs = amd_uncore_attrs, +}; + +PMU_FORMAT_ATTR(event, "config:0-7,32-35"); +PMU_FORMAT_ATTR(umask, "config:8-15"); + +static struct attribute *amd_uncore_format_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + NULL, +}; + +static struct attribute_group amd_uncore_format_group = { + .name = "format", + .attrs = amd_uncore_format_attr, +}; + +static const struct attribute_group *amd_uncore_attr_groups[] = { + &amd_uncore_attr_group, + &amd_uncore_format_group, + NULL, +}; + +static struct pmu amd_nb_pmu = { + .attr_groups = amd_uncore_attr_groups, + .name = "amd_nb", + .event_init = amd_uncore_event_init, + .add = amd_uncore_add, + .del = amd_uncore_del, + .start = amd_uncore_start, + .stop = amd_uncore_stop, + .read = amd_uncore_read, +}; + +static struct pmu amd_l2_pmu = { + .attr_groups = amd_uncore_attr_groups, + .name = "amd_l2", + .event_init = amd_uncore_event_init, + .add = amd_uncore_add, + .del = amd_uncore_del, + .start = amd_uncore_start, + .stop = amd_uncore_stop, + .read = amd_uncore_read, +}; + +static struct amd_uncore * __cpuinit amd_uncore_alloc(unsigned int cpu) +{ + return kzalloc_node(sizeof(struct amd_uncore), GFP_KERNEL, + cpu_to_node(cpu)); +} + +static void __cpuinit amd_uncore_cpu_up_prepare(unsigned int cpu) +{ + struct amd_uncore *uncore; + + if (amd_uncore_nb) { + uncore = amd_uncore_alloc(cpu); + uncore->cpu = cpu; + uncore->num_counters = NUM_COUNTERS_NB; + uncore->rdpmc_base = RDPMC_BASE_NB; + uncore->msr_base = MSR_F15H_NB_PERF_CTL; + uncore->active_mask = &amd_nb_active_mask; + uncore->pmu = &amd_nb_pmu; + *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; + } + + if (amd_uncore_l2) { + uncore = amd_uncore_alloc(cpu); + uncore->cpu = cpu; + uncore->num_counters = NUM_COUNTERS_L2; + uncore->rdpmc_base = RDPMC_BASE_L2; + uncore->msr_base = MSR_F16H_L2I_PERF_CTL; + uncore->active_mask = &amd_l2_active_mask; + uncore->pmu = &amd_l2_pmu; + *per_cpu_ptr(amd_uncore_l2, cpu) = uncore; + } +} + +static struct amd_uncore * +__cpuinit amd_uncore_find_online_sibling(struct amd_uncore *this, + struct amd_uncore * __percpu *uncores) +{ + unsigned int cpu; + struct amd_uncore *that; + + for_each_online_cpu(cpu) { + that = *per_cpu_ptr(uncores, cpu); + + if (!that) + continue; + + if (this == that) + continue; + + if (this->id == that->id) { + that->free_when_cpu_online = this; + this = that; + break; + } + } + + this->refcnt++; + return this; +} + +static void __cpuinit amd_uncore_cpu_starting(unsigned int cpu) +{ + unsigned int eax, ebx, ecx, edx; + struct amd_uncore *uncore; + + if (amd_uncore_nb) { + uncore = *per_cpu_ptr(amd_uncore_nb, cpu); + cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); + uncore->id = ecx & 0xff; + + uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_nb); + *per_cpu_ptr(amd_uncore_nb, cpu) = uncore; + } + + if (amd_uncore_l2) { + unsigned int apicid = cpu_data(cpu).apicid; + unsigned int nshared; + + uncore = *per_cpu_ptr(amd_uncore_l2, cpu); + cpuid_count(0x8000001d, 2, &eax, &ebx, &ecx, &edx); + nshared = ((eax >> 14) & 0xfff) + 1; + uncore->id = apicid - (apicid % nshared); + + uncore = amd_uncore_find_online_sibling(uncore, amd_uncore_l2); + *per_cpu_ptr(amd_uncore_l2, cpu) = uncore; + } +} + +static void __cpuinit uncore_online(unsigned int cpu, + struct amd_uncore * __percpu *uncores) +{ + struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); + + kfree(uncore->free_when_cpu_online); + uncore->free_when_cpu_online = NULL; + + if (cpu == uncore->cpu) + cpumask_set_cpu(cpu, uncore->active_mask); +} + +static void __cpuinit amd_uncore_cpu_online(unsigned int cpu) +{ + if (amd_uncore_nb) + uncore_online(cpu, amd_uncore_nb); + + if (amd_uncore_l2) + uncore_online(cpu, amd_uncore_l2); +} + +static void __cpuinit uncore_down_prepare(unsigned int cpu, + struct amd_uncore * __percpu *uncores) +{ + unsigned int i; + struct amd_uncore *this = *per_cpu_ptr(uncores, cpu); + + if (this->cpu != cpu) + return; + + /* this cpu is going down, migrate to a shared sibling if possible */ + for_each_online_cpu(i) { + struct amd_uncore *that = *per_cpu_ptr(uncores, i); + + if (cpu == i) + continue; + + if (this == that) { + perf_pmu_migrate_context(this->pmu, cpu, i); + cpumask_clear_cpu(cpu, that->active_mask); + cpumask_set_cpu(i, that->active_mask); + that->cpu = i; + break; + } + } +} + +static void __cpuinit amd_uncore_cpu_down_prepare(unsigned int cpu) +{ + if (amd_uncore_nb) + uncore_down_prepare(cpu, amd_uncore_nb); + + if (amd_uncore_l2) + uncore_down_prepare(cpu, amd_uncore_l2); +} + +static void __cpuinit uncore_dead(unsigned int cpu, + struct amd_uncore * __percpu *uncores) +{ + struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); + + if (cpu == uncore->cpu) + cpumask_clear_cpu(cpu, uncore->active_mask); + + if (!--uncore->refcnt) + kfree(uncore); + *per_cpu_ptr(amd_uncore_nb, cpu) = NULL; +} + +static void __cpuinit amd_uncore_cpu_dead(unsigned int cpu) +{ + if (amd_uncore_nb) + uncore_dead(cpu, amd_uncore_nb); + + if (amd_uncore_l2) + uncore_dead(cpu, amd_uncore_l2); +} + +static int __cpuinit +amd_uncore_cpu_notifier(struct notifier_block *self, unsigned long action, + void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + amd_uncore_cpu_up_prepare(cpu); + break; + + case CPU_STARTING: + amd_uncore_cpu_starting(cpu); + break; + + case CPU_ONLINE: + amd_uncore_cpu_online(cpu); + break; + + case CPU_DOWN_PREPARE: + amd_uncore_cpu_down_prepare(cpu); + break; + + case CPU_UP_CANCELED: + case CPU_DEAD: + amd_uncore_cpu_dead(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block amd_uncore_cpu_notifier_block __cpuinitdata = { + .notifier_call = amd_uncore_cpu_notifier, + .priority = CPU_PRI_PERF + 1, +}; + +static void __init init_cpu_already_online(void *dummy) +{ + unsigned int cpu = smp_processor_id(); + + amd_uncore_cpu_starting(cpu); + amd_uncore_cpu_online(cpu); +} + +static int __init amd_uncore_init(void) +{ + unsigned int cpu; + int ret = -ENODEV; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return -ENODEV; + + if (!cpu_has_topoext) + return -ENODEV; + + if (cpu_has_perfctr_nb) { + amd_uncore_nb = alloc_percpu(struct amd_uncore *); + perf_pmu_register(&amd_nb_pmu, amd_nb_pmu.name, -1); + + printk(KERN_INFO "perf: AMD NB counters detected\n"); + ret = 0; + } + + if (cpu_has_perfctr_l2) { + amd_uncore_l2 = alloc_percpu(struct amd_uncore *); + perf_pmu_register(&amd_l2_pmu, amd_l2_pmu.name, -1); + + printk(KERN_INFO "perf: AMD L2I counters detected\n"); + ret = 0; + } + + if (ret) + return -ENODEV; + + get_online_cpus(); + /* init cpus already online before registering for hotplug notifier */ + for_each_online_cpu(cpu) { + amd_uncore_cpu_up_prepare(cpu); + smp_call_function_single(cpu, init_cpu_already_online, NULL, 1); + } + + register_cpu_notifier(&amd_uncore_cpu_notifier_block); + put_online_cpus(); + + return 0; +} +device_initcall(amd_uncore_init); diff -uprN linux-2.6.32/arch/x86/kernel/cpu/perf_event.c linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event.c --- linux-2.6.32/arch/x86/kernel/cpu/perf_event.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event.c 2015-03-10 13:24:14.000000000 -0700 @@ -7,6 +7,7 @@ * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, + * Copyright (C) 2009 Google, Inc., Stephane Eranian * * For licencing details see kernel-base/COPYING */ @@ -20,608 +21,60 @@ #include #include #include -#include +#include #include +#include +#include #include #include #include +#include +#include +#include + +#include "perf_event.h" + +#if 0 +#undef wrmsrl +#define wrmsrl(msr, val) \ +do { \ + trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\ + (unsigned long)(val)); \ + native_write_msr((msr), (u32)((u64)(val)), \ + (u32)((u64)(val) >> 32)); \ +} while (0) +#endif -static u64 perf_event_mask __read_mostly; - -/* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 4 - -/* The size of a BTS record in bytes: */ -#define BTS_RECORD_SIZE 24 - -/* The size of a per-cpu BTS buffer in bytes: */ -#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) - -/* The BTS overflow threshold in bytes from the end of the buffer: */ -#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) - - -/* - * Bits in the debugctlmsr controlling branch tracing. - */ -#define X86_DEBUGCTL_TR (1 << 6) -#define X86_DEBUGCTL_BTS (1 << 7) -#define X86_DEBUGCTL_BTINT (1 << 8) -#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) -#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) - -/* - * A debug store configuration. - * - * We only support architectures that use 64bit fields. - */ -struct debug_store { - u64 bts_buffer_base; - u64 bts_index; - u64 bts_absolute_maximum; - u64 bts_interrupt_threshold; - u64 pebs_buffer_base; - u64 pebs_index; - u64 pebs_absolute_maximum; - u64 pebs_interrupt_threshold; - u64 pebs_event_reset[MAX_PEBS_EVENTS]; -}; - -struct cpu_hw_events { - struct perf_event *events[X86_PMC_IDX_MAX]; - unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long interrupts; - int enabled; - struct debug_store *ds; -}; - -/* - * struct x86_pmu - generic x86 pmu - */ -struct x86_pmu { - const char *name; - int version; - int (*handle_irq)(struct pt_regs *); - void (*disable_all)(void); - void (*enable_all)(void); - void (*enable)(struct hw_perf_event *, int); - void (*disable)(struct hw_perf_event *, int); - unsigned eventsel; - unsigned perfctr; - u64 (*event_map)(int); - u64 (*raw_event)(u64); - int max_events; - int num_events; - int num_events_fixed; - int event_bits; - u64 event_mask; - int apic; - u64 max_period; - u64 intel_ctrl; - void (*enable_bts)(u64 config); - void (*disable_bts)(void); -}; - -static struct x86_pmu x86_pmu __read_mostly; +struct x86_pmu x86_pmu __read_mostly; -static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { +DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; -/* - * Not sure about some of these - */ -static const u64 p6_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, -}; - -static u64 p6_pmu_event_map(int hw_event) -{ - return p6_perfmon_event_map[hw_event]; -} - -/* - * Event setting that is specified not to count anything. - * We use this to effectively disable a counter. - * - * L2_RQSTS with 0 MESI unit mask. - */ -#define P6_NOP_EVENT 0x0000002EULL - -static u64 p6_pmu_raw_event(u64 hw_event) -{ -#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL -#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL -#define P6_EVNTSEL_INV_MASK 0x00800000ULL -#define P6_EVNTSEL_REG_MASK 0xFF000000ULL - -#define P6_EVNTSEL_MASK \ - (P6_EVNTSEL_EVENT_MASK | \ - P6_EVNTSEL_UNIT_MASK | \ - P6_EVNTSEL_EDGE_MASK | \ - P6_EVNTSEL_INV_MASK | \ - P6_EVNTSEL_REG_MASK) - - return hw_event & P6_EVNTSEL_MASK; -} - - -/* - * Intel PerfMon v3. Used on Core2 and later. - */ -static const u64 intel_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, -}; - -static u64 intel_pmu_event_map(int hw_event) -{ - return intel_perfmon_event_map[hw_event]; -} - -/* - * Generalized hw caching related hw_event table, filled - * in on a per model basis. A value of 0 means - * 'not supported', -1 means 'hw_event makes no sense on - * this CPU', any other value means the raw hw_event - * ID. - */ - -#define C(x) PERF_COUNT_HW_CACHE_##x - -static u64 __read_mostly hw_cache_event_ids +u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; - -static const u64 nehalem_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static const u64 core2_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static const u64 atom_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static u64 intel_pmu_raw_event(u64 hw_event) -{ -#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL -#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL -#define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL - -#define CORE_EVNTSEL_MASK \ - (CORE_EVNTSEL_EVENT_MASK | \ - CORE_EVNTSEL_UNIT_MASK | \ - CORE_EVNTSEL_EDGE_MASK | \ - CORE_EVNTSEL_INV_MASK | \ - CORE_EVNTSEL_REG_MASK) - - return hw_event & CORE_EVNTSEL_MASK; -} - -static const u64 amd_hw_cache_event_ids +u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ - [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ - [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ - [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ - [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -/* - * AMD Performance Monitor K7 and later. - */ -static const u64 amd_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, -}; - -static u64 amd_pmu_event_map(int hw_event) -{ - return amd_perfmon_event_map[hw_event]; -} - -static u64 amd_pmu_raw_event(u64 hw_event) -{ -#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL -#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL -#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL -#define K7_EVNTSEL_INV_MASK 0x000800000ULL -#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL - -#define K7_EVNTSEL_MASK \ - (K7_EVNTSEL_EVENT_MASK | \ - K7_EVNTSEL_UNIT_MASK | \ - K7_EVNTSEL_EDGE_MASK | \ - K7_EVNTSEL_INV_MASK | \ - K7_EVNTSEL_REG_MASK) - - return hw_event & K7_EVNTSEL_MASK; -} + [PERF_COUNT_HW_CACHE_RESULT_MAX]; /* * Propagate event elapsed time into the generic event. * Can only be executed on the CPU where the event is active. * Returns the delta events processed. */ -static u64 -x86_perf_event_update(struct perf_event *event, - struct hw_perf_event *hwc, int idx) +u64 x86_perf_event_update(struct perf_event *event) { - int shift = 64 - x86_pmu.event_bits; + struct hw_perf_event *hwc = &event->hw; + int shift = 64 - x86_pmu.cntval_bits; u64 prev_raw_count, new_raw_count; + int idx = hwc->idx; s64 delta; - if (idx == X86_PMC_IDX_FIXED_BTS) + if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; /* @@ -632,10 +85,10 @@ x86_perf_event_update(struct perf_event * count to the generic event atomically: */ again: - prev_raw_count = atomic64_read(&hwc->prev_count); - rdmsrl(hwc->event_base + idx, new_raw_count); + prev_raw_count = local64_read(&hwc->prev_count); + rdpmcl(hwc->event_base_rdpmc, new_raw_count); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; @@ -650,173 +103,152 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &hwc->period_left); + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); return new_raw_count; } +/* + * Find and validate any extra registers to set up. + */ +static int x86_pmu_extra_regs(u64 config, struct perf_event *event) +{ + struct hw_perf_event_extra *reg; + struct extra_reg *er; + + reg = &event->hw.extra_reg; + + if (!x86_pmu.extra_regs) + return 0; + + for (er = x86_pmu.extra_regs; er->msr; er++) { + if (er->event != (config & er->config_mask)) + continue; + if (event->attr.config1 & ~er->valid_mask) + return -EINVAL; + + reg->idx = er->idx; + reg->config = event->attr.config1; + reg->reg = er->msr; + break; + } + return 0; +} + static atomic_t active_events; static DEFINE_MUTEX(pmc_reserve_mutex); +#ifdef CONFIG_X86_LOCAL_APIC + static bool reserve_pmc_hardware(void) { -#ifdef CONFIG_X86_LOCAL_APIC int i; - if (nmi_watchdog == NMI_LOCAL_APIC) - disable_lapic_nmi_watchdog(); - - for (i = 0; i < x86_pmu.num_events; i++) { - if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) + for (i = 0; i < x86_pmu.num_counters; i++) { + if (!reserve_perfctr_nmi(x86_pmu_event_addr(i))) goto perfctr_fail; } - for (i = 0; i < x86_pmu.num_events; i++) { - if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) + for (i = 0; i < x86_pmu.num_counters; i++) { + if (!reserve_evntsel_nmi(x86_pmu_config_addr(i))) goto eventsel_fail; } -#endif return true; -#ifdef CONFIG_X86_LOCAL_APIC eventsel_fail: for (i--; i >= 0; i--) - release_evntsel_nmi(x86_pmu.eventsel + i); + release_evntsel_nmi(x86_pmu_config_addr(i)); - i = x86_pmu.num_events; + i = x86_pmu.num_counters; perfctr_fail: for (i--; i >= 0; i--) - release_perfctr_nmi(x86_pmu.perfctr + i); - - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); + release_perfctr_nmi(x86_pmu_event_addr(i)); return false; -#endif } static void release_pmc_hardware(void) { -#ifdef CONFIG_X86_LOCAL_APIC int i; - for (i = 0; i < x86_pmu.num_events; i++) { - release_perfctr_nmi(x86_pmu.perfctr + i); - release_evntsel_nmi(x86_pmu.eventsel + i); + for (i = 0; i < x86_pmu.num_counters; i++) { + release_perfctr_nmi(x86_pmu_event_addr(i)); + release_evntsel_nmi(x86_pmu_config_addr(i)); } - - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); -#endif -} - -static inline bool bts_available(void) -{ - return x86_pmu.enable_bts != NULL; } -static inline void init_debug_store_on_cpu(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - return; - - wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, - (u32)((u64)(unsigned long)ds), - (u32)((u64)(unsigned long)ds >> 32)); -} +#else -static inline void fini_debug_store_on_cpu(int cpu) -{ - if (!per_cpu(cpu_hw_events, cpu).ds) - return; +static bool reserve_pmc_hardware(void) { return true; } +static void release_pmc_hardware(void) {} - wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); -} +#endif -static void release_bts_hardware(void) +static bool check_hw_exists(void) { - int cpu; - - if (!bts_available()) - return; + u64 val, val_new = 0; + int i, reg, ret = 0; - get_online_cpus(); - - for_each_online_cpu(cpu) - fini_debug_store_on_cpu(cpu); - - for_each_possible_cpu(cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - continue; - - per_cpu(cpu_hw_events, cpu).ds = NULL; - - kfree((void *)(unsigned long)ds->bts_buffer_base); - kfree(ds); + /* + * Check to see if the BIOS enabled any of the counters, if so + * complain and bail. + */ + for (i = 0; i < x86_pmu.num_counters; i++) { + reg = x86_pmu_config_addr(i); + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) + goto bios_fail; + } + + if (x86_pmu.num_counters_fixed) { + reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + ret = rdmsrl_safe(reg, &val); + if (ret) + goto msr_fail; + for (i = 0; i < x86_pmu.num_counters_fixed; i++) { + if (val & (0x03 << i*4)) + goto bios_fail; + } } - put_online_cpus(); -} - -static int reserve_bts_hardware(void) -{ - int cpu, err = 0; - - if (!bts_available()) - return 0; - - get_online_cpus(); - - for_each_possible_cpu(cpu) { - struct debug_store *ds; - void *buffer; - - err = -ENOMEM; - buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); - if (unlikely(!buffer)) - break; - - ds = kzalloc(sizeof(*ds), GFP_KERNEL); - if (unlikely(!ds)) { - kfree(buffer); - break; - } + /* + * Now write a value and read it back to see if it matches, + * this is needed to detect certain hardware emulators (qemu/kvm) + * that don't trap on the MSR access and always return 0s. + */ + val = 0xabcdUL; + ret = checking_wrmsrl(x86_pmu_event_addr(0), val); + ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new); + if (ret || val != val_new) + goto msr_fail; - ds->bts_buffer_base = (u64)(unsigned long)buffer; - ds->bts_index = ds->bts_buffer_base; - ds->bts_absolute_maximum = - ds->bts_buffer_base + BTS_BUFFER_SIZE; - ds->bts_interrupt_threshold = - ds->bts_absolute_maximum - BTS_OVFL_TH; + return true; - per_cpu(cpu_hw_events, cpu).ds = ds; - err = 0; - } +bios_fail: + /* + * We still allow the PMU driver to operate: + */ + printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n"); + printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val); - if (err) - release_bts_hardware(); - else { - for_each_online_cpu(cpu) - init_debug_store_on_cpu(cpu); - } + return true; - put_online_cpus(); +msr_fail: + printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n"); - return err; + return false; } static void hw_perf_event_destroy(struct perf_event *event) { if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { release_pmc_hardware(); - release_bts_hardware(); + release_ds_buffers(); mutex_unlock(&pmc_reserve_mutex); } } @@ -827,8 +259,9 @@ static inline int x86_pmu_initialized(vo } static inline int -set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) +set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event) { + struct perf_event_attr *attr = &event->attr; unsigned int cache_type, cache_op, cache_result; u64 config, val; @@ -855,95 +288,20 @@ set_ext_hw_attr(struct hw_perf_event *hw return -EINVAL; hwc->config |= val; - - return 0; -} - -static void intel_pmu_enable_bts(u64 config) -{ - unsigned long debugctlmsr; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr |= X86_DEBUGCTL_TR; - debugctlmsr |= X86_DEBUGCTL_BTS; - debugctlmsr |= X86_DEBUGCTL_BTINT; - - if (!(config & ARCH_PERFMON_EVENTSEL_OS)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; - - if (!(config & ARCH_PERFMON_EVENTSEL_USR)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; - - update_debugctlmsr(debugctlmsr); -} - -static void intel_pmu_disable_bts(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - unsigned long debugctlmsr; - - if (!cpuc->ds) - return; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr &= - ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | - X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); - - update_debugctlmsr(debugctlmsr); + attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result]; + return x86_pmu_extra_regs(val, event); } -/* - * Setup the hardware configuration for a given attr_type - */ -static int __hw_perf_event_init(struct perf_event *event) +int x86_setup_perfctr(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; u64 config; - int err; - - if (!x86_pmu_initialized()) - return -ENODEV; - - err = 0; - if (!atomic_inc_not_zero(&active_events)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_events) == 0) { - if (!reserve_pmc_hardware()) - err = -EBUSY; - else - err = reserve_bts_hardware(); - } - if (!err) - atomic_inc(&active_events); - mutex_unlock(&pmc_reserve_mutex); - } - if (err) - return err; - - event->destroy = hw_perf_event_destroy; - - /* - * Generate PMC IRQs: - * (keep 'enabled' bit clear for now) - */ - hwc->config = ARCH_PERFMON_EVENTSEL_INT; - - /* - * Count user and OS events unless requested not to. - */ - if (!attr->exclude_user) - hwc->config |= ARCH_PERFMON_EVENTSEL_USR; - if (!attr->exclude_kernel) - hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (!hwc->sample_period) { + if (!is_sampling_event(event)) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); } else { /* * If we have a PMU initialized but no APIC @@ -955,16 +313,11 @@ static int __hw_perf_event_init(struct p return -EOPNOTSUPP; } - /* - * Raw hw_event type provide the config in the hw_event structure - */ - if (attr->type == PERF_TYPE_RAW) { - hwc->config |= x86_pmu.raw_event(attr->config); - return 0; - } + if (attr->type == PERF_TYPE_RAW) + return x86_pmu_extra_regs(event->attr.config, event); if (attr->type == PERF_TYPE_HW_CACHE) - return set_ext_hw_attr(hwc, attr); + return set_ext_hw_attr(hwc, event); if (attr->config >= x86_pmu.max_events) return -EINVAL; @@ -983,14 +336,14 @@ static int __hw_perf_event_init(struct p /* * Branch tracing: */ - if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && - (hwc->sample_period == 1)) { + if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS && + !attr->freq && hwc->sample_period == 1) { /* BTS is not supported by this architecture. */ - if (!bts_available()) + if (!x86_pmu.bts_active) return -EOPNOTSUPP; /* BTS is currently only allowed for user-mode. */ - if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + if (!attr->exclude_kernel) return -EOPNOTSUPP; } @@ -999,216 +352,574 @@ static int __hw_perf_event_init(struct p return 0; } -static void p6_pmu_disable_all(void) +/* + * check that branch_sample_type is compatible with + * settings needed for precise_ip > 1 which implies + * using the LBR to capture ALL taken branches at the + * priv levels of the measurement + */ +static inline int precise_br_compat(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 val; - - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - barrier(); + u64 m = event->attr.branch_sample_type; + u64 b = 0; - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} + /* must capture all branches */ + if (!(m & PERF_SAMPLE_BRANCH_ANY)) + return 0; -static void intel_pmu_disable_all(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER; - if (!cpuc->enabled) - return; + if (!event->attr.exclude_user) + b |= PERF_SAMPLE_BRANCH_USER; - cpuc->enabled = 0; - barrier(); + if (!event->attr.exclude_kernel) + b |= PERF_SAMPLE_BRANCH_KERNEL; - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + /* + * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86 + */ - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) - intel_pmu_disable_bts(); + return m == b; } -static void amd_pmu_disable_all(void) +int x86_pmu_hw_config(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - int idx; + if (event->attr.precise_ip) { + int precise = 0; - if (!cpuc->enabled) - return; + /* Support for constant skid */ + if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) { + precise++; + + /* Support for IP fixup */ + if (x86_pmu.lbr_nr) + precise++; + } + + if (event->attr.precise_ip > precise) + return -EOPNOTSUPP; + /* + * check that PEBS LBR correction does not conflict with + * whatever the user is asking with attr->branch_sample_type + */ + if (event->attr.precise_ip > 1 && + x86_pmu.intel_cap.pebs_format < 2) { + u64 *br_type = &event->attr.branch_sample_type; + + if (has_branch_stack(event)) { + if (!precise_br_compat(event)) + return -EOPNOTSUPP; + + /* branch_sample_type is compatible */ + + } else { + /* + * user did not specify branch_sample_type + * + * For PEBS fixups, we capture all + * the branches at the priv level of the + * event. + */ + *br_type = PERF_SAMPLE_BRANCH_ANY; + + if (!event->attr.exclude_user) + *br_type |= PERF_SAMPLE_BRANCH_USER; + + if (!event->attr.exclude_kernel) + *br_type |= PERF_SAMPLE_BRANCH_KERNEL; + } + } + } - cpuc->enabled = 0; /* - * ensure we write the disable before we start disabling the - * events proper, so that amd_pmu_enable_event() does the - * right thing. + * Generate PMC IRQs: + * (keep 'enabled' bit clear for now) */ - barrier(); + event->hw.config = ARCH_PERFMON_EVENTSEL_INT; - for (idx = 0; idx < x86_pmu.num_events; idx++) { - u64 val; + /* + * Count user and OS events unless requested not to + */ + if (!event->attr.exclude_user) + event->hw.config |= ARCH_PERFMON_EVENTSEL_USR; + if (!event->attr.exclude_kernel) + event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; - if (!test_bit(idx, cpuc->active_mask)) - continue; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) - continue; - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } + if (event->attr.type == PERF_TYPE_RAW) + event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; + + return x86_setup_perfctr(event); } -void hw_perf_disable(void) +/* + * Setup the hardware configuration for a given attr_type + */ +static int __x86_pmu_event_init(struct perf_event *event) { + int err; + if (!x86_pmu_initialized()) - return; - return x86_pmu.disable_all(); + return -ENODEV; + + err = 0; + if (!atomic_inc_not_zero(&active_events)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&active_events) == 0) { + if (!reserve_pmc_hardware()) + err = -EBUSY; + else + reserve_ds_buffers(); + } + if (!err) + atomic_inc(&active_events); + mutex_unlock(&pmc_reserve_mutex); + } + if (err) + return err; + + event->destroy = hw_perf_event_destroy; + + event->hw.idx = -1; + event->hw.last_cpu = -1; + event->hw.last_tag = ~0ULL; + + /* mark unused */ + event->hw.extra_reg.idx = EXTRA_REG_NONE; + + /* mark not used */ + event->hw.extra_reg.idx = EXTRA_REG_NONE; + event->hw.branch_reg.idx = EXTRA_REG_NONE; + + return x86_pmu.hw_config(event); } -static void p6_pmu_enable_all(void) +void x86_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - unsigned long val; - - if (cpuc->enabled) - return; + int idx; - cpuc->enabled = 1; - barrier(); + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + u64 val; - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(x86_pmu_config_addr(idx), val); + if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) + continue; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(x86_pmu_config_addr(idx), val); + } } -static void intel_pmu_enable_all(void) +static void x86_pmu_disable(struct pmu *pmu) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - if (cpuc->enabled) + if (!x86_pmu_initialized()) return; - cpuc->enabled = 1; + if (!cpuc->enabled) + return; + + cpuc->n_added = 0; + cpuc->enabled = 0; barrier(); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + x86_pmu.disable_all(); +} + +void x86_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - struct perf_event *event = - cpuc->events[X86_PMC_IDX_FIXED_BTS]; + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct hw_perf_event *hwc = &cpuc->events[idx]->hw; - if (WARN_ON_ONCE(!event)) - return; + if (!test_bit(idx, cpuc->active_mask)) + continue; - intel_pmu_enable_bts(event->hw.config); + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } } -static void amd_pmu_enable_all(void) +static struct pmu pmu; + +static inline int is_x86_event(struct perf_event *event) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - int idx; + return event->pmu == &pmu; +} - if (cpuc->enabled) - return; +/* + * Event scheduler state: + * + * Assign events iterating over all events and counters, beginning + * with events with least weights first. Keep the current iterator + * state in struct sched_state. + */ +struct sched_state { + int weight; + int event; /* event index */ + int counter; /* counter index */ + int unassigned; /* number of events to be assigned left */ + unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; +}; - cpuc->enabled = 1; - barrier(); +/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */ +#define SCHED_STATES_MAX 2 - for (idx = 0; idx < x86_pmu.num_events; idx++) { - struct perf_event *event = cpuc->events[idx]; - u64 val; +struct perf_sched { + int max_weight; + int max_events; + struct event_constraint **constraints; + struct sched_state state; + int saved_states; + struct sched_state saved[SCHED_STATES_MAX]; +}; - if (!test_bit(idx, cpuc->active_mask)) - continue; +/* + * Initialize interator that runs through all events and counters. + */ +static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c, + int num, int wmin, int wmax) +{ + int idx; + + memset(sched, 0, sizeof(*sched)); + sched->max_events = num; + sched->max_weight = wmax; + sched->constraints = c; - val = event->hw.config; - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + for (idx = 0; idx < num; idx++) { + if (c[idx]->weight == wmin) + break; } + + sched->state.event = idx; /* start with min weight */ + sched->state.weight = wmin; + sched->state.unassigned = num; } -void hw_perf_enable(void) +static void perf_sched_save_state(struct perf_sched *sched) { - if (!x86_pmu_initialized()) + if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) return; - x86_pmu.enable_all(); + + sched->saved[sched->saved_states] = sched->state; + sched->saved_states++; } -static inline u64 intel_pmu_get_status(void) +static bool perf_sched_restore_state(struct perf_sched *sched) { - u64 status; + if (!sched->saved_states) + return false; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + sched->saved_states--; + sched->state = sched->saved[sched->saved_states]; - return status; + /* continue with next counter: */ + clear_bit(sched->state.counter++, sched->state.used); + + return true; } -static inline void intel_pmu_ack_status(u64 ack) +/* + * Select a counter for the current event to schedule. Return true on + * success. + */ +static bool __perf_sched_find_counter(struct perf_sched *sched) { - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); + struct event_constraint *c; + int idx; + + if (!sched->state.unassigned) + return false; + + if (sched->state.event >= sched->max_events) + return false; + + c = sched->constraints[sched->state.event]; + + /* Prefer fixed purpose counters */ + if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) { + idx = INTEL_PMC_IDX_FIXED; + for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { + if (!__test_and_set_bit(idx, sched->state.used)) + goto done; + } + } + /* Grab the first unused counter starting with idx */ + idx = sched->state.counter; + for_each_set_bit_cont(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) { + if (!__test_and_set_bit(idx, sched->state.used)) + goto done; + } + + return false; + +done: + sched->state.counter = idx; + + if (c->overlap) + perf_sched_save_state(sched); + + return true; } -static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) +static bool perf_sched_find_counter(struct perf_sched *sched) { - (void)checking_wrmsrl(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); + while (!__perf_sched_find_counter(sched)) { + if (!perf_sched_restore_state(sched)) + return false; + } + + return true; } -static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) +/* + * Go through all unassigned events and find the next one to schedule. + * Take events with the least weight first. Return true on success. + */ +static bool perf_sched_next_event(struct perf_sched *sched) { - (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); + struct event_constraint *c; + + if (!sched->state.unassigned || !--sched->state.unassigned) + return false; + + do { + /* next event */ + sched->state.event++; + if (sched->state.event >= sched->max_events) { + /* next weight */ + sched->state.event = 0; + sched->state.weight++; + if (sched->state.weight > sched->max_weight) + return false; + } + c = sched->constraints[sched->state.event]; + } while (c->weight != sched->state.weight); + + sched->state.counter = 0; /* start with first counter */ + + return true; } -static inline void -intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) +/* + * Assign a counter for each event. + */ +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int *assign) { - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, mask; + struct perf_sched sched; - mask = 0xfULL << (idx * 4); + perf_sched_init(&sched, constraints, n, wmin, wmax); - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - (void)checking_wrmsrl(hwc->config_base, ctrl_val); + do { + if (!perf_sched_find_counter(&sched)) + break; /* failed */ + if (assign) + assign[sched.state.event] = sched.state.counter; + } while (perf_sched_next_event(&sched)); + + return sched.state.unassigned; } -static inline void -p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) +int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 val = P6_NOP_EVENT; + struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int i, wmin, wmax, num = 0; + struct hw_perf_event *hwc; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + bitmap_zero(used_mask, X86_PMC_IDX_MAX); + + for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { + c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); + constraints[i] = c; + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); + } + + /* + * fastpath, try to reuse previous register + */ + for (i = 0; i < n; i++) { + hwc = &cpuc->event_list[i]->hw; + c = constraints[i]; + + /* never assigned */ + if (hwc->idx == -1) + break; + + /* constraint still honored */ + if (!test_bit(hwc->idx, c->idxmsk)) + break; + + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; - (void)checking_wrmsrl(hwc->config_base + idx, val); + __set_bit(hwc->idx, used_mask); + if (assign) + assign[i] = hwc->idx; + } + + /* slow path */ + if (i != n) + num = perf_assign_events(constraints, n, wmin, wmax, assign); + /* + * scheduling failed or is just a simulation, + * free resources if necessary + */ + if (!assign || num) { + for (i = 0; i < n; i++) { + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]); + } + } + return num ? -ENOSPC : 0; } -static inline void -intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) +/* + * dogrp: true if must collect siblings events (group) + * returns total number of events and error code + */ +static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp) { - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - intel_pmu_disable_bts(); - return; + struct perf_event *event; + int n, max_count; + + max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed; + + /* current number of events already accepted */ + n = cpuc->n_events; + + if (is_x86_event(leader)) { + if (n >= max_count) + return -ENOSPC; + cpuc->event_list[n] = leader; + n++; } + if (!dogrp) + return n; - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_disable_fixed(hwc, idx); - return; + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (!is_x86_event(event) || + event->state <= PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -ENOSPC; + + cpuc->event_list[n] = event; + n++; + } + return n; +} + +static inline void x86_assign_hw_event(struct perf_event *event, + struct cpu_hw_events *cpuc, int i) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = cpuc->assign[i]; + hwc->last_cpu = smp_processor_id(); + hwc->last_tag = ++cpuc->tags[i]; + + if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) { + hwc->config_base = 0; + hwc->event_base = 0; + } else if (hwc->idx >= INTEL_PMC_IDX_FIXED) { + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - INTEL_PMC_IDX_FIXED); + hwc->event_base_rdpmc = (hwc->idx - INTEL_PMC_IDX_FIXED) | 1<<30; + } else { + hwc->config_base = x86_pmu_config_addr(hwc->idx); + hwc->event_base = x86_pmu_event_addr(hwc->idx); + hwc->event_base_rdpmc = x86_pmu_rdpmc_index(hwc->idx); } +} - x86_pmu_disable_event(hwc, idx); +static inline int match_prev_assignment(struct hw_perf_event *hwc, + struct cpu_hw_events *cpuc, + int i) +{ + return hwc->idx == cpuc->assign[i] && + hwc->last_cpu == smp_processor_id() && + hwc->last_tag == cpuc->tags[i]; } -static inline void -amd_pmu_disable_event(struct hw_perf_event *hwc, int idx) +static void x86_pmu_start(struct perf_event *event, int flags); + +static void x86_pmu_enable(struct pmu *pmu) { - x86_pmu_disable_event(hwc, idx); + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_event *event; + struct hw_perf_event *hwc; + int i, added = cpuc->n_added; + + if (!x86_pmu_initialized()) + return; + + if (cpuc->enabled) + return; + + if (cpuc->n_added) { + int n_running = cpuc->n_events - cpuc->n_added; + /* + * apply assignment obtained either from + * hw_perf_group_sched_in() or x86_pmu_enable() + * + * step1: save events moving to new counters + * step2: reprogram moved events into new counters + */ + for (i = 0; i < n_running; i++) { + event = cpuc->event_list[i]; + hwc = &event->hw; + + /* + * we can avoid reprogramming counter if: + * - assigned same counter as last time + * - running on same CPU as last time + * - no other event has used the counter since + */ + if (hwc->idx == -1 || + match_prev_assignment(hwc, cpuc, i)) + continue; + + /* + * Ensure we don't accidentally enable a stopped + * counter simply because we rescheduled. + */ + if (hwc->state & PERF_HES_STOPPED) + hwc->state |= PERF_HES_ARCH; + + x86_pmu_stop(event, PERF_EF_UPDATE); + } + + for (i = 0; i < cpuc->n_events; i++) { + event = cpuc->event_list[i]; + hwc = &event->hw; + + if (!match_prev_assignment(hwc, cpuc, i)) + x86_assign_hw_event(event, cpuc, i); + else if (i < n_running) + continue; + + if (hwc->state & PERF_HES_ARCH) + continue; + + x86_pmu_start(event, PERF_EF_RELOAD); + } + cpuc->n_added = 0; + perf_events_lapic_init(); + } + + cpuc->enabled = 1; + barrier(); + + x86_pmu.enable_all(added); } static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); @@ -1217,30 +928,29 @@ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_M * Set the next IRQ period, based on the hwc->period_left value. * To be called with the event disabled in hw: */ -static int -x86_perf_event_set_period(struct perf_event *event, - struct hw_perf_event *hwc, int idx) +int x86_perf_event_set_period(struct perf_event *event) { - s64 left = atomic64_read(&hwc->period_left); + struct hw_perf_event *hwc = &event->hw; + s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; - int err, ret = 0; + int ret = 0, idx = hwc->idx; - if (idx == X86_PMC_IDX_FIXED_BTS) + if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; /* - * If we are way outside a reasoable range then just skip forward: + * If we are way outside a reasonable range then just skip forward: */ if (unlikely(left <= -period)) { left = period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } @@ -1259,192 +969,119 @@ x86_perf_event_set_period(struct perf_ev * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ - atomic64_set(&hwc->prev_count, (u64)-left); + local64_set(&hwc->prev_count, (u64)-left); - err = checking_wrmsrl(hwc->event_base + idx, - (u64)(-left) & x86_pmu.event_mask); + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); + + /* + * Due to erratum on certan cpu we need + * a second write to be sure the register + * is updated properly + */ + if (x86_pmu.perfctr_second_write) { + wrmsrl(hwc->event_base, + (u64)(-left) & x86_pmu.cntval_mask); + } perf_event_update_userpage(event); return ret; } -static inline void -intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) -{ - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, bits, mask; - int err; - - /* - * Enable IRQ generation (0x8), - * and enable ring-3 counting (0x2) and ring-0 counting (0x1) - * if requested: - */ - bits = 0x8ULL; - if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) - bits |= 0x2; - if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) - bits |= 0x1; - bits <<= (idx * 4); - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - ctrl_val |= bits; - err = checking_wrmsrl(hwc->config_base, ctrl_val); -} - -static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) +void x86_pmu_enable_event(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - u64 val; - - val = hwc->config; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - (void)checking_wrmsrl(hwc->config_base + idx, val); -} - - -static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) -{ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - if (!__get_cpu_var(cpu_hw_events).enabled) - return; - intel_pmu_enable_bts(hwc->config); - return; - } - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_enable_fixed(hwc, idx); - return; + if (cpuc->enabled) { + __x86_pmu_enable_event(&event->hw, + ARCH_PERFMON_EVENTSEL_ENABLE); } - - x86_pmu_enable_event(hwc, idx); } -static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) +/* + * Add a single event to the PMU. + * + * The event is added to the group of enabled events + * but only if it can be scehduled with existing events. + */ +static int x86_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc; + int assign[X86_PMC_IDX_MAX]; + int n, n0, ret; - if (cpuc->enabled) - x86_pmu_enable_event(hwc, idx); -} + hwc = &event->hw; -static int -fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) -{ - unsigned int hw_event; + perf_pmu_disable(event->pmu); + n0 = cpuc->n_events; + ret = n = collect_events(cpuc, event, false); + if (ret < 0) + goto out; - hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; - if (unlikely((hw_event == - x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && - (hwc->sample_period == 1))) - return X86_PMC_IDX_FIXED_BTS; + /* + * If group events scheduling transaction was started, + * skip the schedulability test here, it will be performed + * at commit time (->commit_txn) as a whole + */ + if (cpuc->group_flag & PERF_EVENT_TXN) + goto done_collect; - if (!x86_pmu.num_events_fixed) - return -1; + ret = x86_pmu.schedule_events(cpuc, n, assign); + if (ret) + goto out; + /* + * copy new assignment, now we know it is possible + * will be used by hw_perf_enable() + */ + memcpy(cpuc->assign, assign, n*sizeof(int)); - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) - return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) - return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) - return X86_PMC_IDX_FIXED_BUS_CYCLES; +done_collect: + cpuc->n_events = n; + cpuc->n_added += n - n0; + cpuc->n_txn += n - n0; - return -1; + ret = 0; +out: + perf_pmu_enable(event->pmu); + return ret; } -/* - * Find a PMC slot for the freshly enabled / scheduled in event: - */ -static int x86_pmu_enable(struct perf_event *event) +static void x86_pmu_start(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - int idx; - - idx = fixed_mode_idx(event, hwc); - if (idx == X86_PMC_IDX_FIXED_BTS) { - /* BTS is already occupied. */ - if (test_and_set_bit(idx, cpuc->used_mask)) - return -EAGAIN; - - hwc->config_base = 0; - hwc->event_base = 0; - hwc->idx = idx; - } else if (idx >= 0) { - /* - * Try to get the fixed event, if that is already taken - * then try to get a generic event: - */ - if (test_and_set_bit(idx, cpuc->used_mask)) - goto try_generic; + int idx = event->hw.idx; - hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - /* - * We set it so that event_base + idx in wrmsr/rdmsr maps to - * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: - */ - hwc->event_base = - MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; - hwc->idx = idx; - } else { - idx = hwc->idx; - /* Try to get the previous generic event again */ - if (test_and_set_bit(idx, cpuc->used_mask)) { -try_generic: - idx = find_first_zero_bit(cpuc->used_mask, - x86_pmu.num_events); - if (idx == x86_pmu.num_events) - return -EAGAIN; + if (WARN_ON_ONCE(idx == -1)) + return; - set_bit(idx, cpuc->used_mask); - hwc->idx = idx; - } - hwc->config_base = x86_pmu.eventsel; - hwc->event_base = x86_pmu.perfctr; + if (flags & PERF_EF_RELOAD) { + WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + x86_perf_event_set_period(event); } - perf_events_lapic_init(); - - x86_pmu.disable(hwc, idx); + event->hw.state = 0; cpuc->events[idx] = event; - set_bit(idx, cpuc->active_mask); - - x86_perf_event_set_period(event, hwc, idx); - x86_pmu.enable(hwc, idx); - + __set_bit(idx, cpuc->active_mask); + __set_bit(idx, cpuc->running); + x86_pmu.enable(event); perf_event_update_userpage(event); - - return 0; -} - -static void x86_pmu_unthrottle(struct perf_event *event) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - - if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || - cpuc->events[hwc->idx] != event)) - return; - - x86_pmu.enable(hwc, hwc->idx); } void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; + u64 pebs; struct cpu_hw_events *cpuc; unsigned long flags; int cpu, idx; - if (!x86_pmu.num_events) + if (!x86_pmu.num_counters) return; local_irq_save(flags); @@ -1457,18 +1094,20 @@ void perf_event_print_debug(void) rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); pr_info("\n"); pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); pr_info("CPU#%d: status: %016llx\n", cpu, status); pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); + pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); } - pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); + pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); - for (idx = 0; idx < x86_pmu.num_events; idx++) { - rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); - rdmsrl(x86_pmu.perfctr + idx, pmc_count); + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl); + rdmsrl(x86_pmu_event_addr(idx), pmc_count); prev_left = per_cpu(pmc_prev_left[idx], cpu); @@ -1479,7 +1118,7 @@ void perf_event_print_debug(void) pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } - for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -1488,288 +1127,108 @@ void perf_event_print_debug(void) local_irq_restore(flags); } -static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) +void x86_pmu_stop(struct perf_event *event, int flags) { - struct debug_store *ds = cpuc->ds; - struct bts_record { - u64 from; - u64 to; - u64 flags; - }; - struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; - struct bts_record *at, *top; - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_sample_data data; - struct pt_regs regs; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; - if (!event) - return; + if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { + x86_pmu.disable(event); + cpuc->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + } - if (!ds) - return; + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + x86_perf_event_update(event); + hwc->state |= PERF_HES_UPTODATE; + } +} - at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; - top = (struct bts_record *)(unsigned long)ds->bts_index; - - if (top <= at) - return; - - ds->bts_index = ds->bts_buffer_base; - - - data.period = event->hw.last_period; - data.addr = 0; - regs.ip = 0; +static void x86_pmu_del(struct perf_event *event, int flags) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int i; /* - * Prepare a generic sample, i.e. fill in the invariant fields. - * We will overwrite the from and to address before we output - * the sample. + * If we're called during a txn, we don't need to do anything. + * The events never got scheduled and ->cancel_txn will truncate + * the event_list. */ - perf_prepare_sample(&header, &data, event, ®s); - - if (perf_output_begin(&handle, event, - header.size * (top - at), 1, 1)) + if (cpuc->group_flag & PERF_EVENT_TXN) return; - for (; at < top; at++) { - data.ip = at->from; - data.addr = at->to; - - perf_output_sample(&handle, &header, &data, event); - } - - perf_output_end(&handle); - - /* There's new data available. */ - event->hw.interrupts++; - event->pending_kill = POLL_IN; -} - -static void x86_pmu_disable(struct perf_event *event) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - - /* - * Must be done before we disable, otherwise the nmi handler - * could reenable again: - */ - clear_bit(idx, cpuc->active_mask); - x86_pmu.disable(hwc, idx); - - /* - * Make sure the cleared pointer becomes visible before we - * (potentially) free the event: - */ - barrier(); - - /* - * Drain the remaining delta count out of a event - * that we are disabling: - */ - x86_perf_event_update(event, hwc, idx); - - /* Drain the remaining BTS records. */ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) - intel_pmu_drain_bts_buffer(cpuc); - - cpuc->events[idx] = NULL; - clear_bit(idx, cpuc->used_mask); - - perf_event_update_userpage(event); -} - -/* - * Save and restart an expired event. Called by NMI contexts, - * so it has to be careful about preempting normal event ops: - */ -static int intel_pmu_save_and_restart(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int idx = hwc->idx; - int ret; - - x86_perf_event_update(event, hwc, idx); - ret = x86_perf_event_set_period(event, hwc, idx); - - if (event->state == PERF_EVENT_STATE_ACTIVE) - intel_pmu_enable_event(hwc, idx); - - return ret; -} - -static void intel_pmu_reset(void) -{ - struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; - unsigned long flags; - int idx; + x86_pmu_stop(event, PERF_EF_UPDATE); - if (!x86_pmu.num_events) - return; + for (i = 0; i < cpuc->n_events; i++) { + if (event == cpuc->event_list[i]) { - local_irq_save(flags); + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(cpuc, event); - printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + while (++i < cpuc->n_events) + cpuc->event_list[i-1] = cpuc->event_list[i]; - for (idx = 0; idx < x86_pmu.num_events; idx++) { - checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); - checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); - } - for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { - checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + --cpuc->n_events; + break; + } } - if (ds) - ds->bts_index = ds->bts_buffer_base; - - local_irq_restore(flags); + perf_event_update_userpage(event); } -static int p6_pmu_handle_irq(struct pt_regs *regs) +int x86_pmu_handle_irq(struct pt_regs *regs) { struct perf_sample_data data; struct cpu_hw_events *cpuc; struct perf_event *event; - struct hw_perf_event *hwc; int idx, handled = 0; u64 val; - data.addr = 0; - cpuc = &__get_cpu_var(cpu_hw_events); - for (idx = 0; idx < x86_pmu.num_events; idx++) { - if (!test_bit(idx, cpuc->active_mask)) - continue; - - event = cpuc->events[idx]; - hwc = &event->hw; - - val = x86_perf_event_update(event, hwc, idx); - if (val & (1ULL << (x86_pmu.event_bits - 1))) - continue; - - /* - * event overflow - */ - handled = 1; - data.period = event->hw.last_period; - - if (!x86_perf_event_set_period(event, hwc, idx)) - continue; - - if (perf_event_overflow(event, 1, &data, regs)) - p6_pmu_disable_event(hwc, idx); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; -} - -/* - * This handler is triggered by the local APIC, so the APIC IRQ handling - * rules apply: - */ -static int intel_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - int bit, loops; - u64 ack, status; - - data.addr = 0; - - cpuc = &__get_cpu_var(cpu_hw_events); - - perf_disable(); - intel_pmu_drain_bts_buffer(cpuc); - status = intel_pmu_get_status(); - if (!status) { - perf_enable(); - return 0; - } - - loops = 0; -again: - if (++loops > 100) { - WARN_ONCE(1, "perfevents: irq loop stuck!\n"); - perf_event_print_debug(); - intel_pmu_reset(); - perf_enable(); - return 1; - } - - inc_irq_stat(apic_perf_irqs); - ack = status; - for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_event *event = cpuc->events[bit]; - - clear_bit(bit, (unsigned long *) &status); - if (!test_bit(bit, cpuc->active_mask)) - continue; - - if (!intel_pmu_save_and_restart(event)) - continue; - - data.period = event->hw.last_period; - - if (perf_event_overflow(event, 1, &data, regs)) - intel_pmu_disable_event(&event->hw, bit); - } - - intel_pmu_ack_status(ack); - /* - * Repeat if there is more work to be done: + * Some chipsets need to unmask the LVTPC in a particular spot + * inside the nmi handler. As a result, the unmasking was pushed + * into all the nmi handlers. + * + * This generic handler doesn't seem to have any issues where the + * unmasking occurs so it was left at the top. */ - status = intel_pmu_get_status(); - if (status) - goto again; - - perf_enable(); - - return 1; -} - -static int amd_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_events *cpuc; - struct perf_event *event; - struct hw_perf_event *hwc; - int idx, handled = 0; - u64 val; - - data.addr = 0; - - cpuc = &__get_cpu_var(cpu_hw_events); + apic_write(APIC_LVTPC, APIC_DM_NMI); - for (idx = 0; idx < x86_pmu.num_events; idx++) { - if (!test_bit(idx, cpuc->active_mask)) + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) { + /* + * Though we deactivated the counter some cpus + * might still deliver spurious interrupts still + * in flight. Catch them: + */ + if (__test_and_clear_bit(idx, cpuc->running)) + handled++; continue; + } event = cpuc->events[idx]; - hwc = &event->hw; - val = x86_perf_event_update(event, hwc, idx); - if (val & (1ULL << (x86_pmu.event_bits - 1))) + val = x86_perf_event_update(event); + if (val & (1ULL << (x86_pmu.cntval_bits - 1))) continue; /* * event overflow */ - handled = 1; - data.period = event->hw.last_period; + handled++; + perf_sample_data_init(&data, 0, event->hw.last_period); - if (!x86_perf_event_set_period(event, hwc, idx)) + if (!x86_perf_event_set_period(event)) continue; - if (perf_event_overflow(event, 1, &data, regs)) - amd_pmu_disable_event(hwc, idx); + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); } if (handled) @@ -1778,28 +1237,8 @@ static int amd_pmu_handle_irq(struct pt_ return handled; } -void smp_perf_pending_interrupt(struct pt_regs *regs) -{ - irq_enter(); - ack_APIC_irq(); - inc_irq_stat(apic_pending_irqs); - perf_event_do_pending(); - irq_exit(); -} - -void set_perf_event_pending(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - if (!x86_pmu.apic || !x86_pmu_initialized()) - return; - - apic->send_IPI_self(LOCAL_PENDING_VECTOR); -#endif -} - void perf_events_lapic_init(void) { -#ifdef CONFIG_X86_LOCAL_APIC if (!x86_pmu.apic || !x86_pmu_initialized()) return; @@ -1807,15 +1246,22 @@ void perf_events_lapic_init(void) * Always use NMI for PMU */ apic_write(APIC_LVTPC, APIC_DM_NMI); -#endif } +struct pmu_nmi_state { + unsigned int marked; + int handled; +}; + +static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi); + static int __kprobes perf_event_nmi_handler(struct notifier_block *self, unsigned long cmd, void *__args) { struct die_args *args = __args; - struct pt_regs *regs; + unsigned int this_nmi; + int handled; if (!atomic_read(&active_events)) return NOTIFY_DONE; @@ -1824,24 +1270,45 @@ perf_event_nmi_handler(struct notifier_b case DIE_NMI: case DIE_NMI_IPI: break; - + case DIE_NMIUNKNOWN: + this_nmi = percpu_read(irq_stat.__nmi_count); + if (this_nmi != percpu_read(pmu_nmi.marked)) + /* let the kernel handle the unknown nmi */ + return NOTIFY_DONE; + /* + * This one is a PMU back-to-back nmi. Two events + * trigger 'simultaneously' raising two back-to-back + * NMIs. If the first NMI handles both, the latter + * will be empty and daze the CPU. So, we drop it to + * avoid false-positive 'unknown nmi' messages. + */ + return NOTIFY_STOP; default: return NOTIFY_DONE; } - regs = args->regs; + handled = x86_pmu.handle_irq(args->regs); + if (!handled) + return NOTIFY_DONE; -#ifdef CONFIG_X86_LOCAL_APIC - apic_write(APIC_LVTPC, APIC_DM_NMI); -#endif - /* - * Can't rely on the handled return value to say it was our NMI, two - * events could trigger 'simultaneously' raising two back-to-back NMIs. - * - * If the first NMI handles both, the latter will be empty and daze - * the CPU. - */ - x86_pmu.handle_irq(regs); + this_nmi = percpu_read(irq_stat.__nmi_count); + if ((handled > 1) || + /* the next nmi could be a back-to-back nmi */ + ((percpu_read(pmu_nmi.marked) == this_nmi) && + (percpu_read(pmu_nmi.handled) > 1))) { + /* + * We could have two subsequent back-to-back nmis: The + * first handles more than one counter, the 2nd + * handles only one counter and the 3rd handles no + * counter. + * + * This is the 2nd nmi because the previous was + * handling more than one counter. We will mark the + * next (3rd) and then drop it if unhandled. + */ + percpu_write(pmu_nmi.marked, this_nmi + 1); + percpu_write(pmu_nmi.handled, handled); + } return NOTIFY_STOP; } @@ -1849,199 +1316,211 @@ perf_event_nmi_handler(struct notifier_b static __read_mostly struct notifier_block perf_event_nmi_notifier = { .notifier_call = perf_event_nmi_handler, .next = NULL, - .priority = 1 + .priority = 1, }; -static struct x86_pmu p6_pmu = { - .name = "p6", - .handle_irq = p6_pmu_handle_irq, - .disable_all = p6_pmu_disable_all, - .enable_all = p6_pmu_enable_all, - .enable = p6_pmu_enable_event, - .disable = p6_pmu_disable_event, - .eventsel = MSR_P6_EVNTSEL0, - .perfctr = MSR_P6_PERFCTR0, - .event_map = p6_pmu_event_map, - .raw_event = p6_pmu_raw_event, - .max_events = ARRAY_SIZE(p6_perfmon_event_map), - .apic = 1, - .max_period = (1ULL << 31) - 1, - .version = 0, - .num_events = 2, - /* - * Events have 40 bits implemented. However they are designed such - * that bits [32-39] are sign extensions of bit 31. As such the - * effective width of a event for P6-like PMU is 32 bits only. - * - * See IA-32 Intel Architecture Software developer manual Vol 3B - */ - .event_bits = 32, - .event_mask = (1ULL << 32) - 1, -}; +struct event_constraint unconstrained; +struct event_constraint emptyconstraint; -static struct x86_pmu intel_pmu = { - .name = "Intel", - .handle_irq = intel_pmu_handle_irq, - .disable_all = intel_pmu_disable_all, - .enable_all = intel_pmu_enable_all, - .enable = intel_pmu_enable_event, - .disable = intel_pmu_disable_event, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .raw_event = intel_pmu_raw_event, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic event period: - */ - .max_period = (1ULL << 31) - 1, - .enable_bts = intel_pmu_enable_bts, - .disable_bts = intel_pmu_disable_bts, -}; +static int __cpuinit +x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + int ret = NOTIFY_OK; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + cpuc->kfree_on_online = NULL; + if (x86_pmu.cpu_prepare) + ret = x86_pmu.cpu_prepare(cpu); + break; -static struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = amd_pmu_handle_irq, - .disable_all = amd_pmu_disable_all, - .enable_all = amd_pmu_enable_all, - .enable = amd_pmu_enable_event, - .disable = amd_pmu_disable_event, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .event_map = amd_pmu_event_map, - .raw_event = amd_pmu_raw_event, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_events = 4, - .event_bits = 48, - .event_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, -}; + case CPU_STARTING: + if (x86_pmu.attr_rdpmc) + set_in_cr4(X86_CR4_PCE); + if (x86_pmu.cpu_starting) + x86_pmu.cpu_starting(cpu); + break; -static int p6_pmu_init(void) -{ - switch (boot_cpu_data.x86_model) { - case 1: - case 3: /* Pentium Pro */ - case 5: - case 6: /* Pentium II */ - case 7: - case 8: - case 11: /* Pentium III */ + case CPU_ONLINE: + kfree(cpuc->kfree_on_online); + break; + + case CPU_DYING: + if (x86_pmu.cpu_dying) + x86_pmu.cpu_dying(cpu); break; - case 9: - case 13: - /* Pentium M */ + + case CPU_UP_CANCELED: + case CPU_DEAD: + if (x86_pmu.cpu_dead) + x86_pmu.cpu_dead(cpu); break; + default: - pr_cont("unsupported p6 CPU model %d ", - boot_cpu_data.x86_model); - return -ENODEV; + break; } - x86_pmu = p6_pmu; + return ret; +} - if (!cpu_has_apic) { - pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); - pr_info("no hardware sampling interrupt available.\n"); - x86_pmu.apic = 0; - } +static void __init pmu_check_apic(void) +{ + if (cpu_has_apic) + return; - return 0; + x86_pmu.apic = 0; + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); + pr_info("no hardware sampling interrupt available.\n"); } -static int intel_pmu_init(void) +static struct attribute_group x86_pmu_format_group = { + .name = "format", + .attrs = NULL, +}; + +/* + * Remove all undefined events (x86_pmu.event_map(id) == 0) + * out of events_attr attributes. + */ +static void __init filter_events(struct attribute **attrs) { - union cpuid10_edx edx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int ebx; - int version; - - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - /* check for P6 processor family */ - if (boot_cpu_data.x86 == 6) { - return p6_pmu_init(); - } else { - return -ENODEV; - } + struct device_attribute *d; + struct perf_pmu_events_attr *pmu_attr; + int i, j; + + for (i = 0; attrs[i]; i++) { + d = (struct device_attribute *)attrs[i]; + pmu_attr = container_of(d, struct perf_pmu_events_attr, attr); + /* str trumps id */ + if (pmu_attr->event_str) + continue; + if (x86_pmu.event_map(i)) + continue; + + for (j = i; attrs[j]; j++) + attrs[j] = attrs[j + 1]; + + /* Check the shifted attr. */ + i--; } +} - /* - * Check whether the Architectural PerfMon supports - * Branch Misses Retired hw_event or not. - */ - cpuid(10, &eax.full, &ebx, &unused, &edx.full); - if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return -ENODEV; +/* Merge two pointer arrays */ +static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b) +{ + struct attribute **new; + int j, i; + + for (j = 0; a[j]; j++) + ; + for (i = 0; b[i]; i++) + j++; + j++; + + new = kmalloc(sizeof(struct attribute *) * j, GFP_KERNEL); + if (!new) + return NULL; + + j = 0; + for (i = 0; a[i]; i++) + new[j++] = a[i]; + for (i = 0; b[i]; i++) + new[j++] = b[i]; + new[j] = NULL; + + return new; +} + +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page) +{ + struct perf_pmu_events_attr *pmu_attr = \ + container_of(attr, struct perf_pmu_events_attr, attr); + u64 config = x86_pmu.event_map(pmu_attr->id); + + /* string trumps id */ + if (pmu_attr->event_str) + return sprintf(page, "%s", pmu_attr->event_str); + + return x86_pmu.events_sysfs_show(page, config); +} + +EVENT_ATTR(cpu-cycles, CPU_CYCLES ); +EVENT_ATTR(instructions, INSTRUCTIONS ); +EVENT_ATTR(cache-references, CACHE_REFERENCES ); +EVENT_ATTR(cache-misses, CACHE_MISSES ); +EVENT_ATTR(branch-instructions, BRANCH_INSTRUCTIONS ); +EVENT_ATTR(branch-misses, BRANCH_MISSES ); +EVENT_ATTR(bus-cycles, BUS_CYCLES ); +EVENT_ATTR(stalled-cycles-frontend, STALLED_CYCLES_FRONTEND ); +EVENT_ATTR(stalled-cycles-backend, STALLED_CYCLES_BACKEND ); +EVENT_ATTR(ref-cycles, REF_CPU_CYCLES ); + +static struct attribute *empty_attrs; + +static struct attribute *events_attr[] = { + EVENT_PTR(CPU_CYCLES), + EVENT_PTR(INSTRUCTIONS), + EVENT_PTR(CACHE_REFERENCES), + EVENT_PTR(CACHE_MISSES), + EVENT_PTR(BRANCH_INSTRUCTIONS), + EVENT_PTR(BRANCH_MISSES), + EVENT_PTR(BUS_CYCLES), + EVENT_PTR(STALLED_CYCLES_FRONTEND), + EVENT_PTR(STALLED_CYCLES_BACKEND), + EVENT_PTR(REF_CPU_CYCLES), + NULL, +}; - version = eax.split.version_id; - if (version < 2) - return -ENODEV; +static struct attribute_group x86_pmu_events_group = { + .name = "events", + .attrs = events_attr, +}; - x86_pmu = intel_pmu; - x86_pmu.version = version; - x86_pmu.num_events = eax.split.num_events; - x86_pmu.event_bits = eax.split.bit_width; - x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) +{ + u64 umask = (config & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; + u64 cmask = (config & ARCH_PERFMON_EVENTSEL_CMASK) >> 24; + bool edge = (config & ARCH_PERFMON_EVENTSEL_EDGE); + bool pc = (config & ARCH_PERFMON_EVENTSEL_PIN_CONTROL); + bool any = (config & ARCH_PERFMON_EVENTSEL_ANY); + bool inv = (config & ARCH_PERFMON_EVENTSEL_INV); + ssize_t ret; /* - * Quirk: v2 perfmon does not report fixed-purpose events, so - * assume at least 3 events: - */ - x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); + * We have whole page size to spend and just little data + * to write, so we can safely use sprintf. + */ + ret = sprintf(page, "event=0x%02llx", event); - /* - * Install the hw-cache-events table: - */ - switch (boot_cpu_data.x86_model) { - case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ - case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ - case 29: /* six-core 45 nm xeon "Dunnington" */ - memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); + if (umask) + ret += sprintf(page + ret, ",umask=0x%02llx", umask); - pr_cont("Core2 events, "); - break; - default: - case 26: - memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); + if (edge) + ret += sprintf(page + ret, ",edge"); - pr_cont("Nehalem/Corei7 events, "); - break; - case 28: - memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); + if (pc) + ret += sprintf(page + ret, ",pc"); - pr_cont("Atom events, "); - break; - } - return 0; -} + if (any) + ret += sprintf(page + ret, ",any"); -static int amd_pmu_init(void) -{ - /* Performance-monitoring supported from K7 and later: */ - if (boot_cpu_data.x86 < 6) - return -ENODEV; + if (inv) + ret += sprintf(page + ret, ",inv"); - x86_pmu = amd_pmu; + if (cmask) + ret += sprintf(page + ret, ",cmask=0x%02llx", cmask); - /* Events are common for all AMDs */ - memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); + ret += sprintf(page + ret, "\n"); - return 0; + return ret; } -void __init init_hw_perf_events(void) +static int __init init_hw_perf_events(void) { + struct x86_pmu_quirk *quirk; int err; pr_info("Performance Events: "); @@ -2054,103 +1533,383 @@ void __init init_hw_perf_events(void) err = amd_pmu_init(); break; default: - return; + return 0; } if (err != 0) { pr_cont("no PMU driver, software events only.\n"); - return; + return 0; } + pmu_check_apic(); + + /* sanity check that the hardware exists or is emulated */ + if (!check_hw_exists()) + return 0; + pr_cont("%s PMU driver.\n", x86_pmu.name); - if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - x86_pmu.num_events, X86_PMC_MAX_GENERIC); - x86_pmu.num_events = X86_PMC_MAX_GENERIC; - } - perf_event_mask = (1 << x86_pmu.num_events) - 1; - perf_max_events = x86_pmu.num_events; - - if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); - x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; - } - - perf_event_mask |= - ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; - x86_pmu.intel_ctrl = perf_event_mask; + for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) + quirk->func(); + + if (!x86_pmu.intel_ctrl) + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; perf_events_lapic_init(); register_die_notifier(&perf_event_nmi_notifier); + unconstrained = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, + 0, x86_pmu.num_counters, 0, 0); + + x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ + x86_pmu_format_group.attrs = x86_pmu.format_attrs; + + if (x86_pmu.event_attrs) + x86_pmu_events_group.attrs = x86_pmu.event_attrs; + + if (!x86_pmu.events_sysfs_show) + x86_pmu_events_group.attrs = &empty_attrs; + else + filter_events(x86_pmu_events_group.attrs); + + if (x86_pmu.cpu_events) { + struct attribute **tmp; + + tmp = merge_attr(x86_pmu_events_group.attrs, x86_pmu.cpu_events); + if (!WARN_ON(!tmp)) + x86_pmu_events_group.attrs = tmp; + } + pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.event_bits); - pr_info("... generic registers: %d\n", x86_pmu.num_events); - pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); + pr_info("... bit width: %d\n", x86_pmu.cntval_bits); + pr_info("... generic registers: %d\n", x86_pmu.num_counters); + pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); - pr_info("... event mask: %016Lx\n", perf_event_mask); + pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); + pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); + + perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW); + perf_cpu_notifier(x86_pmu_notifier); + + return 0; } +early_initcall(init_hw_perf_events); static inline void x86_pmu_read(struct perf_event *event) { - x86_perf_event_update(event, &event->hw, event->hw.idx); + x86_perf_event_update(event); } -static const struct pmu pmu = { - .enable = x86_pmu_enable, - .disable = x86_pmu_disable, - .read = x86_pmu_read, - .unthrottle = x86_pmu_unthrottle, -}; +/* + * Start group events scheduling transaction + * Set the flag to make pmu::enable() not perform the + * schedulability test, it will be performed at commit time + */ +static void x86_pmu_start_txn(struct pmu *pmu) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); -const struct pmu *hw_perf_event_init(struct perf_event *event) + perf_pmu_disable(pmu); + cpuc->group_flag |= PERF_EVENT_TXN; + cpuc->n_txn = 0; +} + +/* + * Stop group events scheduling transaction + * Clear the flag and pmu::enable() will perform the + * schedulability test. + */ +static void x86_pmu_cancel_txn(struct pmu *pmu) { + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + cpuc->group_flag &= ~PERF_EVENT_TXN; + /* + * Truncate the collected events. + */ + cpuc->n_added -= cpuc->n_txn; + cpuc->n_events -= cpuc->n_txn; + perf_pmu_enable(pmu); +} + +/* + * Commit group events scheduling transaction + * Perform the group schedulability test as a whole + * Return 0 if success + */ +static int x86_pmu_commit_txn(struct pmu *pmu) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int assign[X86_PMC_IDX_MAX]; + int n, ret; + + n = cpuc->n_events; + + if (!x86_pmu_initialized()) + return -EAGAIN; + + ret = x86_pmu.schedule_events(cpuc, n, assign); + if (ret) + return ret; + + /* + * copy new assignment, now we know it is possible + * will be used by hw_perf_enable() + */ + memcpy(cpuc->assign, assign, n*sizeof(int)); + + cpuc->group_flag &= ~PERF_EVENT_TXN; + perf_pmu_enable(pmu); + return 0; +} +/* + * a fake_cpuc is used to validate event groups. Due to + * the extra reg logic, we need to also allocate a fake + * per_core and per_cpu structure. Otherwise, group events + * using extra reg may conflict without the kernel being + * able to catch this when the last event gets added to + * the group. + */ +static void free_fake_cpuc(struct cpu_hw_events *cpuc) +{ + kfree(cpuc->shared_regs); + kfree(cpuc); +} + +static struct cpu_hw_events *allocate_fake_cpuc(void) +{ + struct cpu_hw_events *cpuc; + int cpu = raw_smp_processor_id(); + + cpuc = kzalloc(sizeof(*cpuc), GFP_KERNEL); + if (!cpuc) + return ERR_PTR(-ENOMEM); + + /* only needed, if we have extra_regs */ + if (x86_pmu.extra_regs) { + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) + goto error; + } + cpuc->is_fake = 1; + return cpuc; +error: + free_fake_cpuc(cpuc); + return ERR_PTR(-ENOMEM); +} + +/* + * validate that we can schedule this event + */ +static int validate_event(struct perf_event *event) +{ + struct cpu_hw_events *fake_cpuc; + struct event_constraint *c; + int ret = 0; + + fake_cpuc = allocate_fake_cpuc(); + if (IS_ERR(fake_cpuc)) + return PTR_ERR(fake_cpuc); + + c = x86_pmu.get_event_constraints(fake_cpuc, event); + + if (!c || !c->weight) + ret = -ENOSPC; + + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(fake_cpuc, event); + + free_fake_cpuc(fake_cpuc); + + return ret; +} + +/* + * validate a single event group + * + * validation include: + * - check events are compatible which each other + * - events do not compete for the same counter + * - number of events <= number of counters + * + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. + */ +static int validate_group(struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct cpu_hw_events *fake_cpuc; + int ret = -ENOSPC, n; + + fake_cpuc = allocate_fake_cpuc(); + if (IS_ERR(fake_cpuc)) + return PTR_ERR(fake_cpuc); + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + n = collect_events(fake_cpuc, leader, true); + if (n < 0) + goto out; + + fake_cpuc->n_events = n; + n = collect_events(fake_cpuc, event, false); + if (n < 0) + goto out; + + fake_cpuc->n_events = n; + + ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); + +out: + free_fake_cpuc(fake_cpuc); + return ret; +} + +static int x86_pmu_event_init(struct perf_event *event) +{ + struct pmu *tmp; int err; - err = __hw_perf_event_init(event); + switch (event->attr.type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + break; + + default: + return -ENOENT; + } + + err = __x86_pmu_event_init(event); + if (!err) { + /* + * we temporarily connect event to its pmu + * such that validate_group() can classify + * it as an x86 event using is_x86_event() + */ + tmp = event->pmu; + event->pmu = &pmu; + + if (event->group_leader != event) + err = validate_group(event); + else + err = validate_event(event); + + event->pmu = tmp; + } if (err) { if (event->destroy) event->destroy(event); - return ERR_PTR(err); } - return &pmu; + return err; } -/* - * callchain support - */ +static ssize_t get_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", x86_pmu.attr_rdpmc); +} -static inline -void callchain_store(struct perf_callchain_entry *entry, u64 ip) +static void change_rdpmc(void *info) { - if (entry->nr < PERF_MAX_STACK_DEPTH) - entry->ip[entry->nr++] = ip; + bool enable = !!(unsigned long)info; + + if (enable) + set_in_cr4(X86_CR4_PCE); + else + clear_in_cr4(X86_CR4_PCE); } -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); -static DEFINE_PER_CPU(int, in_nmi_frame); +static ssize_t set_attr_rdpmc(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val = simple_strtoul(buf, NULL, 0); + if (!!val != !!x86_pmu.attr_rdpmc) { + x86_pmu.attr_rdpmc = !!val; + smp_call_function(change_rdpmc, (void *)val, 1); + } -static void -backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) + return count; +} + +static DEVICE_ATTR(rdpmc, S_IRUSR | S_IWUSR, get_attr_rdpmc, set_attr_rdpmc); + +static struct attribute *x86_pmu_attrs[] = { + &dev_attr_rdpmc.attr, + NULL, +}; + +static struct attribute_group x86_pmu_attr_group = { + .attrs = x86_pmu_attrs, +}; + +static const struct attribute_group *x86_pmu_attr_groups[] = { + &x86_pmu_attr_group, + &x86_pmu_format_group, + &x86_pmu_events_group, + NULL, +}; + +static int x86_pmu_event_idx(struct perf_event *event) { - /* Ignore warnings */ + int idx = event->hw.idx; + + if (x86_pmu.num_counters_fixed && idx >= INTEL_PMC_IDX_FIXED) { + idx -= INTEL_PMC_IDX_FIXED; + idx |= 1 << 30; + } + + return idx + 1; } -static void backtrace_warning(void *data, char *msg) +static void x86_pmu_flush_branch_stack(void) { - /* Ignore warnings */ + if (x86_pmu.flush_branch_stack) + x86_pmu.flush_branch_stack(); } -static int backtrace_stack(void *data, char *name) +void perf_check_microcode(void) { - per_cpu(in_nmi_frame, smp_processor_id()) = - x86_is_stack_id(NMI_STACK, name); + if (x86_pmu.check_microcode) + x86_pmu.check_microcode(); +} +EXPORT_SYMBOL_GPL(perf_check_microcode); + +static struct pmu pmu = { + .pmu_enable = x86_pmu_enable, + .pmu_disable = x86_pmu_disable, + + .attr_groups = x86_pmu_attr_groups, + + .event_init = x86_pmu_event_init, + + .add = x86_pmu_add, + .del = x86_pmu_del, + .start = x86_pmu_start, + .stop = x86_pmu_stop, + .read = x86_pmu_read, + + .start_txn = x86_pmu_start_txn, + .cancel_txn = x86_pmu_cancel_txn, + .commit_txn = x86_pmu_commit_txn, + + .event_idx = x86_pmu_event_idx, + .flush_branch_stack = x86_pmu_flush_branch_stack, +}; + +/* + * callchain support + */ +static int backtrace_stack(void *data, char *name) +{ return 0; } @@ -2158,144 +1917,144 @@ static void backtrace_address(void *data { struct perf_callchain_entry *entry = data; - if (per_cpu(in_nmi_frame, smp_processor_id())) - return; - - if (reliable) - callchain_store(entry, addr); + perf_callchain_store(entry, addr); } static const struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, .stack = backtrace_stack, .address = backtrace_address, + .walk_stack = print_context_stack_bp, }; -#include "../dumpstack.h" - -static void -perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +void +perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs) { - callchain_store(entry, PERF_CONTEXT_KERNEL); - callchain_store(entry, regs->ip); + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return; + } - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); + perf_callchain_store(entry, regs->ip); + + dump_trace(NULL, regs, NULL, &backtrace_ops, entry); } -/* - * best effort, GUP based copy_from_user() that assumes IRQ or NMI context - */ -static unsigned long -copy_from_user_nmi(void *to, const void __user *from, unsigned long n) +#ifdef CONFIG_COMPAT +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) { - unsigned long offset, addr = (unsigned long)from; - int type = in_nmi() ? KM_NMI : KM_IRQ0; - unsigned long size, len = 0; - struct page *page; - void *map; - int ret; - - do { - ret = __get_user_pages_fast(addr, 1, 0, &page); - if (!ret) - break; + /* 32-bit process in 64-bit kernel. */ + struct stack_frame_ia32 frame; + const void __user *fp; - offset = addr & (PAGE_SIZE - 1); - size = min(PAGE_SIZE - offset, n - len); + if (!test_thread_flag(TIF_IA32)) + return 0; - map = kmap_atomic(page, type); - memcpy(to, map+offset, size); - kunmap_atomic(map, type); - put_page(page); + fp = compat_ptr(regs->bp); + while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; + frame.next_frame = 0; + frame.return_address = 0; - len += size; - to += size; - addr += size; + bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (bytes != sizeof(frame)) + break; - } while (len < n); + if (fp < compat_ptr(regs->sp)) + break; - return len; + perf_callchain_store(entry, frame.return_address); + fp = compat_ptr(frame.next_frame); + } + return 1; } - -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +#else +static inline int +perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) { - unsigned long bytes; - - bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); - - return bytes == sizeof(*frame); + return 0; } +#endif -static void -perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +void +perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) { struct stack_frame frame; const void __user *fp; - if (!user_mode(regs)) - regs = task_pt_regs(current); + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return; + } fp = (void __user *)regs->bp; - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, regs->ip); + perf_callchain_store(entry, regs->ip); + + if (!current->mm) + return; + + if (perf_callchain_user32(regs, entry)) + return; while (entry->nr < PERF_MAX_STACK_DEPTH) { + unsigned long bytes; frame.next_frame = NULL; frame.return_address = 0; - if (!copy_stack_frame(fp, &frame)) + bytes = copy_from_user_nmi(&frame, fp, sizeof(frame)); + if (bytes != sizeof(frame)) break; if ((unsigned long)fp < regs->sp) break; - callchain_store(entry, frame.return_address); + perf_callchain_store(entry, frame.return_address); fp = frame.next_frame; } } -static void -perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) +unsigned long perf_instruction_pointer(struct pt_regs *regs) { - int is_user; + unsigned long ip; - if (!regs) - return; - - is_user = user_mode(regs); - - if (!current || current->pid == 0) - return; - - if (is_user && current->state != TASK_RUNNING) - return; - - if (!is_user) - perf_callchain_kernel(regs, entry); + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) + ip = perf_guest_cbs->get_guest_ip(); + else + ip = instruction_pointer(regs); - if (current->mm) - perf_callchain_user(regs, entry); + return ip; } -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +unsigned long perf_misc_flags(struct pt_regs *regs) { - struct perf_callchain_entry *entry; - - if (in_nmi()) - entry = &__get_cpu_var(pmc_nmi_entry); - else - entry = &__get_cpu_var(pmc_irq_entry); + int misc = 0; - entry->nr = 0; + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (perf_guest_cbs->is_user_mode()) + misc |= PERF_RECORD_MISC_GUEST_USER; + else + misc |= PERF_RECORD_MISC_GUEST_KERNEL; + } else { + if (user_mode(regs)) + misc |= PERF_RECORD_MISC_USER; + else + misc |= PERF_RECORD_MISC_KERNEL; + } - perf_do_callchain(regs, entry); + if (regs->flags & PERF_EFLAGS_EXACT) + misc |= PERF_RECORD_MISC_EXACT_IP; - return entry; + return misc; } -void hw_perf_event_setup_online(int cpu) +void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) { - init_debug_store_on_cpu(cpu); + cap->version = x86_pmu.version; + cap->num_counters_gp = x86_pmu.num_counters; + cap->num_counters_fixed = x86_pmu.num_counters_fixed; + cap->bit_width_gp = cap->bit_width_fixed = x86_pmu.cntval_bits; + cap->events_mask = (unsigned int)x86_pmu.events_maskl; + cap->events_mask_len = x86_pmu.events_mask_len; } +EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); diff -uprN linux-2.6.32/arch/x86/kernel/cpu/perf_event.h linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event.h --- linux-2.6.32/arch/x86/kernel/cpu/perf_event.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event.h 2015-03-10 13:24:14.000000000 -0700 @@ -0,0 +1,692 @@ +/* + * Performance events x86 architecture code + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2009 Intel Corporation, + * Copyright (C) 2009 Google, Inc., Stephane Eranian + * + * For licencing details see kernel-base/COPYING + */ + +#include + +/* + * | NHM/WSM | SNB | + * register ------------------------------- + * | HT | no HT | HT | no HT | + *----------------------------------------- + * offcore | core | core | cpu | core | + * lbr_sel | core | core | cpu | core | + * ld_lat | cpu | core | cpu | core | + *----------------------------------------- + * + * Given that there is a small number of shared regs, + * we can pre-allocate their slot in the per-cpu + * per-core reg tables. + */ +enum extra_reg_type { + EXTRA_REG_NONE = -1, /* not used */ + + EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ + EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ + EXTRA_REG_LBR = 2, /* lbr_select */ + EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */ + + EXTRA_REG_MAX /* number of entries needed */ +}; + +struct event_constraint { + union { + unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 idxmsk64; + }; + u64 code; + u64 cmask; + int weight; + int overlap; + int flags; +}; +/* + * struct event_constraint flags + */ +#define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */ +#define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */ +#define PERF_X86_EVENT_PEBS_ST_HSW 0x4 /* haswell style st data sampling */ + +struct amd_nb { + int nb_id; /* NorthBridge id */ + int refcnt; /* reference count */ + struct perf_event *owners[X86_PMC_IDX_MAX]; + struct event_constraint event_constraints[X86_PMC_IDX_MAX]; +}; + +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 8 + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; +}; + +struct intel_percore; + +#define MAX_LBR_ENTRIES 16 + +struct cpu_hw_events { + /* + * Generic x86 PMC bits + */ + struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + int enabled; + + int n_events; + int n_added; + int n_txn; + int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ + u64 tags[X86_PMC_IDX_MAX]; + struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ + + unsigned int group_flag; + int is_fake; + + /* + * Intel DebugStore bits + */ + struct debug_store *ds; + u64 pebs_enabled; + + /* + * Intel LBR bits + */ + int lbr_users; + void *lbr_context; + struct perf_branch_stack lbr_stack; + struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; + struct er_account *lbr_sel; + u64 br_sel; + + /* + * Intel host/guest exclude bits + */ + u64 intel_ctrl_guest_mask; + u64 intel_ctrl_host_mask; + struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX]; + + /* + * Intel checkpoint mask + */ + u64 intel_cp_status; + + /* + * manage shared (per-core, per-cpu) registers + * used on Intel NHM/WSM/SNB + */ + struct intel_shared_regs *shared_regs; + + /* + * AMD specific bits + */ + struct amd_nb *amd_nb; + + void *kfree_on_online; + + /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ + u64 perf_ctr_virt_mask; +}; + +#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ + { .idxmsk64 = (n) }, \ + .code = (c), \ + .cmask = (m), \ + .weight = (w), \ + .overlap = (o), \ + .flags = f, \ +} + +#define EVENT_CONSTRAINT(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) + +/* + * The overlap flag marks event constraints with overlapping counter + * masks. This is the case if the counter mask of such an event is not + * a subset of any other counter mask of a constraint with an equal or + * higher weight, e.g.: + * + * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); + * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0); + * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0); + * + * The event scheduler may not select the correct counter in the first + * cycle because it needs to know which subsequent events will be + * scheduled. It may fail to schedule the events then. So we set the + * overlap flag for such constraints to give the scheduler a hint which + * events to select for counter rescheduling. + * + * Care must be taken as the rescheduling algorithm is O(n!) which + * will increase scheduling cycles for an over-commited system + * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros + * and its counter masks must be kept at a minimum. + */ +#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0) + +/* + * Constraint on the Event code. + */ +#define INTEL_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) + +/* + * Constraint on the Event code + UMask + fixed-mask + * + * filter mask to validate fixed counter events. + * the following filters disqualify for fixed counters: + * - inv + * - edge + * - cnt-mask + * - in_tx + * - in_tx_checkpointed + * The other filters are supported by fixed counters. + * The any-thread option is supported starting with v3. + */ +#define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED) +#define FIXED_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS) + +/* + * Constraint on the Event code + UMask + */ +#define INTEL_UEVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) + +#define INTEL_PLD_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) + +#define INTEL_PST_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) + +/* DataLA version of store sampling without extra enable bit. */ +#define INTEL_PST_HSW_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) + +#define EVENT_CONSTRAINT_END \ + EVENT_CONSTRAINT(0, 0, 0) + +#define for_each_event_constraint(e, c) \ + for ((e) = (c); (e)->weight; (e)++) + +/* + * Per register state. + */ +struct er_account { + spinlock_t lock; /* per-core: protect structure */ + u64 config; /* extra MSR config */ + u64 reg; /* extra MSR number */ + atomic_t ref; /* reference count */ +}; + +/* + * Per core/cpu state + * + * Used to coordinate shared registers between HT threads or + * among events on a single PMU. + */ +struct intel_shared_regs { + struct er_account regs[EXTRA_REG_MAX]; + int refcnt; /* per-core: #HT threads */ + unsigned core_id; /* per-core: core id */ +}; + +/* + * Extra registers for specific events. + * + * Some events need large masks and require external MSRs. + * Those extra MSRs end up being shared for all events on + * a PMU and sometimes between PMU of sibling HT threads. + * In either case, the kernel needs to handle conflicting + * accesses to those extra, shared, regs. The data structure + * to manage those registers is stored in cpu_hw_event. + */ +struct extra_reg { + unsigned int event; + unsigned int msr; + u64 config_mask; + u64 valid_mask; + int idx; /* per_xxx->regs[] reg index */ +}; + +#define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ + .event = (e), \ + .msr = (ms), \ + .config_mask = (m), \ + .valid_mask = (vm), \ + .idx = EXTRA_REG_##i, \ + } + +#define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx) + +#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \ + ARCH_PERFMON_EVENTSEL_UMASK, vm, idx) + +#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \ + INTEL_UEVENT_EXTRA_REG(c, \ + MSR_PEBS_LD_LAT_THRESHOLD, \ + 0xffff, \ + LDLAT) + +#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0) + +union perf_capabilities { + struct { + u64 lbr_format : 6; + u64 pebs_trap : 1; + u64 pebs_arch_reg : 1; + u64 pebs_format : 4; + u64 smm_freeze : 1; + /* + * PMU supports separate counter range for writing + * values > 32bit. + */ + u64 full_width_write:1; + }; + u64 capabilities; +}; + +struct x86_pmu_quirk { + struct x86_pmu_quirk *next; + void (*func)(void); +}; + +/* + * struct x86_pmu - generic x86 pmu + */ +struct x86_pmu { + /* + * Generic x86 PMC bits + */ + const char *name; + int version; + int (*handle_irq)(struct pt_regs *); + void (*disable_all)(void); + void (*enable_all)(int added); + void (*enable)(struct perf_event *); + void (*disable)(struct perf_event *); + int (*hw_config)(struct perf_event *event); + int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); + unsigned eventsel; + unsigned perfctr; + int (*addr_offset)(int index, bool eventsel); + int (*rdpmc_index)(int index); + u64 (*event_map)(int); + int max_events; + int num_counters; + int num_counters_fixed; + int cntval_bits; + u64 cntval_mask; + union { + unsigned long events_maskl; + unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)]; + }; + int events_mask_len; + int apic; + u64 max_period; + struct event_constraint * + (*get_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + + void (*put_event_constraints)(struct cpu_hw_events *cpuc, + struct perf_event *event); + struct event_constraint *event_constraints; + struct x86_pmu_quirk *quirks; + int perfctr_second_write; + bool late_ack; + + /* + * sysfs attrs + */ + int attr_rdpmc; + struct attribute **format_attrs; + struct attribute **event_attrs; + + ssize_t (*events_sysfs_show)(char *page, u64 config); + struct attribute **cpu_events; + + /* + * CPU Hotplug hooks + */ + int (*cpu_prepare)(int cpu); + void (*cpu_starting)(int cpu); + void (*cpu_dying)(int cpu); + void (*cpu_dead)(int cpu); + + void (*check_microcode)(void); + void (*flush_branch_stack)(void); + + /* + * Intel Arch Perfmon v2+ + */ + u64 intel_ctrl; + union perf_capabilities intel_cap; + + /* + * Intel DebugStore bits + */ + int bts, pebs; + int bts_active, pebs_active, pebs_broken; + int pebs_record_size; + void (*drain_pebs)(struct pt_regs *regs); + struct event_constraint *pebs_constraints; + void (*pebs_aliases)(struct perf_event *event); + int max_pebs_events; + + /* + * Intel LBR + */ + unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ + int lbr_nr; /* hardware stack size */ + u64 lbr_sel_mask; /* LBR_SELECT valid bits */ + const int *lbr_sel_map; /* lbr_select mappings */ + bool lbr_double_abort; /* duplicated lbr aborts */ + + /* + * Extra registers for events + */ + struct extra_reg *extra_regs; + unsigned int er_flags; + /* + * Intel host/guest support (KVM) + */ + struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); +}; + +#define ERF_NO_HT_SHARING 1 +#define ERF_HAS_RSP_1 2 + +union x86_pmu_config { + struct { + u64 event:8, + umask:8, + usr:1, + os:1, + edge:1, + pc:1, + interrupt:1, + __reserved1:1, + en:1, + inv:1, + cmask:8, + event2:4, + __reserved2:4, + go:1, + ho:1; + } bits; + u64 value; +}; + +#define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value + +#define x86_add_quirk(func_) \ +do { \ + static struct x86_pmu_quirk __quirk __initdata = { \ + .func = func_, \ + }; \ + __quirk.next = x86_pmu.quirks; \ + x86_pmu.quirks = &__quirk; \ +} while (0) + +#define EVENT_VAR(_id) event_attr_##_id +#define EVENT_PTR(_id) &event_attr_##_id.attr.attr + +#define EVENT_ATTR(_name, _id) \ +static struct perf_pmu_events_attr EVENT_VAR(_id) = { \ + .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ + .id = PERF_COUNT_HW_##_id, \ + .event_str = NULL, \ +}; + +#define EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ + .id = 0, \ + .event_str = str, \ +}; + +extern struct x86_pmu x86_pmu __read_mostly; + +DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); + +int x86_perf_event_set_period(struct perf_event *event); + +/* + * Generalized hw caching related hw_event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'hw_event makes no sense on + * this CPU', any other value means the raw hw_event + * ID. + */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +extern u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +extern u64 __read_mostly hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +u64 x86_perf_event_update(struct perf_event *event); + +static inline unsigned int x86_pmu_config_addr(int index) +{ + return x86_pmu.eventsel + (x86_pmu.addr_offset ? + x86_pmu.addr_offset(index, true) : index); +} + +static inline unsigned int x86_pmu_event_addr(int index) +{ + return x86_pmu.perfctr + (x86_pmu.addr_offset ? + x86_pmu.addr_offset(index, false) : index); +} + +static inline int x86_pmu_rdpmc_index(int index) +{ + return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; +} + +int x86_setup_perfctr(struct perf_event *event); + +int x86_pmu_hw_config(struct perf_event *event); + +void x86_pmu_disable_all(void); + +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, + u64 enable_mask) +{ + u64 disable_mask = __get_cpu_var(cpu_hw_events).perf_ctr_virt_mask; + + if (hwc->extra_reg.reg) + wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); + wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask); +} + +void x86_pmu_enable_all(int added); + +int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int *assign); +int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); + +void x86_pmu_stop(struct perf_event *event, int flags); + +static inline void x86_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config); +} + +void x86_pmu_enable_event(struct perf_event *event); + +int x86_pmu_handle_irq(struct pt_regs *regs); + +extern struct event_constraint emptyconstraint; + +extern struct event_constraint unconstrained; + +extern unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n); + +static inline bool kernel_ip(unsigned long ip) +{ +#ifdef CONFIG_X86_32 + return ip > PAGE_OFFSET; +#else + return (long)ip < 0; +#endif +} + +ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); +ssize_t intel_event_sysfs_show(char *page, u64 config); + +#ifdef CONFIG_CPU_SUP_AMD + +int amd_pmu_init(void); + +#else /* CONFIG_CPU_SUP_AMD */ + +static inline int amd_pmu_init(void) +{ + return 0; +} + +#endif /* CONFIG_CPU_SUP_AMD */ + +#ifdef CONFIG_CPU_SUP_INTEL + +int intel_pmu_save_and_restart(struct perf_event *event); + +struct event_constraint * +x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event); + +struct intel_shared_regs *allocate_shared_regs(int cpu); + +int intel_pmu_init(void); + +void init_debug_store_on_cpu(int cpu); + +void fini_debug_store_on_cpu(int cpu); + +void release_ds_buffers(void); + +void reserve_ds_buffers(void); + +extern struct event_constraint bts_constraint; + +void intel_pmu_enable_bts(u64 config); + +void intel_pmu_disable_bts(void); + +int intel_pmu_drain_bts_buffer(void); + +extern struct event_constraint intel_core2_pebs_event_constraints[]; + +extern struct event_constraint intel_atom_pebs_event_constraints[]; + +extern struct event_constraint intel_slm_pebs_event_constraints[]; + +extern struct event_constraint intel_nehalem_pebs_event_constraints[]; + +extern struct event_constraint intel_westmere_pebs_event_constraints[]; + +extern struct event_constraint intel_snb_pebs_event_constraints[]; + +extern struct event_constraint intel_ivb_pebs_event_constraints[]; + +extern struct event_constraint intel_hsw_pebs_event_constraints[]; + +struct event_constraint *intel_pebs_constraints(struct perf_event *event); + +void intel_pmu_pebs_enable(struct perf_event *event); + +void intel_pmu_pebs_disable(struct perf_event *event); + +void intel_pmu_pebs_enable_all(void); + +void intel_pmu_pebs_disable_all(void); + +void intel_ds_init(void); + +void intel_pmu_lbr_reset(void); + +void intel_pmu_lbr_enable(struct perf_event *event); + +void intel_pmu_lbr_disable(struct perf_event *event); + +void intel_pmu_lbr_enable_all(void); + +void intel_pmu_lbr_disable_all(void); + +void intel_pmu_lbr_read(void); + +void intel_pmu_lbr_init_core(void); + +void intel_pmu_lbr_init_nhm(void); + +void intel_pmu_lbr_init_atom(void); + +void intel_pmu_lbr_init_snb(void); + +int intel_pmu_setup_lbr_filter(struct perf_event *event); + +int p4_pmu_init(void); + +int p6_pmu_init(void); + +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page); + +#else /* CONFIG_CPU_SUP_INTEL */ + +static inline void reserve_ds_buffers(void) +{ +} + +static inline void release_ds_buffers(void) +{ +} + +static inline int intel_pmu_init(void) +{ + return 0; +} + +static inline struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + return NULL; +} + +#endif /* CONFIG_CPU_SUP_INTEL */ diff -uprN linux-2.6.32/arch/x86/kernel/cpu/perf_event_intel.c linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_intel.c --- linux-2.6.32/arch/x86/kernel/cpu/perf_event_intel.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_intel.c 2015-03-10 13:24:14.000000000 -0700 @@ -0,0 +1,2541 @@ +#include +#include +#include +#include + +#include +#include +#include + +#include "perf_event.h" + +/* + * Intel PerfMon, used on Core and later. + */ +static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ +}; + +static struct event_constraint intel_core_event_constraints[] __read_mostly = +{ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_core2_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */ + INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */ + INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ + INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ + INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ + INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */ + INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_westmere_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ + INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ + INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_snb_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_westmere_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), + EVENT_EXTRA_END +}; + +static struct event_constraint intel_v1_event_constraints[] __read_mostly = +{ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_gen_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_slm_event_constraints[] __read_mostly = +{ + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x013c, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* pseudo CPU_CLK_UNHALTED.REF */ + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_snb_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3f807f8fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + EVENT_EXTRA_END +}; + +static struct extra_reg intel_snbep_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + EVENT_EXTRA_END +}; + +EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); +EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); +EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); + +struct attribute *nhm_events_attrs[] = { + EVENT_PTR(mem_ld_nhm), + NULL, +}; + +struct attribute *snb_events_attrs[] = { + EVENT_PTR(mem_ld_snb), + EVENT_PTR(mem_st_snb), + NULL, +}; + +static struct event_constraint intel_hsw_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.* */ + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ + INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ + INTEL_EVENT_CONSTRAINT(0x08a3, 0x4), + /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */ + INTEL_EVENT_CONSTRAINT(0x0ca3, 0x4), + /* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */ + INTEL_EVENT_CONSTRAINT(0x04a3, 0xf), + EVENT_CONSTRAINT_END +}; + +static u64 intel_pmu_event_map(int hw_event) +{ + return intel_perfmon_event_map[hw_event]; +} + +#define SNB_DMND_DATA_RD (1ULL << 0) +#define SNB_DMND_RFO (1ULL << 1) +#define SNB_DMND_IFETCH (1ULL << 2) +#define SNB_DMND_WB (1ULL << 3) +#define SNB_PF_DATA_RD (1ULL << 4) +#define SNB_PF_RFO (1ULL << 5) +#define SNB_PF_IFETCH (1ULL << 6) +#define SNB_LLC_DATA_RD (1ULL << 7) +#define SNB_LLC_RFO (1ULL << 8) +#define SNB_LLC_IFETCH (1ULL << 9) +#define SNB_BUS_LOCKS (1ULL << 10) +#define SNB_STRM_ST (1ULL << 11) +#define SNB_OTHER (1ULL << 15) +#define SNB_RESP_ANY (1ULL << 16) +#define SNB_NO_SUPP (1ULL << 17) +#define SNB_LLC_HITM (1ULL << 18) +#define SNB_LLC_HITE (1ULL << 19) +#define SNB_LLC_HITS (1ULL << 20) +#define SNB_LLC_HITF (1ULL << 21) +#define SNB_LOCAL (1ULL << 22) +#define SNB_REMOTE (0xffULL << 23) +#define SNB_SNP_NONE (1ULL << 31) +#define SNB_SNP_NOT_NEEDED (1ULL << 32) +#define SNB_SNP_MISS (1ULL << 33) +#define SNB_NO_FWD (1ULL << 34) +#define SNB_SNP_FWD (1ULL << 35) +#define SNB_HITM (1ULL << 36) +#define SNB_NON_DRAM (1ULL << 37) + +#define SNB_DMND_READ (SNB_DMND_DATA_RD|SNB_LLC_DATA_RD) +#define SNB_DMND_WRITE (SNB_DMND_RFO|SNB_LLC_RFO) +#define SNB_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) + +#define SNB_SNP_ANY (SNB_SNP_NONE|SNB_SNP_NOT_NEEDED| \ + SNB_SNP_MISS|SNB_NO_FWD|SNB_SNP_FWD| \ + SNB_HITM) + +#define SNB_DRAM_ANY (SNB_LOCAL|SNB_REMOTE|SNB_SNP_ANY) +#define SNB_DRAM_REMOTE (SNB_REMOTE|SNB_SNP_ANY) + +#define SNB_L3_ACCESS SNB_RESP_ANY +#define SNB_L3_MISS (SNB_DRAM_ANY|SNB_NON_DRAM) + +static __initconst const u64 snb_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_L3_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_L3_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_L3_ACCESS, + [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_L3_MISS, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_READ|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_READ|SNB_DRAM_REMOTE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_WRITE|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_WRITE|SNB_DRAM_REMOTE, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SNB_DMND_PREFETCH|SNB_DRAM_ANY, + [ C(RESULT_MISS) ] = SNB_DMND_PREFETCH|SNB_DRAM_REMOTE, + }, + }, +}; + +static __initconst const u64 snb_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */ + [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */ + [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + +}; + +static __initconst const u64 westmere_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + /* + * Use RFO, not WRITEBACK, because a write miss would typically occur + * on RFO. + */ + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, +}; + +/* + * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits; + * See IA32 SDM Vol 3B 30.6.1.3 + */ + +#define NHM_DMND_DATA_RD (1 << 0) +#define NHM_DMND_RFO (1 << 1) +#define NHM_DMND_IFETCH (1 << 2) +#define NHM_DMND_WB (1 << 3) +#define NHM_PF_DATA_RD (1 << 4) +#define NHM_PF_DATA_RFO (1 << 5) +#define NHM_PF_IFETCH (1 << 6) +#define NHM_OFFCORE_OTHER (1 << 7) +#define NHM_UNCORE_HIT (1 << 8) +#define NHM_OTHER_CORE_HIT_SNP (1 << 9) +#define NHM_OTHER_CORE_HITM (1 << 10) + /* reserved */ +#define NHM_REMOTE_CACHE_FWD (1 << 12) +#define NHM_REMOTE_DRAM (1 << 13) +#define NHM_LOCAL_DRAM (1 << 14) +#define NHM_NON_DRAM (1 << 15) + +#define NHM_LOCAL (NHM_LOCAL_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_REMOTE (NHM_REMOTE_DRAM) + +#define NHM_DMND_READ (NHM_DMND_DATA_RD) +#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB) +#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO) + +#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM) +#define NHM_L3_MISS (NHM_NON_DRAM|NHM_LOCAL_DRAM|NHM_REMOTE_DRAM|NHM_REMOTE_CACHE_FWD) +#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS) + +static __initconst const u64 nehalem_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_LOCAL|NHM_REMOTE, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_LOCAL|NHM_REMOTE, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_LOCAL|NHM_REMOTE, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE, + }, + }, +}; + +static __initconst const u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */ + [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETURED.STORES */ + [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + /* + * Use RFO, not WRITEBACK, because a write miss would typically occur + * on RFO. + */ + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, +}; + +static __initconst const u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static __initconst const u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static struct extra_reg intel_slm_extra_regs[] __read_mostly = +{ + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1), + EVENT_EXTRA_END +}; + +#define SLM_DMND_READ SNB_DMND_DATA_RD +#define SLM_DMND_WRITE SNB_DMND_RFO +#define SLM_DMND_PREFETCH (SNB_PF_DATA_RD|SNB_PF_RFO) + +#define SLM_SNP_ANY (SNB_SNP_NONE|SNB_SNP_MISS|SNB_NO_FWD|SNB_HITM) +#define SLM_LLC_ACCESS SNB_RESP_ANY +#define SLM_LLC_MISS (SLM_SNP_ANY|SNB_NON_DRAM) + +static __initconst const u64 slm_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_READ|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_READ|SLM_LLC_MISS, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_WRITE|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_WRITE|SLM_LLC_MISS, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = SLM_DMND_PREFETCH|SLM_LLC_ACCESS, + [ C(RESULT_MISS) ] = SLM_DMND_PREFETCH|SLM_LLC_MISS, + }, + }, +}; + +static __initconst const u64 slm_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0104, /* LD_DCU_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* ICACHE.ACCESSES */ + [ C(RESULT_MISS) ] = 0x0280, /* ICACGE.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */ + [ C(RESULT_ACCESS) ] = 0x01b7, + /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */ + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0x0804, /* LD_DTLB_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) +{ + /* user explicitly requested branch sampling */ + if (has_branch_stack(event)) + return true; + + /* implicit branch sampling to correct PEBS skid */ + if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 && + x86_pmu.intel_cap.pebs_format < 2) + return true; + + return false; +} + +static void intel_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); + + intel_pmu_pebs_disable_all(); + intel_pmu_lbr_disable_all(); +} + +static void intel_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + intel_pmu_pebs_enable_all(); + intel_pmu_lbr_enable_all(); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, + x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask); + + if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_event *event = + cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!event)) + return; + + intel_pmu_enable_bts(event->hw.config); + } +} + +/* + * Workaround for: + * Intel Errata AAK100 (model 26) + * Intel Errata AAP53 (model 30) + * Intel Errata BD53 (model 44) + * + * The official story: + * These chips need to be 'reset' when adding counters by programming the + * magic three (non-counting) events 0x4300B5, 0x4300D2, and 0x4300B1 either + * in sequence on the same PMC or on different PMCs. + * + * In practise it appears some of these events do in fact count, and + * we need to programm all 4 events. + */ +static void intel_pmu_nhm_workaround(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + static const unsigned long nhm_magic[4] = { + 0x4300B5, + 0x4300D2, + 0x4300B1, + 0x4300B1 + }; + struct perf_event *event; + int i; + + /* + * The Errata requires below steps: + * 1) Clear MSR_IA32_PEBS_ENABLE and MSR_CORE_PERF_GLOBAL_CTRL; + * 2) Configure 4 PERFEVTSELx with the magic events and clear + * the corresponding PMCx; + * 3) set bit0~bit3 of MSR_CORE_PERF_GLOBAL_CTRL; + * 4) Clear MSR_CORE_PERF_GLOBAL_CTRL; + * 5) Clear 4 pairs of ERFEVTSELx and PMCx; + */ + + /* + * The real steps we choose are a little different from above. + * A) To reduce MSR operations, we don't run step 1) as they + * are already cleared before this function is called; + * B) Call x86_perf_event_update to save PMCx before configuring + * PERFEVTSELx with magic number; + * C) With step 5), we do clear only when the PERFEVTSELx is + * not used currently. + * D) Call x86_perf_event_set_period to restore PMCx; + */ + + /* We always operate 4 pairs of PERF Counters */ + for (i = 0; i < 4; i++) { + event = cpuc->events[i]; + if (event) + x86_perf_event_update(event); + } + + for (i = 0; i < 4; i++) { + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, nhm_magic[i]); + wrmsrl(MSR_ARCH_PERFMON_PERFCTR0 + i, 0x0); + } + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0xf); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); + + for (i = 0; i < 4; i++) { + event = cpuc->events[i]; + + if (event) { + x86_perf_event_set_period(event); + __x86_pmu_enable_event(&event->hw, + ARCH_PERFMON_EVENTSEL_ENABLE); + } else + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + i, 0x0); + } +} + +static void intel_pmu_nhm_enable_all(int added) +{ + if (added) + intel_pmu_nhm_workaround(); + intel_pmu_enable_all(added); +} + +static inline u64 intel_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void intel_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; + u64 ctrl_val, mask; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + wrmsrl(hwc->config_base, ctrl_val); +} + +static inline bool event_is_checkpointed(struct perf_event *event) +{ + return (event->hw.config & HSW_IN_TX_CHECKPOINTED) != 0; +} + +static void intel_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + intel_pmu_drain_bts_buffer(); + return; + } + + cpuc->intel_ctrl_guest_mask &= ~(1ull << hwc->idx); + cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); + cpuc->intel_cp_status &= ~(1ull << hwc->idx); + + /* + * must disable before any actual event + * because any event may be combined with LBR + */ + if (intel_pmu_needs_lbr_smpl(event)) + intel_pmu_lbr_disable(event); + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc); + return; + } + + x86_pmu_disable_event(event); + + if (unlikely(event->attr.precise_ip)) + intel_pmu_pebs_disable(event); +} + +static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) +{ + int idx = hwc->idx - INTEL_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + + /* + * ANY bit is supported in v3 and up + */ + if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY) + bits |= 0x4; + + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + wrmsrl(hwc->config_base, ctrl_val); +} + +static void intel_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (unlikely(hwc->idx == INTEL_PMC_IDX_FIXED_BTS)) { + if (!percpu_read(cpu_hw_events.enabled)) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + /* + * must enabled before any actual event + * because any event may be combined with LBR + */ + if (intel_pmu_needs_lbr_smpl(event)) + intel_pmu_lbr_enable(event); + + if (event->attr.exclude_host) + cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); + if (event->attr.exclude_guest) + cpuc->intel_ctrl_host_mask |= (1ull << hwc->idx); + + if (unlikely(event_is_checkpointed(event))) + cpuc->intel_cp_status |= (1ull << hwc->idx); + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc); + return; + } + + if (unlikely(event->attr.precise_ip)) + intel_pmu_pebs_enable(event); + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); +} + +/* + * Save and restart an expired event. Called by NMI contexts, + * so it has to be careful about preempting normal event ops: + */ +int intel_pmu_save_and_restart(struct perf_event *event) +{ + x86_perf_event_update(event); + /* + * For a checkpointed counter always reset back to 0. This + * avoids a situation where the counter overflows, aborts the + * transaction and is then set back to shortly before the + * overflow, and overflows and aborts again. + */ + if (unlikely(event_is_checkpointed(event))) { + /* No race with NMIs because the counter should not be armed */ + wrmsrl(event->hw.event_base, 0); + local64_set(&event->hw.prev_count, 0); + } + return x86_perf_event_set_period(event); +} + +static void intel_pmu_reset(void) +{ + struct debug_store *ds = percpu_read(cpu_hw_events.ds); + unsigned long flags; + int idx; + + if (!x86_pmu.num_counters) + return; + + local_irq_save(flags); + + printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + checking_wrmsrl(x86_pmu_config_addr(idx), 0ull); + checking_wrmsrl(x86_pmu_event_addr(idx), 0ull); + } + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) + checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + + if (ds) + ds->bts_index = ds->bts_buffer_base; + + local_irq_restore(flags); +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int bit, loops; + u64 status; + int handled; + + cpuc = &__get_cpu_var(cpu_hw_events); + + /* + * No known reason to not always do late ACK, + * but just in case do it opt-in. + */ + if (!x86_pmu.late_ack) + apic_write(APIC_LVTPC, APIC_DM_NMI); + intel_pmu_disable_all(); + handled = intel_pmu_drain_bts_buffer(); + status = intel_pmu_get_status(); + if (!status) { + intel_pmu_enable_all(0); + return handled; + } + + loops = 0; +again: + intel_pmu_ack_status(status); + if (++loops > 100) { + WARN_ONCE(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + intel_pmu_reset(); + goto done; + } + + inc_irq_stat(apic_perf_irqs); + + intel_pmu_lbr_read(); + + /* + * CondChgd bit 63 doesn't mean any overflow status. Ignore + * and clear the bit. + */ + if (__test_and_clear_bit(63, (unsigned long *)&status)) { + if (!status) + goto done; + } + + /* + * PEBS overflow sets bit 62 in the global status register + */ + if (__test_and_clear_bit(62, (unsigned long *)&status)) { + handled++; + x86_pmu.drain_pebs(regs); + } + + /* + * Checkpointed counters can lead to 'spurious' PMIs because the + * rollback caused by the PMI will have cleared the overflow status + * bit. Therefore always force probe these counters. + */ + status |= cpuc->intel_cp_status; + + for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + handled++; + + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + /* + * Repeat if there is more work to be done: + */ + status = intel_pmu_get_status(); + if (status) + goto again; + +done: + intel_pmu_enable_all(0); + /* + * Only unmask the NMI after the overflow counters + * have been reset. This avoids spurious NMIs on + * Haswell CPUs. + */ + if (x86_pmu.late_ack) + apic_write(APIC_LVTPC, APIC_DM_NMI); + return handled; +} + +static struct event_constraint * +intel_bts_constraints(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + unsigned int hw_event, bts_event; + + if (event->attr.freq) + return NULL; + + hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; + bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); + + if (unlikely(hw_event == bts_event && hwc->sample_period == 1)) + return &bts_constraint; + + return NULL; +} + +static int intel_alt_er(int idx) +{ + if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) + return idx; + + if (idx == EXTRA_REG_RSP_0) + return EXTRA_REG_RSP_1; + + if (idx == EXTRA_REG_RSP_1) + return EXTRA_REG_RSP_0; + + return idx; +} + +static void intel_fixup_er(struct perf_event *event, int idx) +{ + event->hw.extra_reg.idx = idx; + + if (idx == EXTRA_REG_RSP_0) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_0].event; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; + } else if (idx == EXTRA_REG_RSP_1) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= x86_pmu.extra_regs[EXTRA_REG_RSP_1].event; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; + } +} + +/* + * manage allocation of shared extra msr for certain events + * + * sharing can be: + * per-cpu: to be shared between the various events on a single PMU + * per-core: per-cpu + shared by HT threads + */ +static struct event_constraint * +__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event, + struct hw_perf_event_extra *reg) +{ + struct event_constraint *c = &emptyconstraint; + struct er_account *era; + unsigned long flags; + int idx = reg->idx; + + /* + * reg->alloc can be set due to existing state, so for fake cpuc we + * need to ignore this, otherwise we might fail to allocate proper fake + * state for this extra reg constraint. Also see the comment below. + */ + if (reg->alloc && !cpuc->is_fake) + return NULL; /* call x86_get_event_constraint() */ + +again: + era = &cpuc->shared_regs->regs[idx]; + /* + * we use spin_lock_irqsave() to avoid lockdep issues when + * passing a fake cpuc + */ + spin_lock_irqsave(&era->lock, flags); + + if (!atomic_read(&era->ref) || era->config == reg->config) { + + /* + * If its a fake cpuc -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + * + * Not doing the ER fixup will only result in era->reg being + * wrong, but since we won't actually try and program hardware + * this isn't a problem either. + */ + if (!cpuc->is_fake) { + if (idx != reg->idx) + intel_fixup_er(event, idx); + + /* + * x86_schedule_events() can call get_event_constraints() + * multiple times on events in the case of incremental + * scheduling(). reg->alloc ensures we only do the ER + * allocation once. + */ + reg->alloc = 1; + } + + /* lock in msr value */ + era->config = reg->config; + era->reg = reg->reg; + + /* one more user */ + atomic_inc(&era->ref); + + /* + * need to call x86_get_event_constraint() + * to check if associated event has constraints + */ + c = NULL; + } else { + idx = intel_alt_er(idx); + if (idx != reg->idx) { + spin_unlock_irqrestore(&era->lock, flags); + goto again; + } + } + spin_unlock_irqrestore(&era->lock, flags); + + return c; +} + +static void +__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, + struct hw_perf_event_extra *reg) +{ + struct er_account *era; + + /* + * Only put constraint if extra reg was actually allocated. Also takes + * care of event which do not use an extra shared reg. + * + * Also, if this is a fake cpuc we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent cpuc state + * either since it'll be thrown out. + */ + if (!reg->alloc || cpuc->is_fake) + return; + + era = &cpuc->shared_regs->regs[reg->idx]; + + /* one fewer user */ + atomic_dec(&era->ref); + + /* allocate again next time */ + reg->alloc = 0; +} + +static struct event_constraint * +intel_shared_regs_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct event_constraint *c = NULL, *d; + struct hw_perf_event_extra *xreg, *breg; + + xreg = &event->hw.extra_reg; + if (xreg->idx != EXTRA_REG_NONE) { + c = __intel_shared_reg_get_constraints(cpuc, event, xreg); + if (c == &emptyconstraint) + return c; + } + breg = &event->hw.branch_reg; + if (breg->idx != EXTRA_REG_NONE) { + d = __intel_shared_reg_get_constraints(cpuc, event, breg); + if (d == &emptyconstraint) { + __intel_shared_reg_put_constraints(cpuc, xreg); + c = d; + } + } + return c; +} + +struct event_constraint * +x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c; + + if (x86_pmu.event_constraints) { + for_each_event_constraint(c, x86_pmu.event_constraints) { + if ((event->hw.config & c->cmask) == c->code) { + /* hw.flags zeroed at initialization */ + event->hw.flags |= c->flags; + return c; + } + } + } + + return &unconstrained; +} + +static struct event_constraint * +intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c; + + c = intel_bts_constraints(event); + if (c) + return c; + + c = intel_pebs_constraints(event); + if (c) + return c; + + c = intel_shared_regs_constraints(cpuc, event); + if (c) + return c; + + return x86_get_event_constraints(cpuc, event); +} + +static void +intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct hw_perf_event_extra *reg; + + reg = &event->hw.extra_reg; + if (reg->idx != EXTRA_REG_NONE) + __intel_shared_reg_put_constraints(cpuc, reg); + + reg = &event->hw.branch_reg; + if (reg->idx != EXTRA_REG_NONE) + __intel_shared_reg_put_constraints(cpuc, reg); +} + +static void intel_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + event->hw.flags = 0; + intel_put_shared_regs_event_constraints(cpuc, event); +} + +static void intel_pebs_aliases_core2(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use INST_RETIRED.ANY_P + * (0x00c0), which is a PEBS capable event, to get the same + * count. + * + * INST_RETIRED.ANY_P counts the number of cycles that retires + * CNTMASK instructions. By setting CNTMASK to a value (16) + * larger than the maximum number of instructions that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less instructions, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. + */ + u64 alt_config = X86_CONFIG(.event=0xc0, .inv=1, .cmask=16); + + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static void intel_pebs_aliases_snb(struct perf_event *event) +{ + if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { + /* + * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P + * (0x003c) so that we can use it with PEBS. + * + * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't + * PEBS capable. However we can use UOPS_RETIRED.ALL + * (0x01c2), which is a PEBS capable event, to get the same + * count. + * + * UOPS_RETIRED.ALL counts the number of cycles that retires + * CNTMASK micro-ops. By setting CNTMASK to a value (16) + * larger than the maximum number of micro-ops that can be + * retired per cycle (4) and then inverting the condition, we + * count all cycles that retire 16 or less micro-ops, which + * is every cycle. + * + * Thereby we gain a PEBS capable cycle counter. + */ + u64 alt_config = X86_CONFIG(.event=0xc2, .umask=0x01, .inv=1, .cmask=16); + + alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK); + event->hw.config = alt_config; + } +} + +static int intel_pmu_hw_config(struct perf_event *event) +{ + int ret = x86_pmu_hw_config(event); + + if (ret) + return ret; + + if (event->attr.precise_ip && x86_pmu.pebs_aliases) + x86_pmu.pebs_aliases(event); + + if (intel_pmu_needs_lbr_smpl(event)) { + ret = intel_pmu_setup_lbr_filter(event); + if (ret) + return ret; + } + + if (event->attr.type != PERF_TYPE_RAW) + return 0; + + if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) + return 0; + + if (x86_pmu.version < 3) + return -EINVAL; + + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + + event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY; + + return 0; +} + +struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr) +{ + if (x86_pmu.guest_get_msrs) + return x86_pmu.guest_get_msrs(nr); + *nr = 0; + return NULL; +} +EXPORT_SYMBOL_GPL(perf_guest_get_msrs); + +static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + + arr[0].msr = MSR_CORE_PERF_GLOBAL_CTRL; + arr[0].host = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask; + arr[0].guest = x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_host_mask; + /* + * If PMU counter has PEBS enabled it is not enough to disable counter + * on a guest entry since PEBS memory write can overshoot guest entry + * and corrupt guest memory. Disabling PEBS solves the problem. + */ + arr[1].msr = MSR_IA32_PEBS_ENABLE; + arr[1].host = cpuc->pebs_enabled; + arr[1].guest = 0; + + *nr = 2; + return arr; +} + +static struct perf_guest_switch_msr *core_guest_get_msrs(int *nr) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct perf_guest_switch_msr *arr = cpuc->guest_switch_msrs; + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + + arr[idx].msr = x86_pmu_config_addr(idx); + arr[idx].host = arr[idx].guest = 0; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + + arr[idx].host = arr[idx].guest = + event->hw.config | ARCH_PERFMON_EVENTSEL_ENABLE; + + if (event->attr.exclude_host) + arr[idx].host &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + else if (event->attr.exclude_guest) + arr[idx].guest &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + } + + *nr = x86_pmu.num_counters; + return arr; +} + +static void core_pmu_enable_event(struct perf_event *event) +{ + if (!event->attr.exclude_host) + x86_pmu_enable_event(event); +} + +static void core_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct hw_perf_event *hwc = &cpuc->events[idx]->hw; + + if (!test_bit(idx, cpuc->active_mask) || + cpuc->events[idx]->attr.exclude_host) + continue; + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); + } +} + +static int hsw_hw_config(struct perf_event *event) +{ + int ret = intel_pmu_hw_config(event); + + if (ret) + return ret; + if (!boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has(X86_FEATURE_HLE)) + return 0; + event->hw.config |= event->attr.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED); + + /* + * IN_TX/IN_TX-CP filters are not supported by the Haswell PMU with + * PEBS or in ANY thread mode. Since the results are non-sensical forbid + * this combination. + */ + if ((event->hw.config & (HSW_IN_TX|HSW_IN_TX_CHECKPOINTED)) && + ((event->hw.config & ARCH_PERFMON_EVENTSEL_ANY) || + event->attr.precise_ip > 0)) + return -EOPNOTSUPP; + + if (event_is_checkpointed(event)) { + /* + * Sampling of checkpointed events can cause situations where + * the CPU constantly aborts because of a overflow, which is + * then checkpointed back and ignored. Forbid checkpointing + * for sampling. + * + * But still allow a long sampling period, so that perf stat + * from KVM works. + */ + if (event->attr.sample_period > 0 && + event->attr.sample_period < 0x7fffffff) + return -EOPNOTSUPP; + } + return 0; +} + +static struct event_constraint counter2_constraint = + EVENT_CONSTRAINT(0, 0x4, 0); + +static struct event_constraint * +hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +{ + struct event_constraint *c = intel_get_event_constraints(cpuc, event); + + /* Handle special quirk on in_tx_checkpointed only in counter 2 */ + if (event->hw.config & HSW_IN_TX_CHECKPOINTED) { + if (c->idxmsk64 & (1U << 2)) + return &counter2_constraint; + return &emptyconstraint; + } + + return c; +} + +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(pc, "config:19" ); +PMU_FORMAT_ATTR(any, "config:21" ); /* v3 + */ +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); +PMU_FORMAT_ATTR(in_tx, "config:32"); +PMU_FORMAT_ATTR(in_tx_cp, "config:33"); + +static struct attribute *intel_arch_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +ssize_t intel_event_sysfs_show(char *page, u64 config) +{ + u64 event = (config & ARCH_PERFMON_EVENTSEL_EVENT); + + return x86_event_sysfs_show(page, config, event); +} + +static __initconst const struct x86_pmu core_pmu = { + .name = "core", + .handle_irq = x86_pmu_handle_irq, + .disable_all = x86_pmu_disable_all, + .enable_all = core_pmu_enable_all, + .enable = core_pmu_enable_event, + .disable = x86_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, + .event_constraints = intel_core_event_constraints, + .guest_get_msrs = core_guest_get_msrs, + .format_attrs = intel_arch_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, +}; + +struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + struct intel_shared_regs *regs; + int i; + + regs = kzalloc_node(sizeof(struct intel_shared_regs), + GFP_KERNEL, cpu_to_node(cpu)); + if (regs) { + /* + * initialize the locks to keep lockdep happy + */ + for (i = 0; i < EXTRA_REG_MAX; i++) + spin_lock_init(®s->regs[i].lock); + + regs->core_id = -1; + } + return regs; +} + +static int intel_pmu_cpu_prepare(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + + if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map)) + return NOTIFY_OK; + + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +static void intel_pmu_cpu_starting(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + int core_id = topology_core_id(cpu); + int i; + + init_debug_store_on_cpu(cpu); + /* + * Deal with CPUs that don't clear their LBRs on power-up. + */ + intel_pmu_lbr_reset(); + + cpuc->lbr_sel = NULL; + + if (!cpuc->shared_regs) + return; + + if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) { + for_each_cpu(i, topology_thread_cpumask(cpu)) { + struct intel_shared_regs *pc; + + pc = per_cpu(cpu_hw_events, i).shared_regs; + if (pc && pc->core_id == core_id) { + cpuc->kfree_on_online = cpuc->shared_regs; + cpuc->shared_regs = pc; + break; + } + } + cpuc->shared_regs->core_id = core_id; + cpuc->shared_regs->refcnt++; + } + + if (x86_pmu.lbr_sel_map) + cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR]; +} + +static void intel_pmu_cpu_dying(int cpu) +{ + struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); + struct intel_shared_regs *pc; + + pc = cpuc->shared_regs; + if (pc) { + if (pc->core_id == -1 || --pc->refcnt == 0) + kfree(pc); + cpuc->shared_regs = NULL; + } + + fini_debug_store_on_cpu(cpu); +} + +PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); + +PMU_FORMAT_ATTR(ldlat, "config1:0-15"); + +static struct attribute *intel_arch3_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_any.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + &format_attr_in_tx.attr, + &format_attr_in_tx_cp.attr, + + &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ + &format_attr_ldlat.attr, /* PEBS load latency */ + NULL, +}; + +static void intel_pmu_flush_branch_stack(void) +{ + /* + * Intel LBR does not tag entries with the + * PID of the current task, then we need to + * flush it on ctxsw + * For now, we simply reset it + */ + if (x86_pmu.lbr_nr) + intel_pmu_lbr_reset(); +} + +static __initconst const struct x86_pmu intel_pmu = { + .name = "Intel", + .handle_irq = intel_pmu_handle_irq, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, + .enable = intel_pmu_enable_event, + .disable = intel_pmu_disable_event, + .hw_config = intel_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .get_event_constraints = intel_get_event_constraints, + .put_event_constraints = intel_put_event_constraints, + .pebs_aliases = intel_pebs_aliases_core2, + + .format_attrs = intel_arch3_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, + + .cpu_prepare = intel_pmu_cpu_prepare, + .cpu_starting = intel_pmu_cpu_starting, + .cpu_dying = intel_pmu_cpu_dying, + .guest_get_msrs = intel_guest_get_msrs, + .flush_branch_stack = intel_pmu_flush_branch_stack, +}; + +static __init void intel_clovertown_quirk(void) +{ + /* + * PEBS is unreliable due to: + * + * AJ67 - PEBS may experience CPL leaks + * AJ68 - PEBS PMI may be delayed by one event + * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12] + * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS + * + * AJ67 could be worked around by restricting the OS/USR flags. + * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI. + * + * AJ106 could possibly be worked around by not allowing LBR + * usage from PEBS, including the fixup. + * AJ68 could possibly be worked around by always programming + * a pebs_event_reset[0] value and coping with the lost events. + * + * But taken together it might just make sense to not enable PEBS on + * these chips. + */ + printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); + x86_pmu.pebs = 0; + x86_pmu.pebs_constraints = NULL; +} + +static int intel_snb_pebs_broken(int cpu) +{ + u32 rev = UINT_MAX; /* default to broken for unknown models */ + + switch (cpu_data(cpu).x86_model) { + case 42: /* SNB */ + rev = 0x28; + break; + + case 45: /* SNB-EP */ + switch (cpu_data(cpu).x86_mask) { + case 6: rev = 0x618; break; + case 7: rev = 0x70c; break; + } + } + + return (cpu_data(cpu).microcode < rev); +} + +static void intel_snb_check_microcode(void) +{ + int pebs_broken = 0; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + if ((pebs_broken = intel_snb_pebs_broken(cpu))) + break; + } + put_online_cpus(); + + if (pebs_broken == x86_pmu.pebs_broken) + return; + + /* + * Serialized by the microcode lock.. + */ + if (x86_pmu.pebs_broken) { + pr_info("PEBS enabled due to microcode update\n"); + x86_pmu.pebs_broken = 0; + } else { + pr_info("PEBS disabled due to CPU errata, please upgrade microcode\n"); + x86_pmu.pebs_broken = 1; + } +} + +static __init void intel_sandybridge_quirk(void) +{ + x86_pmu.check_microcode = intel_snb_check_microcode; + intel_snb_check_microcode(); +} + +static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { + { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, + { PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, + { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, + { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, + { PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, + { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, + { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, +}; + +static __init void intel_arch_events_quirk(void) +{ + int bit; + + /* disable event that reported as not presend by cpuid */ + for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { + intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; + printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n", + intel_arch_events_map[bit].name); + } +} + +static __init void intel_nehalem_quirk(void) +{ + union cpuid10_ebx ebx; + + ebx.full = x86_pmu.events_maskl; + if (ebx.split.no_branch_misses_retired) { + /* + * Erratum AAJ80 detected, we work it around by using + * the BR_MISP_EXEC.ANY event. This will over-count + * branch-misses, but it's still much better than the + * architectural event which is often completely bogus: + */ + intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + ebx.split.no_branch_misses_retired = 0; + x86_pmu.events_maskl = ebx.full; + printk(KERN_INFO "CPU erratum AAJ80 worked around\n"); + } +} + +EVENT_ATTR_STR(mem-loads, mem_ld_hsw, "event=0xcd,umask=0x1,ldlat=3"); +EVENT_ATTR_STR(mem-stores, mem_st_hsw, "event=0xd0,umask=0x82") + +/* Haswell special events */ +EVENT_ATTR_STR(tx-start, tx_start, "event=0xc9,umask=0x1"); +EVENT_ATTR_STR(tx-commit, tx_commit, "event=0xc9,umask=0x2"); +EVENT_ATTR_STR(tx-abort, tx_abort, "event=0xc9,umask=0x4"); +EVENT_ATTR_STR(tx-capacity, tx_capacity, "event=0x54,umask=0x2"); +EVENT_ATTR_STR(tx-conflict, tx_conflict, "event=0x54,umask=0x1"); +EVENT_ATTR_STR(el-start, el_start, "event=0xc8,umask=0x1"); +EVENT_ATTR_STR(el-commit, el_commit, "event=0xc8,umask=0x2"); +EVENT_ATTR_STR(el-abort, el_abort, "event=0xc8,umask=0x4"); +EVENT_ATTR_STR(el-capacity, el_capacity, "event=0x54,umask=0x2"); +EVENT_ATTR_STR(el-conflict, el_conflict, "event=0x54,umask=0x1"); +EVENT_ATTR_STR(cycles-t, cycles_t, "event=0x3c,in_tx=1"); +EVENT_ATTR_STR(cycles-ct, cycles_ct, "event=0x3c,in_tx=1,in_tx_cp=1"); + +static struct attribute *hsw_events_attrs[] = { + EVENT_PTR(tx_start), + EVENT_PTR(tx_commit), + EVENT_PTR(tx_abort), + EVENT_PTR(tx_capacity), + EVENT_PTR(tx_conflict), + EVENT_PTR(el_start), + EVENT_PTR(el_commit), + EVENT_PTR(el_abort), + EVENT_PTR(el_capacity), + EVENT_PTR(el_conflict), + EVENT_PTR(cycles_t), + EVENT_PTR(cycles_ct), + EVENT_PTR(mem_ld_hsw), + EVENT_PTR(mem_st_hsw), + NULL +}; + +__init int intel_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + union cpuid10_ebx ebx; + struct event_constraint *c; + unsigned int unused; + int version; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + switch (boot_cpu_data.x86) { + case 0x6: + return p6_pmu_init(); + case 0xf: + return p4_pmu_init(); + } + return -ENODEV; + } + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired hw_event or not. + */ + cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) + return -ENODEV; + + version = eax.split.version_id; + if (version < 2) + x86_pmu = core_pmu; + else + x86_pmu = intel_pmu; + + x86_pmu.version = version; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.cntval_bits = eax.split.bit_width; + x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; + + x86_pmu.events_maskl = ebx.full; + x86_pmu.events_mask_len = eax.split.mask_length; + + x86_pmu.max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, x86_pmu.num_counters); + + /* + * Quirk: v2 perfmon does not report fixed-purpose events, so + * assume at least 3 events: + */ + if (version > 1) + x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); + + /* + * v2 and above have a perf capabilities MSR + */ + if (version > 1) { + u64 capabilities; + + rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); + x86_pmu.intel_cap.capabilities = capabilities; + } + + intel_ds_init(); + + x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ + + /* + * Install the hw-cache-events table: + */ + switch (boot_cpu_data.x86_model) { + case 14: /* 65 nm core solo/duo, "Yonah" */ + pr_cont("Core events, "); + break; + + case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + x86_add_quirk(intel_clovertown_quirk); + case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 29: /* six-core 45 nm xeon "Dunnington" */ + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + intel_pmu_lbr_init_core(); + + x86_pmu.event_constraints = intel_core2_event_constraints; + x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints; + pr_cont("Core2 events, "); + break; + + case 26: /* 45 nm nehalem, "Bloomfield" */ + case 30: /* 45 nm nehalem, "Lynnfield" */ + case 46: /* 45 nm nehalem-ex, "Beckton" */ + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_nehalem_event_constraints; + x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; + x86_pmu.enable_all = intel_pmu_nhm_enable_all; + x86_pmu.extra_regs = intel_nehalem_extra_regs; + + x86_pmu.cpu_events = nhm_events_attrs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + + x86_add_quirk(intel_nehalem_quirk); + + pr_cont("Nehalem events, "); + break; + + case 28: /* Atom */ + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + intel_pmu_lbr_init_atom(); + + x86_pmu.event_constraints = intel_gen_event_constraints; + x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints; + pr_cont("Atom events, "); + break; + + case 55: /* Atom 22nm "Silvermont" */ + case 77: /* Avoton "Silvermont" */ + memcpy(hw_cache_event_ids, slm_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, slm_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_atom(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints; + x86_pmu.extra_regs = intel_slm_extra_regs; + x86_pmu.er_flags |= ERF_HAS_RSP_1; + pr_cont("Silvermont events, "); + break; + + case 37: /* 32 nm nehalem, "Clarkdale" */ + case 44: /* 32 nm nehalem, "Gulftown" */ + case 47: /* 32 nm Xeon E7 */ + memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_nhm(); + + x86_pmu.event_constraints = intel_westmere_event_constraints; + x86_pmu.enable_all = intel_pmu_nhm_enable_all; + x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; + x86_pmu.extra_regs = intel_westmere_extra_regs; + x86_pmu.er_flags |= ERF_HAS_RSP_1; + + x86_pmu.cpu_events = nhm_events_attrs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x3f, .inv=1, .cmask=1); + + pr_cont("Westmere events, "); + break; + + case 42: /* SandyBridge */ + case 45: /* SandyBridge, "Romely-EP" */ + x86_add_quirk(intel_sandybridge_quirk); + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_nhm(); + + x86_pmu.event_constraints = intel_snb_event_constraints; + x86_pmu.pebs_constraints = intel_snb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + if (boot_cpu_data.x86_model == 45) + x86_pmu.extra_regs = intel_snbep_extra_regs; + else + x86_pmu.extra_regs = intel_snb_extra_regs; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.er_flags |= ERF_NO_HT_SHARING; + + x86_pmu.cpu_events = snb_events_attrs; + + /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = + X86_CONFIG(.event=0xb1, .umask=0x01, .inv=1, .cmask=1); + + pr_cont("SandyBridge events, "); + break; + case 58: /* IvyBridge */ + case 62: /* IvyBridge EP */ + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_snb_event_constraints; + x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + if (boot_cpu_data.x86_model == 62) + x86_pmu.extra_regs = intel_snbep_extra_regs; + else + x86_pmu.extra_regs = intel_snb_extra_regs; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.er_flags |= ERF_NO_HT_SHARING; + + x86_pmu.cpu_events = snb_events_attrs; + + /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = + X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + + pr_cont("IvyBridge events, "); + break; + + + case 60: /* Haswell Client */ + case 70: + case 71: + case 63: + case 69: + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + + intel_pmu_lbr_init_snb(); + + x86_pmu.event_constraints = intel_hsw_event_constraints; + x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints; + x86_pmu.extra_regs = intel_snb_extra_regs; + x86_pmu.pebs_aliases = intel_pebs_aliases_snb; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.er_flags |= ERF_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = hsw_get_event_constraints; + x86_pmu.cpu_events = hsw_events_attrs; + x86_pmu.lbr_double_abort = true; + pr_cont("Haswell events, "); + break; + + default: + switch (x86_pmu.version) { + case 1: + x86_pmu.event_constraints = intel_v1_event_constraints; + pr_cont("generic architected perfmon v1, "); + break; + default: + /* + * default constraints for v2 and up + */ + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("generic architected perfmon, "); + break; + } + } + + if (x86_pmu.num_counters > INTEL_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + x86_pmu.num_counters, INTEL_PMC_MAX_GENERIC); + x86_pmu.num_counters = INTEL_PMC_MAX_GENERIC; + } + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + + if (x86_pmu.num_counters_fixed > INTEL_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + x86_pmu.num_counters_fixed, INTEL_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = INTEL_PMC_MAX_FIXED; + } + + x86_pmu.intel_ctrl |= + ((1LL << x86_pmu.num_counters_fixed)-1) << INTEL_PMC_IDX_FIXED; + + if (x86_pmu.event_constraints) { + /* + * event on fixed counter2 (REF_CYCLES) only works on this + * counter, so do not extend mask to generic counters + */ + for_each_event_constraint(c, x86_pmu.event_constraints) { + if (c->cmask != FIXED_EVENT_FLAGS + || c->idxmsk64 == INTEL_PMC_MSK_FIXED_REF_CYCLES) { + continue; + } + + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + c->weight += x86_pmu.num_counters; + } + } + + /* Support full width counters using alternative MSR range */ + if (x86_pmu.intel_cap.full_width_write) { + x86_pmu.max_period = x86_pmu.cntval_mask; + x86_pmu.perfctr = MSR_IA32_PMC0; + pr_cont("full-width counters, "); + } + + return 0; +} diff -uprN linux-2.6.32/arch/x86/kernel/cpu/perf_event_intel_ds.c linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_intel_ds.c --- linux-2.6.32/arch/x86/kernel/cpu/perf_event_intel_ds.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_intel_ds.c 2015-03-10 13:24:14.000000000 -0700 @@ -0,0 +1,1069 @@ +#include +#include +#include + +#include +#include + +#include "perf_event.h" + +/* The size of a BTS record in bytes: */ +#define BTS_RECORD_SIZE 24 + +#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) +#define PEBS_BUFFER_SIZE PAGE_SIZE + +/* + * pebs_record_32 for p4 and core not supported + +struct pebs_record_32 { + u32 flags, ip; + u32 ax, bc, cx, dx; + u32 si, di, bp, sp; +}; + + */ + +union intel_x86_pebs_dse { + u64 val; + struct { + unsigned int ld_dse:4; + unsigned int ld_stlb_miss:1; + unsigned int ld_locked:1; + unsigned int ld_reserved:26; + }; + struct { + unsigned int st_l1d_hit:1; + unsigned int st_reserved1:3; + unsigned int st_stlb_miss:1; + unsigned int st_locked:1; + unsigned int st_reserved2:26; + }; +}; + + +/* + * Map PEBS Load Latency Data Source encodings to generic + * memory data source information + */ +#define P(a, b) PERF_MEM_S(a, b) +#define OP_LH (P(OP, LOAD) | P(LVL, HIT)) +#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) + +static const u64 pebs_data_source[] = { + P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ + OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ + OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ + OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ + OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ + OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ + OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ + OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ + OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ + OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ + OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ + OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ + OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */ + OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */ + OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */ + OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ +}; + +static u64 precise_store_data(u64 status) +{ + union intel_x86_pebs_dse dse; + u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2); + + dse.val = status; + + /* + * bit 4: TLB access + * 1 = stored missed 2nd level TLB + * + * so it either hit the walker or the OS + * otherwise hit 2nd level TLB + */ + if (dse.st_stlb_miss) + val |= P(TLB, MISS); + else + val |= P(TLB, HIT); + + /* + * bit 0: hit L1 data cache + * if not set, then all we know is that + * it missed L1D + */ + if (dse.st_l1d_hit) + val |= P(LVL, HIT); + else + val |= P(LVL, MISS); + + /* + * bit 5: Locked prefix + */ + if (dse.st_locked) + val |= P(LOCK, LOCKED); + + return val; +} + +static u64 precise_store_data_hsw(struct perf_event *event, u64 status) +{ + union perf_mem_data_src dse; + u64 cfg = event->hw.config & INTEL_ARCH_EVENT_MASK; + + dse.val = 0; + dse.mem_op = PERF_MEM_OP_STORE; + dse.mem_lvl = PERF_MEM_LVL_NA; + + /* + * L1 info only valid for following events: + * + * MEM_UOPS_RETIRED.STLB_MISS_STORES + * MEM_UOPS_RETIRED.LOCK_STORES + * MEM_UOPS_RETIRED.SPLIT_STORES + * MEM_UOPS_RETIRED.ALL_STORES + */ + if (cfg != 0x12d0 && cfg != 0x22d0 && cfg != 0x42d0 && cfg != 0x82d0) + return dse.mem_lvl; + + if (status & 1) + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + else + dse.mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; + + /* Nothing else supported. Sorry. */ + return dse.val; +} + +static u64 load_latency_data(u64 status) +{ + union intel_x86_pebs_dse dse; + u64 val; + int model = boot_cpu_data.x86_model; + int fam = boot_cpu_data.x86; + + dse.val = status; + + /* + * use the mapping table for bit 0-3 + */ + val = pebs_data_source[dse.ld_dse]; + + /* + * Nehalem models do not support TLB, Lock infos + */ + if (fam == 0x6 && (model == 26 || model == 30 + || model == 31 || model == 46)) { + val |= P(TLB, NA) | P(LOCK, NA); + return val; + } + /* + * bit 4: TLB access + * 0 = did not miss 2nd level TLB + * 1 = missed 2nd level TLB + */ + if (dse.ld_stlb_miss) + val |= P(TLB, MISS) | P(TLB, L2); + else + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + /* + * bit 5: locked prefix + */ + if (dse.ld_locked) + val |= P(LOCK, LOCKED); + + return val; +} + +struct pebs_record_core { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; +}; + +struct pebs_record_nhm { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; + u64 status, dla, dse, lat; +}; + +/* + * Same as pebs_record_nhm, with two additional fields. + */ +struct pebs_record_hsw { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; + u64 status, dla, dse, lat; + u64 real_ip, tsx_tuning; +}; + +union hsw_tsx_tuning { + struct { + u32 cycles_last_block : 32, + hle_abort : 1, + rtm_abort : 1, + instruction_abort : 1, + non_instruction_abort : 1, + retry : 1, + data_conflict : 1, + capacity_writes : 1, + capacity_reads : 1; + }; + u64 value; +}; + +#define PEBS_HSW_TSX_FLAGS 0xff00000000ULL + +void init_debug_store_on_cpu(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, + (u32)((u64)(unsigned long)ds), + (u32)((u64)(unsigned long)ds >> 32)); +} + +void fini_debug_store_on_cpu(int cpu) +{ + if (!per_cpu(cpu_hw_events, cpu).ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); +} + +static int alloc_pebs_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int node = cpu_to_node(cpu); + int max, thresh = 1; /* always use a single PEBS record */ + void *buffer; + + if (!x86_pmu.pebs) + return 0; + + buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + if (unlikely(!buffer)) + return -ENOMEM; + + max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; + + ds->pebs_buffer_base = (u64)(unsigned long)buffer; + ds->pebs_index = ds->pebs_buffer_base; + ds->pebs_absolute_maximum = ds->pebs_buffer_base + + max * x86_pmu.pebs_record_size; + + ds->pebs_interrupt_threshold = ds->pebs_buffer_base + + thresh * x86_pmu.pebs_record_size; + + return 0; +} + +static void release_pebs_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds || !x86_pmu.pebs) + return; + + kfree((void *)(unsigned long)ds->pebs_buffer_base); + ds->pebs_buffer_base = 0; +} + +static int alloc_bts_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + int node = cpu_to_node(cpu); + int max, thresh; + void *buffer; + + if (!x86_pmu.bts) + return 0; + + buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node); + if (unlikely(!buffer)) + return -ENOMEM; + + max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; + thresh = max / 16; + + ds->bts_buffer_base = (u64)(unsigned long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = ds->bts_buffer_base + + max * BTS_RECORD_SIZE; + ds->bts_interrupt_threshold = ds->bts_absolute_maximum - + thresh * BTS_RECORD_SIZE; + + return 0; +} + +static void release_bts_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds || !x86_pmu.bts) + return; + + kfree((void *)(unsigned long)ds->bts_buffer_base); + ds->bts_buffer_base = 0; +} + +static int alloc_ds_buffer(int cpu) +{ + int node = cpu_to_node(cpu); + struct debug_store *ds; + + ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node); + if (unlikely(!ds)) + return -ENOMEM; + + per_cpu(cpu_hw_events, cpu).ds = ds; + + return 0; +} + +static void release_ds_buffer(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + return; + + per_cpu(cpu_hw_events, cpu).ds = NULL; + kfree(ds); +} + +void release_ds_buffers(void) +{ + int cpu; + + if (!x86_pmu.bts && !x86_pmu.pebs) + return; + + get_online_cpus(); + for_each_online_cpu(cpu) + fini_debug_store_on_cpu(cpu); + + for_each_possible_cpu(cpu) { + release_pebs_buffer(cpu); + release_bts_buffer(cpu); + release_ds_buffer(cpu); + } + put_online_cpus(); +} + +void reserve_ds_buffers(void) +{ + int bts_err = 0, pebs_err = 0; + int cpu; + + x86_pmu.bts_active = 0; + x86_pmu.pebs_active = 0; + + if (!x86_pmu.bts && !x86_pmu.pebs) + return; + + if (!x86_pmu.bts) + bts_err = 1; + + if (!x86_pmu.pebs) + pebs_err = 1; + + get_online_cpus(); + + for_each_possible_cpu(cpu) { + if (alloc_ds_buffer(cpu)) { + bts_err = 1; + pebs_err = 1; + } + + if (!bts_err && alloc_bts_buffer(cpu)) + bts_err = 1; + + if (!pebs_err && alloc_pebs_buffer(cpu)) + pebs_err = 1; + + if (bts_err && pebs_err) + break; + } + + if (bts_err) { + for_each_possible_cpu(cpu) + release_bts_buffer(cpu); + } + + if (pebs_err) { + for_each_possible_cpu(cpu) + release_pebs_buffer(cpu); + } + + if (bts_err && pebs_err) { + for_each_possible_cpu(cpu) + release_ds_buffer(cpu); + } else { + if (x86_pmu.bts && !bts_err) + x86_pmu.bts_active = 1; + + if (x86_pmu.pebs && !pebs_err) + x86_pmu.pebs_active = 1; + + for_each_online_cpu(cpu) + init_debug_store_on_cpu(cpu); + } + + put_online_cpus(); +} + +/* + * BTS + */ + +struct event_constraint bts_constraint = + EVENT_CONSTRAINT(0, 1ULL << INTEL_PMC_IDX_FIXED_BTS, 0); + +void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= DEBUGCTLMSR_TR; + debugctlmsr |= DEBUGCTLMSR_BTS; + debugctlmsr |= DEBUGCTLMSR_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +void intel_pmu_disable_bts(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT | + DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + +int intel_pmu_drain_bts_buffer(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_event *event = cpuc->events[INTEL_PMC_IDX_FIXED_BTS]; + struct bts_record *at, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + struct pt_regs regs; + + if (!event) + return 0; + + if (!x86_pmu.bts_active) + return 0; + + at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; + + if (top <= at) + return 0; + + memset(®s, 0, sizeof(regs)); + + ds->bts_index = ds->bts_buffer_base; + + perf_sample_data_init(&data, 0, event->hw.last_period); + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, event, ®s); + + if (perf_output_begin(&handle, event, header.size * (top - at))) + return 1; + + for (; at < top; at++) { + data.ip = at->from; + data.addr = at->to; + + perf_output_sample(&handle, &header, &data, event); + } + + perf_output_end(&handle); + + /* There's new data available. */ + event->hw.interrupts++; + event->pending_kill = POLL_IN; + return 1; +} + +/* + * PEBS + */ +struct event_constraint intel_core2_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ + INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_atom_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */ + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_slm_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x0103, 0x1), /* REHABQ.LD_BLOCK_ST_FORWARD_PS */ + INTEL_UEVENT_CONSTRAINT(0x0803, 0x1), /* REHABQ.LD_SPLITS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0204, 0x1), /* MEM_UOPS_RETIRED.L2_HIT_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0404, 0x1), /* MEM_UOPS_RETIRED.L2_MISS_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x0804, 0x1), /* MEM_UOPS_RETIRED.DTLB_MISS_LOADS_PS */ + INTEL_UEVENT_CONSTRAINT(0x2004, 0x1), /* MEM_UOPS_RETIRED.HITM_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c4, 0x1), /* BR_INST_RETIRED.ALL_BRANCHES_PS */ + INTEL_UEVENT_CONSTRAINT(0x7ec4, 0x1), /* BR_INST_RETIRED.JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0xbfc4, 0x1), /* BR_INST_RETIRED.FAR_BRANCH_PS */ + INTEL_UEVENT_CONSTRAINT(0xebc4, 0x1), /* BR_INST_RETIRED.NON_RETURN_IND_PS */ + INTEL_UEVENT_CONSTRAINT(0xf7c4, 0x1), /* BR_INST_RETIRED.RETURN_PS */ + INTEL_UEVENT_CONSTRAINT(0xf9c4, 0x1), /* BR_INST_RETIRED.CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfbc4, 0x1), /* BR_INST_RETIRED.IND_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfdc4, 0x1), /* BR_INST_RETIRED.REL_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfec4, 0x1), /* BR_INST_RETIRED.TAKEN_JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_MISP_RETIRED.ALL_BRANCHES_PS */ + INTEL_UEVENT_CONSTRAINT(0x7ec5, 0x1), /* BR_INST_MISP_RETIRED.JCC_PS */ + INTEL_UEVENT_CONSTRAINT(0xebc5, 0x1), /* BR_INST_MISP_RETIRED.NON_RETURN_IND_PS */ + INTEL_UEVENT_CONSTRAINT(0xf7c5, 0x1), /* BR_INST_MISP_RETIRED.RETURN_PS */ + INTEL_UEVENT_CONSTRAINT(0xfbc5, 0x1), /* BR_INST_MISP_RETIRED.IND_CALL_PS */ + INTEL_UEVENT_CONSTRAINT(0xfec5, 0x1), /* BR_INST_MISP_RETIRED.TAKEN_JCC_PS */ + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_nehalem_pebs_event_constraints[] = { + INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ + INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */ + INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_westmere_pebs_event_constraints[] = { + INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ + INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */ + INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */ + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_snb_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ + INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */ + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_ivb_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ + INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ + INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ + INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ + INTEL_EVENT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ + EVENT_CONSTRAINT_END +}; + +struct event_constraint intel_hsw_pebs_event_constraints[] = { + INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ + INTEL_PST_HSW_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ + INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ + INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ + INTEL_UEVENT_CONSTRAINT(0x01c5, 0xf), /* BR_MISP_RETIRED.CONDITIONAL */ + INTEL_UEVENT_CONSTRAINT(0x04c5, 0xf), /* BR_MISP_RETIRED.ALL_BRANCHES */ + INTEL_UEVENT_CONSTRAINT(0x20c5, 0xf), /* BR_MISP_RETIRED.NEAR_TAKEN */ + INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.* */ + /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), + /* MEM_UOPS_RETIRED.STLB_MISS_STORES */ + INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), + INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */ + INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */ + /* MEM_UOPS_RETIRED.SPLIT_STORES */ + INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), + INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */ + INTEL_PST_HSW_CONSTRAINT(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */ + INTEL_UEVENT_CONSTRAINT(0x01d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L1_HIT */ + INTEL_UEVENT_CONSTRAINT(0x02d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L2_HIT */ + INTEL_UEVENT_CONSTRAINT(0x04d1, 0xf), /* MEM_LOAD_UOPS_RETIRED.L3_HIT */ + /* MEM_LOAD_UOPS_RETIRED.HIT_LFB */ + INTEL_UEVENT_CONSTRAINT(0x40d1, 0xf), + /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS */ + INTEL_UEVENT_CONSTRAINT(0x01d2, 0xf), + /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT */ + INTEL_UEVENT_CONSTRAINT(0x02d2, 0xf), + /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM */ + INTEL_UEVENT_CONSTRAINT(0x01d3, 0xf), + INTEL_UEVENT_CONSTRAINT(0x04c8, 0xf), /* HLE_RETIRED.Abort */ + INTEL_UEVENT_CONSTRAINT(0x04c9, 0xf), /* RTM_RETIRED.Abort */ + + EVENT_CONSTRAINT_END +}; + +struct event_constraint *intel_pebs_constraints(struct perf_event *event) +{ + struct event_constraint *c; + + if (!event->attr.precise_ip) + return NULL; + + if (x86_pmu.pebs_constraints) { + for_each_event_constraint(c, x86_pmu.pebs_constraints) { + if ((event->hw.config & c->cmask) == c->code) { + event->hw.flags |= c->flags; + return c; + } + } + } + + return &emptyconstraint; +} + +void intel_pmu_pebs_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; + + cpuc->pebs_enabled |= 1ULL << hwc->idx; + + if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) + cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); + else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) + cpuc->pebs_enabled |= 1ULL << 63; +} + +void intel_pmu_pebs_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + cpuc->pebs_enabled &= ~(1ULL << hwc->idx); + if (cpuc->enabled) + wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); + + hwc->config |= ARCH_PERFMON_EVENTSEL_INT; +} + +void intel_pmu_pebs_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->pebs_enabled) + wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); +} + +void intel_pmu_pebs_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->pebs_enabled) + wrmsrl(MSR_IA32_PEBS_ENABLE, 0); +} + +static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long from = cpuc->lbr_entries[0].from; + unsigned long old_to, to = cpuc->lbr_entries[0].to; + unsigned long ip = regs->ip; + + /* + * We don't need to fixup if the PEBS assist is fault like + */ + if (!x86_pmu.intel_cap.pebs_trap) + return 1; + + /* + * No LBR entry, no basic block, no rewinding + */ + if (!cpuc->lbr_stack.nr || !from || !to) + return 0; + + /* + * Basic blocks should never cross user/kernel boundaries + */ + if (kernel_ip(ip) != kernel_ip(to)) + return 0; + + /* + * unsigned math, either ip is before the start (impossible) or + * the basic block is larger than 1 page (sanity) + */ + if ((ip - to) > PAGE_SIZE) + return 0; + + /* + * We sampled a branch insn, rewind using the LBR stack + */ + if (ip == to) { + regs->ip = from; + return 1; + } + + do { + struct insn insn; + u8 buf[MAX_INSN_SIZE]; + void *kaddr; + + old_to = to; + if (!kernel_ip(ip)) { + int bytes, size = MAX_INSN_SIZE; + + bytes = copy_from_user_nmi(buf, (void __user *)to, size); + if (bytes != size) + return 0; + + kaddr = buf; + } else + kaddr = (void *)to; + + kernel_insn_init(&insn, kaddr); + insn_get_length(&insn); + to += insn.length; + } while (to < ip); + + if (to == ip) { + regs->ip = old_to; + return 1; + } + + /* + * Even though we decoded the basic block, the instruction stream + * never matched the given IP, either the TO or the IP got corrupted. + */ + return 0; +} + +static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs) +{ + if (pebs->tsx_tuning) { + union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; + return tsx.cycles_last_block; + } + return 0; +} + +static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs) +{ + u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; + + /* For RTM XABORTs also log the abort code from AX */ + if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1)) + txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT; + return txn; +} + +static void __intel_pmu_pebs_event(struct perf_event *event, + struct pt_regs *iregs, void *__pebs) +{ + /* + * We cast to the biggest pebs_record but are careful not to + * unconditionally access the 'extra' entries. + */ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct pebs_record_hsw *pebs = __pebs; + struct perf_sample_data data; + struct pt_regs regs; + u64 sample_type; + int fll, fst; + + if (!intel_pmu_save_and_restart(event)) + return; + + fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT; + fst = event->hw.flags & (PERF_X86_EVENT_PEBS_ST | + PERF_X86_EVENT_PEBS_ST_HSW); + + perf_sample_data_init(&data, 0, event->hw.last_period); + + data.period = event->hw.last_period; + sample_type = event->attr.sample_type; + + /* + * if PEBS-LL or PreciseStore + */ + if (fll || fst) { + /* + * Use latency for weight (only avail with PEBS-LL) + */ + if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) + data.weight = pebs->lat; + + /* + * data.data_src encodes the data source + */ + if (sample_type & PERF_SAMPLE_DATA_SRC) { + if (fll) + data.data_src.val = load_latency_data(pebs->dse); + else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST_HSW) + data.data_src.val = + precise_store_data_hsw(event, pebs->dse); + else + data.data_src.val = precise_store_data(pebs->dse); + } + } + + /* + * We use the interrupt regs as a base because the PEBS record + * does not contain a full regs set, specifically it seems to + * lack segment descriptors, which get used by things like + * user_mode(). + * + * In the simple case fix up only the IP and BP,SP regs, for + * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly. + * A possible PERF_SAMPLE_REGS will have to transfer all regs. + */ + regs = *iregs; + regs.ip = pebs->ip; + regs.bp = pebs->bp; + regs.sp = pebs->sp; + + if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { + regs.ip = pebs->real_ip; + regs.flags |= PERF_EFLAGS_EXACT; + } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(®s)) + regs.flags |= PERF_EFLAGS_EXACT; + else + regs.flags &= ~PERF_EFLAGS_EXACT; + + if ((event->attr.sample_type & PERF_SAMPLE_ADDR) && + x86_pmu.intel_cap.pebs_format >= 1) + data.addr = pebs->dla; + + if (x86_pmu.intel_cap.pebs_format >= 2) { + /* Only set the TSX weight when no memory weight. */ + if ((event->attr.sample_type & PERF_SAMPLE_WEIGHT) && !fll) + data.weight = intel_hsw_weight(pebs); + + if (event->attr.sample_type & PERF_SAMPLE_TRANSACTION) + data.txn = intel_hsw_transaction(pebs); + } + + if (has_branch_stack(event)) + data.br_stack = &cpuc->lbr_stack; + + if (perf_event_overflow(event, &data, ®s)) + x86_pmu_stop(event, 0); +} + +static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct perf_event *event = cpuc->events[0]; /* PMC0 only */ + struct pebs_record_core *at, *top; + int n; + + if (!x86_pmu.pebs_active) + return; + + at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; + top = (struct pebs_record_core *)(unsigned long)ds->pebs_index; + + /* + * Whatever else happens, drain the thing + */ + ds->pebs_index = ds->pebs_buffer_base; + + if (!test_bit(0, cpuc->active_mask)) + return; + + WARN_ON_ONCE(!event); + + if (!event->attr.precise_ip) + return; + + n = top - at; + if (n <= 0) + return; + + /* + * Should not happen, we program the threshold at 1 and do not + * set a reset value. + */ + WARN_ONCE(n > 1, "bad leftover pebs %d\n", n); + at += n - 1; + + __intel_pmu_pebs_event(event, iregs, at); +} + +static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct perf_event *event = NULL; + void *at, *top; + u64 status = 0; + int bit, n; + + if (!x86_pmu.pebs_active) + return; + + at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; + top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; + + ds->pebs_index = ds->pebs_buffer_base; + + n = (top - at) / x86_pmu.pebs_record_size; + if (n <= 0) + return; + + /* + * Should not happen, we program the threshold at 1 and do not + * set a reset value. + */ + WARN_ONCE(n > x86_pmu.max_pebs_events, + "Unexpected number of pebs records %d\n", n); + + for (; at < top; at += x86_pmu.pebs_record_size) { + struct pebs_record_nhm *p = at; + + for_each_set_bit(bit, (unsigned long *)&p->status, + x86_pmu.max_pebs_events) { + event = cpuc->events[bit]; + if (!test_bit(bit, cpuc->active_mask)) + continue; + + WARN_ON_ONCE(!event); + + if (!event->attr.precise_ip) + continue; + + if (__test_and_set_bit(bit, (unsigned long *)&status)) + continue; + + break; + } + + if (!event || bit >= x86_pmu.max_pebs_events) + continue; + + __intel_pmu_pebs_event(event, iregs, at); + } +} + +/* + * BTS, PEBS probe and setup + */ + +void intel_ds_init(void) +{ + /* + * No support for 32bit formats + */ + if (!boot_cpu_has(X86_FEATURE_DTES64)) + return; + + x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); + x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); + if (x86_pmu.pebs) { + char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; + int format = x86_pmu.intel_cap.pebs_format; + + switch (format) { + case 0: + printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); + x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; + break; + + case 1: + printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); + x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; + break; + + case 2: + pr_cont("PEBS fmt2%c, ", pebs_type); + x86_pmu.pebs_record_size = sizeof(struct pebs_record_hsw); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; + break; + + default: + printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); + x86_pmu.pebs = 0; + } + } +} + +void perf_restore_debug_store(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct debug_store *ds = cpuc->ds; + + if (!x86_pmu.bts && !x86_pmu.pebs) + return; + + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ds); +} diff -uprN linux-2.6.32/arch/x86/kernel/cpu/perf_event_intel_lbr.c linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_intel_lbr.c --- linux-2.6.32/arch/x86/kernel/cpu/perf_event_intel_lbr.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_intel_lbr.c 2015-03-10 13:24:08.000000000 -0700 @@ -0,0 +1,773 @@ +#include +#include + +#include +#include +#include + +#include "perf_event.h" + +enum { + LBR_FORMAT_32 = 0x00, + LBR_FORMAT_LIP = 0x01, + LBR_FORMAT_EIP = 0x02, + LBR_FORMAT_EIP_FLAGS = 0x03, + LBR_FORMAT_EIP_FLAGS2 = 0x04, + LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_EIP_FLAGS2, +}; + +static enum { + LBR_EIP_FLAGS = 1, + LBR_TSX = 2, +} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = { + [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS, + [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX, +}; + +/* + * Intel LBR_SELECT bits + * Intel Vol3a, April 2011, Section 16.7 Table 16-10 + * + * Hardware branch filter (not available on all CPUs) + */ +#define LBR_KERNEL_BIT 0 /* do not capture at ring0 */ +#define LBR_USER_BIT 1 /* do not capture at ring > 0 */ +#define LBR_JCC_BIT 2 /* do not capture conditional branches */ +#define LBR_REL_CALL_BIT 3 /* do not capture relative calls */ +#define LBR_IND_CALL_BIT 4 /* do not capture indirect calls */ +#define LBR_RETURN_BIT 5 /* do not capture near returns */ +#define LBR_IND_JMP_BIT 6 /* do not capture indirect jumps */ +#define LBR_REL_JMP_BIT 7 /* do not capture relative jumps */ +#define LBR_FAR_BIT 8 /* do not capture far branches */ + +#define LBR_KERNEL (1 << LBR_KERNEL_BIT) +#define LBR_USER (1 << LBR_USER_BIT) +#define LBR_JCC (1 << LBR_JCC_BIT) +#define LBR_REL_CALL (1 << LBR_REL_CALL_BIT) +#define LBR_IND_CALL (1 << LBR_IND_CALL_BIT) +#define LBR_RETURN (1 << LBR_RETURN_BIT) +#define LBR_REL_JMP (1 << LBR_REL_JMP_BIT) +#define LBR_IND_JMP (1 << LBR_IND_JMP_BIT) +#define LBR_FAR (1 << LBR_FAR_BIT) + +#define LBR_PLM (LBR_KERNEL | LBR_USER) + +#define LBR_SEL_MASK 0x1ff /* valid bits in LBR_SELECT */ +#define LBR_NOT_SUPP -1 /* LBR filter not supported */ +#define LBR_IGN 0 /* ignored */ + +#define LBR_ANY \ + (LBR_JCC |\ + LBR_REL_CALL |\ + LBR_IND_CALL |\ + LBR_RETURN |\ + LBR_REL_JMP |\ + LBR_IND_JMP |\ + LBR_FAR) + +#define LBR_FROM_FLAG_MISPRED (1ULL << 63) +#define LBR_FROM_FLAG_IN_TX (1ULL << 62) +#define LBR_FROM_FLAG_ABORT (1ULL << 61) + +#define for_each_branch_sample_type(x) \ + for ((x) = PERF_SAMPLE_BRANCH_USER; \ + (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1) + +/* + * x86control flow change classification + * x86control flow changes include branches, interrupts, traps, faults + */ +enum { + X86_BR_NONE = 0, /* unknown */ + + X86_BR_USER = 1 << 0, /* branch target is user */ + X86_BR_KERNEL = 1 << 1, /* branch target is kernel */ + + X86_BR_CALL = 1 << 2, /* call */ + X86_BR_RET = 1 << 3, /* return */ + X86_BR_SYSCALL = 1 << 4, /* syscall */ + X86_BR_SYSRET = 1 << 5, /* syscall return */ + X86_BR_INT = 1 << 6, /* sw interrupt */ + X86_BR_IRET = 1 << 7, /* return from interrupt */ + X86_BR_JCC = 1 << 8, /* conditional */ + X86_BR_JMP = 1 << 9, /* jump */ + X86_BR_IRQ = 1 << 10,/* hw interrupt or trap or fault */ + X86_BR_IND_CALL = 1 << 11,/* indirect calls */ + X86_BR_ABORT = 1 << 12,/* transaction abort */ + X86_BR_IN_TX = 1 << 13,/* in transaction */ + X86_BR_NO_TX = 1 << 14,/* not in transaction */ +}; + +#define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) +#define X86_BR_ANYTX (X86_BR_NO_TX | X86_BR_IN_TX) + +#define X86_BR_ANY \ + (X86_BR_CALL |\ + X86_BR_RET |\ + X86_BR_SYSCALL |\ + X86_BR_SYSRET |\ + X86_BR_INT |\ + X86_BR_IRET |\ + X86_BR_JCC |\ + X86_BR_JMP |\ + X86_BR_IRQ |\ + X86_BR_ABORT |\ + X86_BR_IND_CALL) + +#define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) + +#define X86_BR_ANY_CALL \ + (X86_BR_CALL |\ + X86_BR_IND_CALL |\ + X86_BR_SYSCALL |\ + X86_BR_IRQ |\ + X86_BR_INT) + +static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc); + +/* + * We only support LBR implementations that have FREEZE_LBRS_ON_PMI + * otherwise it becomes near impossible to get a reliable stack. + */ + +static void __intel_pmu_lbr_enable(void) +{ + u64 debugctl; + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->lbr_sel) + wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config); + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); +} + +static void __intel_pmu_lbr_disable(void) +{ + u64 debugctl; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); +} + +static void intel_pmu_lbr_reset_32(void) +{ + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) + wrmsrl(x86_pmu.lbr_from + i, 0); +} + +static void intel_pmu_lbr_reset_64(void) +{ + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + wrmsrl(x86_pmu.lbr_from + i, 0); + wrmsrl(x86_pmu.lbr_to + i, 0); + } +} + +void intel_pmu_lbr_reset(void) +{ + if (!x86_pmu.lbr_nr) + return; + + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + intel_pmu_lbr_reset_32(); + else + intel_pmu_lbr_reset_64(); +} + +void intel_pmu_lbr_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (!x86_pmu.lbr_nr) + return; + + /* + * Reset the LBR stack if we changed task context to + * avoid data leaks. + */ + if (event->ctx->task && cpuc->lbr_context != event->ctx) { + intel_pmu_lbr_reset(); + cpuc->lbr_context = event->ctx; + } + cpuc->br_sel = event->hw.branch_reg.reg; + + cpuc->lbr_users++; +} + +void intel_pmu_lbr_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (!x86_pmu.lbr_nr) + return; + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + + if (cpuc->enabled && !cpuc->lbr_users) { + __intel_pmu_lbr_disable(); + /* avoid stale pointer */ + cpuc->lbr_context = NULL; + } +} + +void intel_pmu_lbr_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->lbr_users) + __intel_pmu_lbr_enable(); +} + +void intel_pmu_lbr_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->lbr_users) + __intel_pmu_lbr_disable(); +} + +/* + * TOS = most recently recorded branch + */ +static inline u64 intel_pmu_lbr_tos(void) +{ + u64 tos; + + rdmsrl(x86_pmu.lbr_tos, tos); + + return tos; +} + +static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) +{ + unsigned long mask = x86_pmu.lbr_nr - 1; + u64 tos = intel_pmu_lbr_tos(); + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + unsigned long lbr_idx = (tos - i) & mask; + union { + struct { + u32 from; + u32 to; + }; + u64 lbr; + } msr_lastbranch; + + rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); + + cpuc->lbr_entries[i].from = msr_lastbranch.from; + cpuc->lbr_entries[i].to = msr_lastbranch.to; + cpuc->lbr_entries[i].mispred = 0; + cpuc->lbr_entries[i].predicted = 0; + cpuc->lbr_entries[i].reserved = 0; + } + cpuc->lbr_stack.nr = i; +} + +/* + * Due to lack of segmentation in Linux the effective address (offset) + * is the same as the linear address, allowing us to merge the LIP and EIP + * LBR formats. + */ +static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) +{ + unsigned long mask = x86_pmu.lbr_nr - 1; + int lbr_format = x86_pmu.intel_cap.lbr_format; + u64 tos = intel_pmu_lbr_tos(); + int i; + int out = 0; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + unsigned long lbr_idx = (tos - i) & mask; + u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0; + int skip = 0; + int lbr_flags = lbr_desc[lbr_format]; + + rdmsrl(x86_pmu.lbr_from + lbr_idx, from); + rdmsrl(x86_pmu.lbr_to + lbr_idx, to); + + if (lbr_flags & LBR_EIP_FLAGS) { + mis = !!(from & LBR_FROM_FLAG_MISPRED); + pred = !mis; + skip = 1; + } + if (lbr_flags & LBR_TSX) { + in_tx = !!(from & LBR_FROM_FLAG_IN_TX); + abort = !!(from & LBR_FROM_FLAG_ABORT); + skip = 3; + } + from = (u64)((((s64)from) << skip) >> skip); + + /* + * Some CPUs report duplicated abort records, + * with the second entry not having an abort bit set. + * Skip them here. This loop runs backwards, + * so we need to undo the previous record. + * If the abort just happened outside the window + * the extra entry cannot be removed. + */ + if (abort && x86_pmu.lbr_double_abort && out > 0) + out--; + + cpuc->lbr_entries[out].from = from; + cpuc->lbr_entries[out].to = to; + cpuc->lbr_entries[out].mispred = mis; + cpuc->lbr_entries[out].predicted = pred; + cpuc->lbr_entries[out].in_tx = in_tx; + cpuc->lbr_entries[out].abort = abort; + cpuc->lbr_entries[out].reserved = 0; + out++; + } + cpuc->lbr_stack.nr = out; +} + +void intel_pmu_lbr_read(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (!cpuc->lbr_users) + return; + + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + intel_pmu_lbr_read_32(cpuc); + else + intel_pmu_lbr_read_64(cpuc); + + intel_pmu_lbr_filter(cpuc); +} + +/* + * SW filter is used: + * - in case there is no HW filter + * - in case the HW filter has errata or limitations + */ +static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event) +{ + u64 br_type = event->attr.branch_sample_type; + int mask = 0; + + if (br_type & PERF_SAMPLE_BRANCH_USER) + mask |= X86_BR_USER; + + if (br_type & PERF_SAMPLE_BRANCH_KERNEL) + mask |= X86_BR_KERNEL; + + /* we ignore BRANCH_HV here */ + + if (br_type & PERF_SAMPLE_BRANCH_ANY) + mask |= X86_BR_ANY; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_CALL) + mask |= X86_BR_ANY_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ANY_RETURN) + mask |= X86_BR_RET | X86_BR_IRET | X86_BR_SYSRET; + + if (br_type & PERF_SAMPLE_BRANCH_IND_CALL) + mask |= X86_BR_IND_CALL; + + if (br_type & PERF_SAMPLE_BRANCH_ABORT_TX) + mask |= X86_BR_ABORT; + + if (br_type & PERF_SAMPLE_BRANCH_IN_TX) + mask |= X86_BR_IN_TX; + + if (br_type & PERF_SAMPLE_BRANCH_NO_TX) + mask |= X86_BR_NO_TX; + + /* + * stash actual user request into reg, it may + * be used by fixup code for some CPU + */ + event->hw.branch_reg.reg = mask; +} + +/* + * setup the HW LBR filter + * Used only when available, may not be enough to disambiguate + * all branches, may need the help of the SW filter + */ +static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event) +{ + struct hw_perf_event_extra *reg; + u64 br_type = event->attr.branch_sample_type; + u64 mask = 0, m; + u64 v; + + for_each_branch_sample_type(m) { + if (!(br_type & m)) + continue; + + v = x86_pmu.lbr_sel_map[m]; + if (v == LBR_NOT_SUPP) + return -EOPNOTSUPP; + + if (v != LBR_IGN) + mask |= v; + } + reg = &event->hw.branch_reg; + reg->idx = EXTRA_REG_LBR; + + /* LBR_SELECT operates in suppress mode so invert mask */ + reg->config = ~mask & x86_pmu.lbr_sel_mask; + + return 0; +} + +int intel_pmu_setup_lbr_filter(struct perf_event *event) +{ + int ret = 0; + + /* + * no LBR on this PMU + */ + if (!x86_pmu.lbr_nr) + return -EOPNOTSUPP; + + /* + * setup SW LBR filter + */ + intel_pmu_setup_sw_lbr_filter(event); + + /* + * setup HW LBR filter, if any + */ + if (x86_pmu.lbr_sel_map) + ret = intel_pmu_setup_hw_lbr_filter(event); + + return ret; +} + +/* + * return the type of control flow change at address "from" + * intruction is not necessarily a branch (in case of interrupt). + * + * The branch type returned also includes the priv level of the + * target of the control flow change (X86_BR_USER, X86_BR_KERNEL). + * + * If a branch type is unknown OR the instruction cannot be + * decoded (e.g., text page not present), then X86_BR_NONE is + * returned. + */ +static int branch_type(unsigned long from, unsigned long to, int abort) +{ + struct insn insn; + void *addr; + int bytes, size = MAX_INSN_SIZE; + int ret = X86_BR_NONE; + int ext, to_plm, from_plm; + u8 buf[MAX_INSN_SIZE]; + int is64 = 0; + + to_plm = kernel_ip(to) ? X86_BR_KERNEL : X86_BR_USER; + from_plm = kernel_ip(from) ? X86_BR_KERNEL : X86_BR_USER; + + /* + * maybe zero if lbr did not fill up after a reset by the time + * we get a PMU interrupt + */ + if (from == 0 || to == 0) + return X86_BR_NONE; + + if (abort) + return X86_BR_ABORT | to_plm; + + if (from_plm == X86_BR_USER) { + /* + * can happen if measuring at the user level only + * and we interrupt in a kernel thread, e.g., idle. + */ + if (!current->mm) + return X86_BR_NONE; + + /* may fail if text not present */ + bytes = copy_from_user_nmi(buf, (void __user *)from, size); + if (bytes != size) + return X86_BR_NONE; + + addr = buf; + } else { + /* + * The LBR logs any address in the IP, even if the IP just + * faulted. This means userspace can control the from address. + * Ensure we don't blindy read any address by validating it is + * a known text address. + */ + if (kernel_text_address(from)) + addr = (void *)from; + else + return X86_BR_NONE; + } + + /* + * decoder needs to know the ABI especially + * on 64-bit systems running 32-bit apps + */ +#ifdef CONFIG_X86_64 + is64 = kernel_ip((unsigned long)addr) || !test_thread_flag(TIF_IA32); +#endif + insn_init(&insn, addr, is64); + insn_get_opcode(&insn); + + switch (insn.opcode.bytes[0]) { + case 0xf: + switch (insn.opcode.bytes[1]) { + case 0x05: /* syscall */ + case 0x34: /* sysenter */ + ret = X86_BR_SYSCALL; + break; + case 0x07: /* sysret */ + case 0x35: /* sysexit */ + ret = X86_BR_SYSRET; + break; + case 0x80 ... 0x8f: /* conditional */ + ret = X86_BR_JCC; + break; + default: + ret = X86_BR_NONE; + } + break; + case 0x70 ... 0x7f: /* conditional */ + ret = X86_BR_JCC; + break; + case 0xc2: /* near ret */ + case 0xc3: /* near ret */ + case 0xca: /* far ret */ + case 0xcb: /* far ret */ + ret = X86_BR_RET; + break; + case 0xcf: /* iret */ + ret = X86_BR_IRET; + break; + case 0xcc ... 0xce: /* int */ + ret = X86_BR_INT; + break; + case 0xe8: /* call near rel */ + case 0x9a: /* call far absolute */ + ret = X86_BR_CALL; + break; + case 0xe0 ... 0xe3: /* loop jmp */ + ret = X86_BR_JCC; + break; + case 0xe9 ... 0xeb: /* jmp */ + ret = X86_BR_JMP; + break; + case 0xff: /* call near absolute, call far absolute ind */ + insn_get_modrm(&insn); + ext = (insn.modrm.bytes[0] >> 3) & 0x7; + switch (ext) { + case 2: /* near ind call */ + case 3: /* far ind call */ + ret = X86_BR_IND_CALL; + break; + case 4: + case 5: + ret = X86_BR_JMP; + break; + } + break; + default: + ret = X86_BR_NONE; + } + /* + * interrupts, traps, faults (and thus ring transition) may + * occur on any instructions. Thus, to classify them correctly, + * we need to first look at the from and to priv levels. If they + * are different and to is in the kernel, then it indicates + * a ring transition. If the from instruction is not a ring + * transition instr (syscall, systenter, int), then it means + * it was a irq, trap or fault. + * + * we have no way of detecting kernel to kernel faults. + */ + if (from_plm == X86_BR_USER && to_plm == X86_BR_KERNEL + && ret != X86_BR_SYSCALL && ret != X86_BR_INT) + ret = X86_BR_IRQ; + + /* + * branch priv level determined by target as + * is done by HW when LBR_SELECT is implemented + */ + if (ret != X86_BR_NONE) + ret |= to_plm; + + return ret; +} + +/* + * implement actual branch filter based on user demand. + * Hardware may not exactly satisfy that request, thus + * we need to inspect opcodes. Mismatched branches are + * discarded. Therefore, the number of branches returned + * in PERF_SAMPLE_BRANCH_STACK sample may vary. + */ +static void +intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) +{ + u64 from, to; + int br_sel = cpuc->br_sel; + int i, j, type; + bool compress = false; + + /* if sampling all branches, then nothing to filter */ + if ((br_sel & X86_BR_ALL) == X86_BR_ALL) + return; + + for (i = 0; i < cpuc->lbr_stack.nr; i++) { + + from = cpuc->lbr_entries[i].from; + to = cpuc->lbr_entries[i].to; + + type = branch_type(from, to, cpuc->lbr_entries[i].abort); + if (type != X86_BR_NONE && (br_sel & X86_BR_ANYTX)) { + if (cpuc->lbr_entries[i].in_tx) + type |= X86_BR_IN_TX; + else + type |= X86_BR_NO_TX; + } + + /* if type does not correspond, then discard */ + if (type == X86_BR_NONE || (br_sel & type) != type) { + cpuc->lbr_entries[i].from = 0; + compress = true; + } + } + + if (!compress) + return; + + /* remove all entries with from=0 */ + for (i = 0; i < cpuc->lbr_stack.nr; ) { + if (!cpuc->lbr_entries[i].from) { + j = i; + while (++j < cpuc->lbr_stack.nr) + cpuc->lbr_entries[j-1] = cpuc->lbr_entries[j]; + cpuc->lbr_stack.nr--; + if (!cpuc->lbr_entries[i].from) + continue; + } + i++; + } +} + +/* + * Map interface branch filters onto LBR filters + */ +static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { + [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_REL_JMP + | LBR_IND_JMP | LBR_FAR, + /* + * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches + */ + [PERF_SAMPLE_BRANCH_ANY_CALL] = + LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR, + /* + * NHM/WSM erratum: must include IND_JMP to capture IND_CALL + */ + [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP, +}; + +static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = { + [PERF_SAMPLE_BRANCH_ANY] = LBR_ANY, + [PERF_SAMPLE_BRANCH_USER] = LBR_USER, + [PERF_SAMPLE_BRANCH_KERNEL] = LBR_KERNEL, + [PERF_SAMPLE_BRANCH_HV] = LBR_IGN, + [PERF_SAMPLE_BRANCH_ANY_RETURN] = LBR_RETURN | LBR_FAR, + [PERF_SAMPLE_BRANCH_ANY_CALL] = LBR_REL_CALL | LBR_IND_CALL + | LBR_FAR, + [PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL, +}; + +/* core */ +void intel_pmu_lbr_init_core(void) +{ + x86_pmu.lbr_nr = 4; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; + + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ + pr_cont("4-deep LBR, "); +} + +/* nehalem/westmere */ +void intel_pmu_lbr_init_nhm(void) +{ + x86_pmu.lbr_nr = 16; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = nhm_lbr_sel_map; + + /* + * SW branch filter usage: + * - workaround LBR_SEL errata (see above) + * - support syscall, sysret capture. + * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ + pr_cont("16-deep LBR, "); +} + +/* sandy bridge */ +void intel_pmu_lbr_init_snb(void) +{ + x86_pmu.lbr_nr = 16; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_NHM_FROM; + x86_pmu.lbr_to = MSR_LBR_NHM_TO; + + x86_pmu.lbr_sel_mask = LBR_SEL_MASK; + x86_pmu.lbr_sel_map = snb_lbr_sel_map; + + /* + * SW branch filter usage: + * - support syscall, sysret capture. + * That requires LBR_FAR but that means far + * jmp need to be filtered out + */ + pr_cont("16-deep LBR, "); +} + +/* atom */ +void intel_pmu_lbr_init_atom(void) +{ + /* + * only models starting at stepping 10 seems + * to have an operational LBR which can freeze + * on PMU interrupt + */ + if (boot_cpu_data.x86_mask < 10) { + pr_cont("LBR disabled due to erratum"); + return; + } + + x86_pmu.lbr_nr = 8; + x86_pmu.lbr_tos = MSR_LBR_TOS; + x86_pmu.lbr_from = MSR_LBR_CORE_FROM; + x86_pmu.lbr_to = MSR_LBR_CORE_TO; + + /* + * SW branch filter usage: + * - compensate for lack of HW filter + */ + pr_cont("8-deep LBR, "); +} diff -uprN linux-2.6.32/arch/x86/kernel/cpu/perf_event_intel_uncore.c linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_intel_uncore.c --- linux-2.6.32/arch/x86/kernel/cpu/perf_event_intel_uncore.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_intel_uncore.c 2015-03-10 13:24:07.000000000 -0700 @@ -0,0 +1,2998 @@ +#include "perf_event_intel_uncore.h" + +static struct intel_uncore_type *empty_uncore[] = { NULL, }; +static struct intel_uncore_type **msr_uncores = empty_uncore; +static struct intel_uncore_type **pci_uncores = empty_uncore; +/* pci bus to socket mapping */ +static int pcibus_to_physid[256] = { [0 ... 255] = -1, }; + +static DEFINE_SPINLOCK(uncore_box_lock); + +/* mask of cpus that collect uncore events */ +static cpumask_t uncore_cpu_mask; + +/* constraint for the fixed counter */ +static struct event_constraint constraint_fixed = + EVENT_CONSTRAINT(~0ULL, 1 << UNCORE_PMC_IDX_FIXED, ~0ULL); +static struct event_constraint constraint_empty = + EVENT_CONSTRAINT(0, 0, 0); + +DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); +DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); +DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); +DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(cmask8, cmask, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); +DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); +DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); +DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); +DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); +DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band0, filter_band0, "config1:0-7"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band1, filter_band1, "config1:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band2, filter_band2, "config1:16-23"); +DEFINE_UNCORE_FORMAT_ATTR(filter_band3, filter_band3, "config1:24-31"); + +static u64 uncore_msr_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + u64 count; + + rdmsrl(event->hw.event_base, count); + + return count; +} + +/* + * generic get constraint function for shared match/mask registers. + */ +static struct event_constraint * +uncore_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + unsigned long flags; + bool ok = false; + + /* + * reg->alloc can be set due to existing state, so for fake box we + * need to ignore this, otherwise we might fail to allocate proper + * fake state for this extra reg constraint. + */ + if (reg1->idx == EXTRA_REG_NONE || + (!uncore_box_is_fake(box) && reg1->alloc)) + return NULL; + + er = &box->shared_regs[reg1->idx]; + spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || + (er->config1 == reg1->config && er->config2 == reg2->config)) { + atomic_inc(&er->ref); + er->config1 = reg1->config; + er->config2 = reg2->config; + ok = true; + } + spin_unlock_irqrestore(&er->lock, flags); + + if (ok) { + if (!uncore_box_is_fake(box)) + reg1->alloc = 1; + return NULL; + } + + return &constraint_empty; +} + +static void uncore_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + + /* + * Only put constraint if extra reg was actually allocated. Also + * takes care of event which do not use an extra shared reg. + * + * Also, if this is a fake box we shouldn't touch any event state + * (reg->alloc) and we don't care about leaving inconsistent box + * state either since it will be thrown out. + */ + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + er = &box->shared_regs[reg1->idx]; + atomic_dec(&er->ref); + reg1->alloc = 0; +} + +/* Sandy Bridge-EP uncore support */ +static struct intel_uncore_type snbep_uncore_cbox; +static struct intel_uncore_type snbep_uncore_pcu; + +static void snbep_uncore_pci_disable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config; + + pci_read_config_dword(pdev, box_ctl, &config); + config |= SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); +} + +static void snbep_uncore_pci_enable_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + int box_ctl = uncore_pci_box_ctl(box); + u32 config; + + pci_read_config_dword(pdev, box_ctl, &config); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + pci_write_config_dword(pdev, box_ctl, config); +} + +static void snbep_uncore_pci_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_pci_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + + pci_write_config_dword(pdev, hwc->config_base, hwc->config); +} + +static u64 snbep_uncore_pci_read_counter(struct intel_uncore_box *box, struct perf_event *event) +{ + struct pci_dev *pdev = box->pci_dev; + struct hw_perf_event *hwc = &event->hw; + u64 count; + + pci_read_config_dword(pdev, hwc->event_base, (u32 *)&count); + pci_read_config_dword(pdev, hwc->event_base + 4, (u32 *)&count + 1); + + return count; +} + +static void snbep_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + + pci_write_config_dword(pdev, SNBEP_PCI_PMON_BOX_CTL, SNBEP_PMON_BOX_CTL_INT); +} + +static void snbep_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config |= SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + } +} + +static void snbep_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + u64 config; + unsigned msr; + + msr = uncore_msr_box_ctl(box); + if (msr) { + rdmsrl(msr, config); + config &= ~SNBEP_PMON_BOX_CTL_FRZ; + wrmsrl(msr, config); + } +} + +static void snbep_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, reg1->config); + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static void snbep_uncore_msr_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (reg1->idx != EXTRA_REG_NONE) + wrmsrl(reg1->reg, reg1->config); + + wrmsrl(hwc->config_base, hwc->config); +} + +static void snbep_uncore_msr_init_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + + if (msr) + wrmsrl(msr, SNBEP_PMON_BOX_CTL_INT); +} + +static int snbep_uncore_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + + if (box->pmu->type == &snbep_uncore_cbox) { + reg1->reg = SNBEP_C0_MSR_PMON_BOX_FILTER + + SNBEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & + SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK; + } else { + if (box->pmu->type == &snbep_uncore_pcu) { + reg1->reg = SNBEP_PCU_MSR_PMON_BOX_FILTER; + reg1->config = event->attr.config1 & SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK; + } else { + return 0; + } + } + reg1->idx = 0; + + return 0; +} +#if 0 +static struct event_constraint * +snbep_uncore_get_constraint(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + unsigned long flags; + bool ok = false; + + if (reg1->idx == EXTRA_REG_NONE || (box->phys_id >= 0 && reg1->alloc)) + return NULL; + + er = &box->shared_regs[reg1->idx]; + raw_spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || er->config1 == reg1->config) { + atomic_inc(&er->ref); + er->config1 = reg1->config; + ok = true; + } + raw_spin_unlock_irqrestore(&er->lock, flags); + + if (ok) { + if (box->phys_id >= 0) + reg1->alloc = 1; + return NULL; + } + return &constraint_empty; +} + +static void snbep_uncore_put_constraint(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + + if (box->phys_id < 0 || !reg1->alloc) + return; + + er = &box->shared_regs[reg1->idx]; + atomic_dec(&er->ref); + reg1->alloc = 0; +} +#endif + +static struct attribute *snbep_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute *snbep_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + NULL, +}; + +static struct attribute *snbep_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid.attr, + &format_attr_filter_nid.attr, + &format_attr_filter_state.attr, + &format_attr_filter_opc.attr, + NULL, +}; + +static struct attribute *snbep_uncore_pcu_formats_attr[] = { + &format_attr_event.attr, + &format_attr_occ_sel.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh5.attr, + &format_attr_occ_invert.attr, + &format_attr_occ_edge.attr, + &format_attr_filter_band0.attr, + &format_attr_filter_band1.attr, + &format_attr_filter_band2.attr, + &format_attr_filter_band3.attr, + NULL, +}; + +static struct attribute *snbep_uncore_qpi_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct uncore_event_desc snbep_uncore_imc_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(cas_count_read, "event=0x04,umask=0x03"), + INTEL_UNCORE_EVENT_DESC(cas_count_write, "event=0x04,umask=0x0c"), + { /* end: all zeroes */ }, +}; + +static struct uncore_event_desc snbep_uncore_qpi_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0x14"), + INTEL_UNCORE_EVENT_DESC(txl_flits_active, "event=0x00,umask=0x06"), + INTEL_UNCORE_EVENT_DESC(drs_data, "event=0x02,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(ncb_data, "event=0x03,umask=0x04"), + { /* end: all zeroes */ }, +}; + +static struct attribute_group snbep_uncore_format_group = { + .name = "format", + .attrs = snbep_uncore_formats_attr, +}; + +static struct attribute_group snbep_uncore_ubox_format_group = { + .name = "format", + .attrs = snbep_uncore_ubox_formats_attr, +}; + +static struct attribute_group snbep_uncore_cbox_format_group = { + .name = "format", + .attrs = snbep_uncore_cbox_formats_attr, +}; + +static struct attribute_group snbep_uncore_pcu_format_group = { + .name = "format", + .attrs = snbep_uncore_pcu_formats_attr, +}; + +static struct attribute_group snbep_uncore_qpi_format_group = { + .name = "format", + .attrs = snbep_uncore_qpi_formats_attr, +}; + +static struct intel_uncore_ops snbep_uncore_msr_ops = { + .init_box = snbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = snbep_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, + .hw_config = snbep_uncore_hw_config, +}; + +static struct intel_uncore_ops snbep_uncore_pci_ops = { + .init_box = snbep_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +static struct event_constraint snbep_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x01, 0x1), + UNCORE_EVENT_CONSTRAINT(0x02, 0x3), + UNCORE_EVENT_CONSTRAINT(0x04, 0x3), + UNCORE_EVENT_CONSTRAINT(0x05, 0x3), + UNCORE_EVENT_CONSTRAINT(0x07, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x3), + UNCORE_EVENT_CONSTRAINT(0x1b, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1c, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1d, 0xc), + UNCORE_EVENT_CONSTRAINT(0x1e, 0xc), + EVENT_CONSTRAINT_OVERLAP(0x1f, 0xe, 0xff), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x35, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + UNCORE_EVENT_CONSTRAINT(0x38, 0x3), + UNCORE_EVENT_CONSTRAINT(0x39, 0x3), + UNCORE_EVENT_CONSTRAINT(0x3b, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x1), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct event_constraint snbep_uncore_r3qpi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x10, 0x3), + UNCORE_EVENT_CONSTRAINT(0x11, 0x3), + UNCORE_EVENT_CONSTRAINT(0x12, 0x3), + UNCORE_EVENT_CONSTRAINT(0x13, 0x1), + UNCORE_EVENT_CONSTRAINT(0x20, 0x3), + UNCORE_EVENT_CONSTRAINT(0x21, 0x3), + UNCORE_EVENT_CONSTRAINT(0x22, 0x3), + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + UNCORE_EVENT_CONSTRAINT(0x24, 0x3), + UNCORE_EVENT_CONSTRAINT(0x25, 0x3), + UNCORE_EVENT_CONSTRAINT(0x26, 0x3), + UNCORE_EVENT_CONSTRAINT(0x30, 0x3), + UNCORE_EVENT_CONSTRAINT(0x31, 0x3), + UNCORE_EVENT_CONSTRAINT(0x32, 0x3), + UNCORE_EVENT_CONSTRAINT(0x33, 0x3), + UNCORE_EVENT_CONSTRAINT(0x34, 0x3), + UNCORE_EVENT_CONSTRAINT(0x36, 0x3), + UNCORE_EVENT_CONSTRAINT(0x37, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snbep_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNBEP_U_MSR_PMON_CTR0, + .event_ctl = SNBEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = SNBEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = SNBEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_ubox_format_group, +}; + +static struct intel_uncore_type snbep_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .num_boxes = 8, + .perf_ctr_bits = 44, + .event_ctl = SNBEP_C0_MSR_PMON_CTL0, + .perf_ctr = SNBEP_C0_MSR_PMON_CTR0, + .event_mask = SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = SNBEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = snbep_uncore_cbox_constraints, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_cbox_format_group, +}; + +static struct intel_uncore_type snbep_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCU_MSR_PMON_CTR0, + .event_ctl = SNBEP_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &snbep_uncore_msr_ops, + .format_group = &snbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *snbep_msr_uncores[] = { + &snbep_uncore_ubox, + &snbep_uncore_cbox, + &snbep_uncore_pcu, + NULL, +}; + +#define SNBEP_UNCORE_PCI_COMMON_INIT() \ + .perf_ctr = SNBEP_PCI_PMON_CTR0, \ + .event_ctl = SNBEP_PCI_PMON_CTL0, \ + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, \ + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, \ + .ops = &snbep_uncore_pci_ops, \ + .format_group = &snbep_uncore_format_group + +static struct intel_uncore_type snbep_uncore_ha = { + .name = "ha", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = snbep_uncore_imc_events, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_qpi = { + .name = "qpi", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &snbep_uncore_pci_ops, + .event_descs = snbep_uncore_qpi_events, + .format_group = &snbep_uncore_qpi_format_group, +}; + + +static struct intel_uncore_type snbep_uncore_r2pcie = { + .name = "r2pcie", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r2pcie_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type snbep_uncore_r3qpi = { + .name = "r3qpi", + .num_counters = 3, + .num_boxes = 2, + .perf_ctr_bits = 44, + .constraints = snbep_uncore_r3qpi_constraints, + SNBEP_UNCORE_PCI_COMMON_INIT(), +}; + +static struct intel_uncore_type *snbep_pci_uncores[] = { + &snbep_uncore_ha, + &snbep_uncore_imc, + &snbep_uncore_qpi, + &snbep_uncore_r2pcie, + &snbep_uncore_r3qpi, + NULL, +}; + +static DEFINE_PCI_DEVICE_TABLE(snbep_uncore_pci_ids) = { + { /* Home Agent */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_HA), + .driver_data = (unsigned long)&snbep_uncore_ha, + }, + { /* MC Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC0), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC1), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC2), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* MC Channel 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_IMC3), + .driver_data = (unsigned long)&snbep_uncore_imc, + }, + { /* QPI Port 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI0), + .driver_data = (unsigned long)&snbep_uncore_qpi, + }, + { /* QPI Port 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_QPI1), + .driver_data = (unsigned long)&snbep_uncore_qpi, + }, + { /* P2PCIe */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R2PCIE), + .driver_data = (unsigned long)&snbep_uncore_r2pcie, + }, + { /* R3QPI Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI0), + .driver_data = (unsigned long)&snbep_uncore_r3qpi, + }, + { /* R3QPI Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_UNC_R3QPI1), + .driver_data = (unsigned long)&snbep_uncore_r3qpi, + }, + { /* end: all zeroes */ } +}; + +static struct pci_driver snbep_uncore_pci_driver = { + .name = "snbep_uncore", + .id_table = snbep_uncore_pci_ids, +}; + +/* + * build pci bus to socket mapping + */ +static void snbep_pci2phy_map_init(void) +{ + struct pci_dev *ubox_dev = NULL; + int i, bus, nodeid; + u32 config; + + while (1) { + /* find the UBOX device */ + ubox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_JAKETOWN_UBOX, + ubox_dev); + if (!ubox_dev) + break; + bus = ubox_dev->bus->number; + /* get the Node ID of the local register */ + pci_read_config_dword(ubox_dev, 0x40, &config); + nodeid = config; + /* get the Node ID mapping */ + pci_read_config_dword(ubox_dev, 0x54, &config); + /* + * every three bits in the Node ID mapping register maps + * to a particular node. + */ + for (i = 0; i < 8; i++) { + if (nodeid == ((config >> (3 * i)) & 0x7)) { + pcibus_to_physid[bus] = i; + break; + } + } + }; + return; +} +/* end of Sandy Bridge-EP uncore support */ + +/* Sandy Bridge uncore support */ +static void snb_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, SNB_UNC_CTL_EN); +} + +static void snb_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static void snb_uncore_msr_init_box(struct intel_uncore_box *box) +{ + if (box->pmu->pmu_idx == 0) { + wrmsrl(SNB_UNC_PERF_GLOBAL_CTL, + SNB_UNC_GLOBAL_CTL_EN | SNB_UNC_GLOBAL_CTL_CORE_ALL); + } +} + +static struct uncore_event_desc snb_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + { /* end: all zeroes */ }, +}; + +static struct attribute *snb_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask5.attr, + NULL, +}; + +static struct attribute_group snb_uncore_format_group = { + .name = "format", + .attrs = snb_uncore_formats_attr, +}; + +static struct intel_uncore_ops snb_uncore_msr_ops = { + .init_box = snb_uncore_msr_init_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = snb_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct event_constraint snb_uncore_cbox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x80, 0x1), + UNCORE_EVENT_CONSTRAINT(0x83, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type snb_uncore_cbox = { + .name = "cbox", + .num_counters = 2, + .num_boxes = 4, + .perf_ctr_bits = 44, + .fixed_ctr_bits = 48, + .perf_ctr = SNB_UNC_CBO_0_PER_CTR0, + .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, + .fixed_ctr = SNB_UNC_FIXED_CTR, + .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = SNB_UNC_CBO_MSR_OFFSET, + .constraints = snb_uncore_cbox_constraints, + .ops = &snb_uncore_msr_ops, + .format_group = &snb_uncore_format_group, + .event_descs = snb_uncore_events, +}; + +static struct intel_uncore_type *snb_msr_uncores[] = { + &snb_uncore_cbox, + NULL, +}; +/* end of Sandy Bridge uncore support */ + +/* Nehalem uncore support */ +static void nhm_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, 0); +} + +static void nhm_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + wrmsrl(NHM_UNC_PERF_GLOBAL_CTL, NHM_UNC_GLOBAL_CTL_EN_PC_ALL | NHM_UNC_GLOBAL_CTL_EN_FC); +} + +static void nhm_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx < UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, hwc->config | SNB_UNC_CTL_EN); + else + wrmsrl(hwc->config_base, NHM_UNC_FIXED_CTR_CTL_EN); +} + +static struct attribute *nhm_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_cmask8.attr, + NULL, +}; + +static struct attribute_group nhm_uncore_format_group = { + .name = "format", + .attrs = nhm_uncore_formats_attr, +}; + +static struct uncore_event_desc nhm_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), + INTEL_UNCORE_EVENT_DESC(qmc_writes_full_any, "event=0x2f,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qmc_normal_reads_any, "event=0x2c,umask=0x0f"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_reads, "event=0x20,umask=0x01"), + INTEL_UNCORE_EVENT_DESC(qhl_request_ioh_writes, "event=0x20,umask=0x02"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_reads, "event=0x20,umask=0x04"), + INTEL_UNCORE_EVENT_DESC(qhl_request_remote_writes, "event=0x20,umask=0x08"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_reads, "event=0x20,umask=0x10"), + INTEL_UNCORE_EVENT_DESC(qhl_request_local_writes, "event=0x20,umask=0x20"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhm_uncore_msr_ops = { + .disable_box = nhm_uncore_msr_disable_box, + .enable_box = nhm_uncore_msr_enable_box, + .disable_event = snb_uncore_msr_disable_event, + .enable_event = nhm_uncore_msr_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct intel_uncore_type nhm_uncore = { + .name = "", + .num_counters = 8, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .event_ctl = NHM_UNC_PERFEVTSEL0, + .perf_ctr = NHM_UNC_UNCORE_PMC0, + .fixed_ctr = NHM_UNC_FIXED_CTR, + .fixed_ctl = NHM_UNC_FIXED_CTR_CTRL, + .event_mask = NHM_UNC_RAW_EVENT_MASK, + .event_descs = nhm_uncore_events, + .ops = &nhm_uncore_msr_ops, + .format_group = &nhm_uncore_format_group, +}; + +static struct intel_uncore_type *nhm_msr_uncores[] = { + &nhm_uncore, + NULL, +}; +/* end of Nehalem uncore support */ + +/* Nehalem-EX uncore support */ +#define __BITS_VALUE(x, i, n) ((typeof(x))(((x) >> ((i) * (n))) & \ + ((1ULL << (n)) - 1))) + +DEFINE_UNCORE_FORMAT_ATTR(event5, event, "config:1-5"); +DEFINE_UNCORE_FORMAT_ATTR(counter, counter, "config:6-7"); +DEFINE_UNCORE_FORMAT_ATTR(mm_cfg, mm_cfg, "config:63"); +DEFINE_UNCORE_FORMAT_ATTR(match, match, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(mask, mask, "config2:0-63"); + +static void nhmex_uncore_msr_init_box(struct intel_uncore_box *box) +{ + wrmsrl(NHMEX_U_MSR_PMON_GLOBAL_CTL, NHMEX_U_PMON_GLOBAL_EN_ALL); +} + +static void nhmex_uncore_msr_disable_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + u64 config; + + if (msr) { + rdmsrl(msr, config); + config &= ~((1ULL << uncore_num_counters(box)) - 1); + /* WBox has a fixed counter */ + if (uncore_msr_fixed_ctl(box)) + config &= ~NHMEX_W_PMON_GLOBAL_FIXED_EN; + wrmsrl(msr, config); + } +} + +static void nhmex_uncore_msr_enable_box(struct intel_uncore_box *box) +{ + unsigned msr = uncore_msr_box_ctl(box); + u64 config; + + if (msr) { + rdmsrl(msr, config); + config |= (1ULL << uncore_num_counters(box)) - 1; + /* WBox has a fixed counter */ + if (uncore_msr_fixed_ctl(box)) + config |= NHMEX_W_PMON_GLOBAL_FIXED_EN; + wrmsrl(msr, config); + } +} + +static void nhmex_uncore_msr_disable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + wrmsrl(event->hw.config_base, 0); +} + +static void nhmex_uncore_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->idx >= UNCORE_PMC_IDX_FIXED) + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0); + else if (box->pmu->type->event_mask & NHMEX_PMON_CTL_EN_BIT0) + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); + else + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); +} + +#define NHMEX_UNCORE_OPS_COMMON_INIT() \ + .init_box = nhmex_uncore_msr_init_box, \ + .disable_box = nhmex_uncore_msr_disable_box, \ + .enable_box = nhmex_uncore_msr_enable_box, \ + .disable_event = nhmex_uncore_msr_disable_event, \ + .read_counter = uncore_msr_read_counter + +static struct intel_uncore_ops nhmex_uncore_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_uncore_msr_enable_event, +}; + +static struct attribute *nhmex_uncore_ubox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_edge.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_ubox_format_group = { + .name = "format", + .attrs = nhmex_uncore_ubox_formats_attr, +}; + +static struct intel_uncore_type nhmex_uncore_ubox = { + .name = "ubox", + .num_counters = 1, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_U_MSR_PMON_EV_SEL, + .perf_ctr = NHMEX_U_MSR_PMON_CTR, + .event_mask = NHMEX_U_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_U_MSR_PMON_GLOBAL_CTL, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_ubox_format_group +}; + +static struct attribute *nhmex_uncore_cbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_cbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_cbox_formats_attr, +}; + +/* msr offset for each instance of cbox */ +static unsigned nhmex_cbox_msr_offsets[] = { + 0x0, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x240, 0x2c0, +}; + +static struct intel_uncore_type nhmex_uncore_cbox = { + .name = "cbox", + .num_counters = 6, + .num_boxes = 10, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_C0_MSR_PMON_EV_SEL0, + .perf_ctr = NHMEX_C0_MSR_PMON_CTR0, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_C0_MSR_PMON_GLOBAL_CTL, + .msr_offsets = nhmex_cbox_msr_offsets, + .pair_ctr_ctl = 1, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_cbox_format_group +}; + +static struct uncore_event_desc nhmex_uncore_wbox_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_type nhmex_uncore_wbox = { + .name = "wbox", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_W_MSR_PMON_CNT0, + .perf_ctr = NHMEX_W_MSR_PMON_EVT_SEL0, + .fixed_ctr = NHMEX_W_MSR_PMON_FIXED_CTR, + .fixed_ctl = NHMEX_W_MSR_PMON_FIXED_CTL, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_W_MSR_GLOBAL_CTL, + .pair_ctr_ctl = 1, + .event_descs = nhmex_uncore_wbox_events, + .ops = &nhmex_uncore_ops, + .format_group = &nhmex_uncore_cbox_format_group +}; + +static int nhmex_bbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int ctr, ev_sel; + + ctr = (hwc->config & NHMEX_B_PMON_CTR_MASK) >> + NHMEX_B_PMON_CTR_SHIFT; + ev_sel = (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK) >> + NHMEX_B_PMON_CTL_EV_SEL_SHIFT; + + /* events that do not use the match/mask registers */ + if ((ctr == 0 && ev_sel > 0x3) || (ctr == 1 && ev_sel > 0x6) || + (ctr == 2 && ev_sel != 0x4) || ctr == 3) + return 0; + + if (box->pmu->pmu_idx == 0) + reg1->reg = NHMEX_B0_MSR_MATCH; + else + reg1->reg = NHMEX_B1_MSR_MATCH; + reg1->idx = 0; + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; + return 0; +} + +static void nhmex_bbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + if (reg1->idx != EXTRA_REG_NONE) { + wrmsrl(reg1->reg, reg1->config); + wrmsrl(reg1->reg + 1, reg2->config); + } + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + (hwc->config & NHMEX_B_PMON_CTL_EV_SEL_MASK)); +} + +/* + * The Bbox has 4 counters, but each counter monitors different events. + * Use bits 6-7 in the event config to select counter. + */ +static struct event_constraint nhmex_uncore_bbox_constraints[] = { + EVENT_CONSTRAINT(0 , 1, 0xc0), + EVENT_CONSTRAINT(0x40, 2, 0xc0), + EVENT_CONSTRAINT(0x80, 4, 0xc0), + EVENT_CONSTRAINT(0xc0, 8, 0xc0), + EVENT_CONSTRAINT_END, +}; + +static struct attribute *nhmex_uncore_bbox_formats_attr[] = { + &format_attr_event5.attr, + &format_attr_counter.attr, + &format_attr_match.attr, + &format_attr_mask.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_bbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_bbox_formats_attr, +}; + +static struct intel_uncore_ops nhmex_uncore_bbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_bbox_msr_enable_event, + .hw_config = nhmex_bbox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_bbox = { + .name = "bbox", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_B0_MSR_PMON_CTL0, + .perf_ctr = NHMEX_B0_MSR_PMON_CTR0, + .event_mask = NHMEX_B_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_B0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_B_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 1, + .constraints = nhmex_uncore_bbox_constraints, + .ops = &nhmex_uncore_bbox_ops, + .format_group = &nhmex_uncore_bbox_format_group +}; + +static int nhmex_sbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + + if (event->attr.config & NHMEX_S_PMON_MM_CFG_EN) { + reg1->config = event->attr.config1; + reg2->config = event->attr.config2; + } else { + reg1->config = ~0ULL; + reg2->config = ~0ULL; + } + + if (box->pmu->pmu_idx == 0) + reg1->reg = NHMEX_S0_MSR_MM_CFG; + else + reg1->reg = NHMEX_S1_MSR_MM_CFG; + + reg1->idx = 0; + + return 0; +} + +static void nhmex_sbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + + wrmsrl(reg1->reg, 0); + if (reg1->config != ~0ULL || reg2->config != ~0ULL) { + wrmsrl(reg1->reg + 1, reg1->config); + wrmsrl(reg1->reg + 2, reg2->config); + wrmsrl(reg1->reg, NHMEX_S_PMON_MM_CFG_EN); + } + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT22); +} + +static struct attribute *nhmex_uncore_sbox_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_mm_cfg.attr, + &format_attr_match.attr, + &format_attr_mask.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_sbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_sbox_formats_attr, +}; + +static struct intel_uncore_ops nhmex_uncore_sbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_sbox_msr_enable_event, + .hw_config = nhmex_sbox_hw_config, + .get_constraint = uncore_get_constraint, + .put_constraint = uncore_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_sbox = { + .name = "sbox", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_S0_MSR_PMON_CTL0, + .perf_ctr = NHMEX_S0_MSR_PMON_CTR0, + .event_mask = NHMEX_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_S0_MSR_PMON_GLOBAL_CTL, + .msr_offset = NHMEX_S_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 1, + .ops = &nhmex_uncore_sbox_ops, + .format_group = &nhmex_uncore_sbox_format_group +}; + +enum { + EXTRA_REG_NHMEX_M_FILTER, + EXTRA_REG_NHMEX_M_DSP, + EXTRA_REG_NHMEX_M_ISS, + EXTRA_REG_NHMEX_M_MAP, + EXTRA_REG_NHMEX_M_MSC_THR, + EXTRA_REG_NHMEX_M_PGT, + EXTRA_REG_NHMEX_M_PLD, + EXTRA_REG_NHMEX_M_ZDP_CTL_FVC, +}; + +static struct extra_reg nhmex_uncore_mbox_extra_regs[] = { + MBOX_INC_SEL_EXTAR_REG(0x0, DSP), + MBOX_INC_SEL_EXTAR_REG(0x4, MSC_THR), + MBOX_INC_SEL_EXTAR_REG(0x5, MSC_THR), + MBOX_INC_SEL_EXTAR_REG(0x9, ISS), + /* event 0xa uses two extra registers */ + MBOX_INC_SEL_EXTAR_REG(0xa, ISS), + MBOX_INC_SEL_EXTAR_REG(0xa, PLD), + MBOX_INC_SEL_EXTAR_REG(0xb, PLD), + /* events 0xd ~ 0x10 use the same extra register */ + MBOX_INC_SEL_EXTAR_REG(0xd, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0xe, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0xf, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0x10, ZDP_CTL_FVC), + MBOX_INC_SEL_EXTAR_REG(0x16, PGT), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x0, DSP), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x1, ISS), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x5, PGT), + MBOX_SET_FLAG_SEL_EXTRA_REG(0x6, MAP), + EVENT_EXTRA_END +}; + +/* Nehalem-EX or Westmere-EX ? */ +bool uncore_nhmex; + +static bool nhmex_mbox_get_shared_reg(struct intel_uncore_box *box, int idx, u64 config) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + bool ret = false; + u64 mask; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + er = &box->shared_regs[idx]; + spin_lock_irqsave(&er->lock, flags); + if (!atomic_read(&er->ref) || er->config == config) { + atomic_inc(&er->ref); + er->config = config; + ret = true; + } + spin_unlock_irqrestore(&er->lock, flags); + + return ret; + } + /* + * The ZDP_CTL_FVC MSR has 4 fields which are used to control + * events 0xd ~ 0x10. Besides these 4 fields, there are additional + * fields which are shared. + */ + idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (WARN_ON_ONCE(idx >= 4)) + return false; + + /* mask of the shared fields */ + if (uncore_nhmex) + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK; + else + mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK; + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + + spin_lock_irqsave(&er->lock, flags); + /* add mask of the non-shared field if it's in use */ + if (__BITS_VALUE(atomic_read(&er->ref), idx, 8)) { + if (uncore_nhmex) + mask |= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + mask |= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + } + if (!atomic_read(&er->ref) || !((er->config ^ config) & mask)) { + atomic_add(1 << (idx * 8), &er->ref); + if (uncore_nhmex) + mask = NHMEX_M_PMON_ZDP_CTL_FVC_MASK | + NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + mask = WSMEX_M_PMON_ZDP_CTL_FVC_MASK | + WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + er->config &= ~mask; + er->config |= (config & mask); + ret = true; + } + spin_unlock_irqrestore(&er->lock, flags); + + return ret; +} + +static void nhmex_mbox_put_shared_reg(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + er = &box->shared_regs[idx]; + atomic_dec(&er->ref); + return; + } + + idx -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + atomic_sub(1 << (idx * 8), &er->ref); +} + +u64 nhmex_mbox_alter_er(struct perf_event *event, int new_idx, bool modify) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + int idx, orig_idx = __BITS_VALUE(reg1->idx, 0, 8); + u64 config = reg1->config; + + /* get the non-shared control bits and shift them */ + idx = orig_idx - EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (uncore_nhmex) + config &= NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + else + config &= WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(idx); + if (new_idx > orig_idx) { + idx = new_idx - orig_idx; + config <<= 3 * idx; + } else { + idx = orig_idx - new_idx; + config >>= 3 * idx; + } + + /* add the shared control bits back */ + if (uncore_nhmex) + config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + else + config |= WSMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + config |= NHMEX_M_PMON_ZDP_CTL_FVC_MASK & reg1->config; + if (modify) { + /* adjust the main event selector */ + if (new_idx > orig_idx) + hwc->config += idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; + else + hwc->config -= idx << NHMEX_M_PMON_CTL_INC_SEL_SHIFT; + reg1->config = config; + reg1->idx = ~0xff | new_idx; + } + return config; +} + +static struct event_constraint * +nhmex_mbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + int i, idx[2], alloc = 0; + u64 config1 = reg1->config; + + idx[0] = __BITS_VALUE(reg1->idx, 0, 8); + idx[1] = __BITS_VALUE(reg1->idx, 1, 8); +again: + for (i = 0; i < 2; i++) { + if (!uncore_box_is_fake(box) && (reg1->alloc & (0x1 << i))) + idx[i] = 0xff; + + if (idx[i] == 0xff) + continue; + + if (!nhmex_mbox_get_shared_reg(box, idx[i], + __BITS_VALUE(config1, i, 32))) + goto fail; + alloc |= (0x1 << i); + } + + /* for the match/mask registers */ + if ((uncore_box_is_fake(box) || !reg2->alloc) && + !nhmex_mbox_get_shared_reg(box, reg2->idx, reg2->config)) + goto fail; + + /* + * If it's a fake box -- as per validate_{group,event}() we + * shouldn't touch event state and we can avoid doing so + * since both will only call get_event_constraints() once + * on each event, this avoids the need for reg->alloc. + */ + if (!uncore_box_is_fake(box)) { + if (idx[0] != 0xff && idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) + nhmex_mbox_alter_er(event, idx[0], true); + reg1->alloc |= alloc; + reg2->alloc = 1; + } + return NULL; +fail: + if (idx[0] != 0xff && !(alloc & 0x1) && + idx[0] >= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) { + /* + * events 0xd ~ 0x10 are functional identical, but are + * controlled by different fields in the ZDP_CTL_FVC + * register. If we failed to take one field, try the + * rest 3 choices. + */ + BUG_ON(__BITS_VALUE(reg1->idx, 1, 8) != 0xff); + idx[0] -= EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + idx[0] = (idx[0] + 1) % 4; + idx[0] += EXTRA_REG_NHMEX_M_ZDP_CTL_FVC; + if (idx[0] != __BITS_VALUE(reg1->idx, 0, 8)) { + config1 = nhmex_mbox_alter_er(event, idx[0], false); + goto again; + } + } + + if (alloc & 0x1) + nhmex_mbox_put_shared_reg(box, idx[0]); + if (alloc & 0x2) + nhmex_mbox_put_shared_reg(box, idx[1]); + return &constraint_empty; +} + +static void nhmex_mbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + + if (uncore_box_is_fake(box)) + return; + + if (reg1->alloc & 0x1) + nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 0, 8)); + if (reg1->alloc & 0x2) + nhmex_mbox_put_shared_reg(box, __BITS_VALUE(reg1->idx, 1, 8)); + reg1->alloc = 0; + + if (reg2->alloc) { + nhmex_mbox_put_shared_reg(box, reg2->idx); + reg2->alloc = 0; + } +} + +static int nhmex_mbox_extra_reg_idx(struct extra_reg *er) +{ + if (er->idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) + return er->idx; + return er->idx + (er->event >> NHMEX_M_PMON_CTL_INC_SEL_SHIFT) - 0xd; +} + +static int nhmex_mbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_type *type = box->pmu->type; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + struct extra_reg *er; + unsigned msr; + int reg_idx = 0; + + if (WARN_ON_ONCE(reg1->idx != -1)) + return -EINVAL; + /* + * The mbox events may require 2 extra MSRs at the most. But only + * the lower 32 bits in these MSRs are significant, so we can use + * config1 to pass two MSRs' config. + */ + for (er = nhmex_uncore_mbox_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + if (event->attr.config1 & ~er->valid_mask) + return -EINVAL; + if (er->idx == __BITS_VALUE(reg1->idx, 0, 8) || + er->idx == __BITS_VALUE(reg1->idx, 1, 8)) + continue; + if (WARN_ON_ONCE(reg_idx >= 2)) + return -EINVAL; + + msr = er->msr + type->msr_offset * box->pmu->pmu_idx; + if (WARN_ON_ONCE(msr >= 0xffff || er->idx >= 0xff)) + return -EINVAL; + + /* always use the 32~63 bits to pass the PLD config */ + if (er->idx == EXTRA_REG_NHMEX_M_PLD) + reg_idx = 1; + + reg1->idx &= ~(0xff << (reg_idx * 8)); + reg1->reg &= ~(0xffff << (reg_idx * 16)); + reg1->idx |= nhmex_mbox_extra_reg_idx(er) << (reg_idx * 8); + reg1->reg |= msr << (reg_idx * 16); + reg1->config = event->attr.config1; + reg_idx++; + } + /* use config2 to pass the filter config */ + reg2->idx = EXTRA_REG_NHMEX_M_FILTER; + if (event->attr.config2 & NHMEX_M_PMON_MM_CFG_EN) + reg2->config = event->attr.config2; + else + reg2->config = ~0ULL; + if (box->pmu->pmu_idx == 0) + reg2->reg = NHMEX_M0_MSR_PMU_MM_CFG; + else + reg2->reg = NHMEX_M1_MSR_PMU_MM_CFG; + + return 0; +} + +static u64 nhmex_mbox_shared_reg_config(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + u64 config; + + if (idx < EXTRA_REG_NHMEX_M_ZDP_CTL_FVC) + return box->shared_regs[idx].config; + + er = &box->shared_regs[EXTRA_REG_NHMEX_M_ZDP_CTL_FVC]; + spin_lock_irqsave(&er->lock, flags); + config = er->config; + spin_unlock_irqrestore(&er->lock, flags); + return config; +} + +static void nhmex_mbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int idx; + + idx = __BITS_VALUE(reg1->idx, 0, 8); + if (idx != 0xff) + wrmsrl(__BITS_VALUE(reg1->reg, 0, 16), + nhmex_mbox_shared_reg_config(box, idx)); + idx = __BITS_VALUE(reg1->idx, 1, 8); + if (idx != 0xff) + wrmsrl(__BITS_VALUE(reg1->reg, 1, 16), + nhmex_mbox_shared_reg_config(box, idx)); + + wrmsrl(reg2->reg, 0); + if (reg2->config != ~0ULL) { + wrmsrl(reg2->reg + 1, + reg2->config & NHMEX_M_PMON_ADDR_MATCH_MASK); + wrmsrl(reg2->reg + 2, NHMEX_M_PMON_ADDR_MASK_MASK & + (reg2->config >> NHMEX_M_PMON_ADDR_MASK_SHIFT)); + wrmsrl(reg2->reg, NHMEX_M_PMON_MM_CFG_EN); + } + + wrmsrl(hwc->config_base, hwc->config | NHMEX_PMON_CTL_EN_BIT0); +} + +DEFINE_UNCORE_FORMAT_ATTR(count_mode, count_mode, "config:2-3"); +DEFINE_UNCORE_FORMAT_ATTR(storage_mode, storage_mode, "config:4-5"); +DEFINE_UNCORE_FORMAT_ATTR(wrap_mode, wrap_mode, "config:6"); +DEFINE_UNCORE_FORMAT_ATTR(flag_mode, flag_mode, "config:7"); +DEFINE_UNCORE_FORMAT_ATTR(inc_sel, inc_sel, "config:9-13"); +DEFINE_UNCORE_FORMAT_ATTR(set_flag_sel, set_flag_sel, "config:19-21"); +DEFINE_UNCORE_FORMAT_ATTR(filter_cfg, filter_cfg, "config2:63"); +DEFINE_UNCORE_FORMAT_ATTR(filter_match, filter_match, "config2:0-33"); +DEFINE_UNCORE_FORMAT_ATTR(filter_mask, filter_mask, "config2:34-61"); +DEFINE_UNCORE_FORMAT_ATTR(dsp, dsp, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(thr, thr, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(fvc, fvc, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pgt, pgt, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(map, map, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(iss, iss, "config1:0-31"); +DEFINE_UNCORE_FORMAT_ATTR(pld, pld, "config1:32-63"); + +static struct attribute *nhmex_uncore_mbox_formats_attr[] = { + &format_attr_count_mode.attr, + &format_attr_storage_mode.attr, + &format_attr_wrap_mode.attr, + &format_attr_flag_mode.attr, + &format_attr_inc_sel.attr, + &format_attr_set_flag_sel.attr, + &format_attr_filter_cfg.attr, + &format_attr_filter_match.attr, + &format_attr_filter_mask.attr, + &format_attr_dsp.attr, + &format_attr_thr.attr, + &format_attr_fvc.attr, + &format_attr_pgt.attr, + &format_attr_map.attr, + &format_attr_iss.attr, + &format_attr_pld.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_mbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_mbox_formats_attr, +}; + +static struct uncore_event_desc nhmex_uncore_mbox_events[] = { + INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x2800"), + INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x2820"), + { /* end: all zeroes */ }, +}; + +static struct uncore_event_desc wsmex_uncore_mbox_events[] = { + INTEL_UNCORE_EVENT_DESC(bbox_cmds_read, "inc_sel=0xd,fvc=0x5000"), + INTEL_UNCORE_EVENT_DESC(bbox_cmds_write, "inc_sel=0xd,fvc=0x5040"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhmex_uncore_mbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_mbox_msr_enable_event, + .hw_config = nhmex_mbox_hw_config, + .get_constraint = nhmex_mbox_get_constraint, + .put_constraint = nhmex_mbox_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_mbox = { + .name = "mbox", + .num_counters = 6, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_M0_MSR_PMU_CTL0, + .perf_ctr = NHMEX_M0_MSR_PMU_CNT0, + .event_mask = NHMEX_M_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_M0_MSR_GLOBAL_CTL, + .msr_offset = NHMEX_M_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 8, + .event_descs = nhmex_uncore_mbox_events, + .ops = &nhmex_uncore_mbox_ops, + .format_group = &nhmex_uncore_mbox_format_group, +}; + +void nhmex_rbox_alter_er(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + int port; + + /* adjust the main event selector */ + if (reg1->idx % 2) { + reg1->idx--; + hwc->config -= 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + } else { + reg1->idx++; + hwc->config += 1 << NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + } + + /* adjust address or config of extra register */ + port = reg1->idx / 6 + box->pmu->pmu_idx * 4; + switch (reg1->idx % 6) { + case 0: + reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG0(port); + break; + case 1: + reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG1(port); + break; + case 2: + /* the 8~15 bits to the 0~7 bits */ + reg1->config >>= 8; + break; + case 3: + /* the 0~7 bits to the 8~15 bits */ + reg1->config <<= 8; + break; + case 4: + reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port); + break; + case 5: + reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port); + break; + }; +} + +/* + * Each rbox has 4 event set which monitor PQI port 0~3 or 4~7. + * An event set consists of 6 events, the 3rd and 4th events in + * an event set use the same extra register. So an event set uses + * 5 extra registers. + */ +static struct event_constraint * +nhmex_rbox_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + struct intel_uncore_extra_reg *er; + unsigned long flags; + int idx, er_idx; + u64 config1; + bool ok = false; + + if (!uncore_box_is_fake(box) && reg1->alloc) + return NULL; + + idx = reg1->idx % 6; + config1 = reg1->config; +again: + er_idx = idx; + /* the 3rd and 4th events use the same extra register */ + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + er = &box->shared_regs[er_idx]; + spin_lock_irqsave(&er->lock, flags); + if (idx < 2) { + if (!atomic_read(&er->ref) || er->config == reg1->config) { + atomic_inc(&er->ref); + er->config = reg1->config; + ok = true; + } + } else if (idx == 2 || idx == 3) { + /* + * these two events use different fields in a extra register, + * the 0~7 bits and the 8~15 bits respectively. + */ + u64 mask = 0xff << ((idx - 2) * 8); + if (!__BITS_VALUE(atomic_read(&er->ref), idx - 2, 8) || + !((er->config ^ config1) & mask)) { + atomic_add(1 << ((idx - 2) * 8), &er->ref); + er->config &= ~mask; + er->config |= config1 & mask; + ok = true; + } + } else { + if (!atomic_read(&er->ref) || + (er->config == (hwc->config >> 32) && + er->config1 == reg1->config && + er->config2 == reg2->config)) { + atomic_inc(&er->ref); + er->config = (hwc->config >> 32); + er->config1 = reg1->config; + er->config2 = reg2->config; + ok = true; + } + } + spin_unlock_irqrestore(&er->lock, flags); + + if (!ok) { + /* + * The Rbox events are always in pairs. The paired + * events are functional identical, but use different + * extra registers. If we failed to take an extra + * register, try the alternative. + */ + if (idx % 2) + idx--; + else + idx++; + if (idx != reg1->idx % 6) { + if (idx == 2) + config1 >>= 8; + else if (idx == 3) + config1 <<= 8; + goto again; + } + } else { + if (!uncore_box_is_fake(box)) { + if (idx != reg1->idx % 6) + nhmex_rbox_alter_er(box, event); + reg1->alloc = 1; + } + return NULL; + } + return &constraint_empty; +} + +static void nhmex_rbox_put_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_extra_reg *er; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + int idx, er_idx; + + if (uncore_box_is_fake(box) || !reg1->alloc) + return; + + idx = reg1->idx % 6; + er_idx = idx; + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + er = &box->shared_regs[er_idx]; + if (idx == 2 || idx == 3) + atomic_sub(1 << ((idx - 2) * 8), &er->ref); + else + atomic_dec(&er->ref); + + reg1->alloc = 0; +} + +static int nhmex_rbox_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct hw_perf_event_extra *reg2 = &event->hw.branch_reg; + int port, idx; + + idx = (event->hw.config & NHMEX_R_PMON_CTL_EV_SEL_MASK) >> + NHMEX_R_PMON_CTL_EV_SEL_SHIFT; + if (idx >= 0x18) + return -EINVAL; + + reg1->idx = idx; + reg1->config = event->attr.config1; + + port = idx / 6 + box->pmu->pmu_idx * 4; + idx %= 6; + switch (idx) { + case 0: + reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG0(port); + break; + case 1: + reg1->reg = NHMEX_R_MSR_PORTN_IPERF_CFG1(port); + break; + case 2: + case 3: + reg1->reg = NHMEX_R_MSR_PORTN_QLX_CFG(port); + break; + case 4: + case 5: + if (idx == 4) + reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(port); + else + reg1->reg = NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(port); + reg2->config = event->attr.config2; + hwc->config |= event->attr.config & (~0ULL << 32); + break; + }; + return 0; +} + +static u64 nhmex_rbox_shared_reg_config(struct intel_uncore_box *box, int idx) +{ + struct intel_uncore_extra_reg *er; + unsigned long flags; + u64 config; + + er = &box->shared_regs[idx]; + + spin_lock_irqsave(&er->lock, flags); + config = er->config; + spin_unlock_irqrestore(&er->lock, flags); + + return config; +} + +static void nhmex_rbox_msr_enable_event(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + struct hw_perf_event_extra *reg1 = &hwc->extra_reg; + struct hw_perf_event_extra *reg2 = &hwc->branch_reg; + int idx, er_idx; + + idx = reg1->idx % 6; + er_idx = idx; + if (er_idx > 2) + er_idx--; + er_idx += (reg1->idx / 6) * 5; + + switch (idx) { + case 0: + case 1: + wrmsrl(reg1->reg, reg1->config); + break; + case 2: + case 3: + wrmsrl(reg1->reg, nhmex_rbox_shared_reg_config(box, er_idx)); + break; + case 4: + case 5: + wrmsrl(reg1->reg, reg1->config); + wrmsrl(reg1->reg + 1, hwc->config >> 32); + wrmsrl(reg1->reg + 2, reg2->config); + break; + }; + + wrmsrl(hwc->config_base, NHMEX_PMON_CTL_EN_BIT0 | + (hwc->config & NHMEX_R_PMON_CTL_EV_SEL_MASK)); +} + +DEFINE_UNCORE_FORMAT_ATTR(xbr_match, xbr_match, "config:32-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_mm_cfg, xbr_mm_cfg, "config1:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(xbr_mask, xbr_mask, "config2:0-63"); +DEFINE_UNCORE_FORMAT_ATTR(qlx_cfg, qlx_cfg, "config1:0-15"); +DEFINE_UNCORE_FORMAT_ATTR(iperf_cfg, iperf_cfg, "config1:0-31"); + +static struct attribute *nhmex_uncore_rbox_formats_attr[] = { + &format_attr_event5.attr, + &format_attr_xbr_mm_cfg.attr, + &format_attr_xbr_match.attr, + &format_attr_xbr_mask.attr, + &format_attr_qlx_cfg.attr, + &format_attr_iperf_cfg.attr, + NULL, +}; + +static struct attribute_group nhmex_uncore_rbox_format_group = { + .name = "format", + .attrs = nhmex_uncore_rbox_formats_attr, +}; + +static struct uncore_event_desc nhmex_uncore_rbox_events[] = { + INTEL_UNCORE_EVENT_DESC(qpi0_flit_send, "event=0x0,iperf_cfg=0x80000000"), + INTEL_UNCORE_EVENT_DESC(qpi1_filt_send, "event=0x6,iperf_cfg=0x80000000"), + INTEL_UNCORE_EVENT_DESC(qpi0_idle_filt, "event=0x0,iperf_cfg=0x40000000"), + INTEL_UNCORE_EVENT_DESC(qpi1_idle_filt, "event=0x6,iperf_cfg=0x40000000"), + INTEL_UNCORE_EVENT_DESC(qpi0_date_response, "event=0x0,iperf_cfg=0xc4"), + INTEL_UNCORE_EVENT_DESC(qpi1_date_response, "event=0x6,iperf_cfg=0xc4"), + { /* end: all zeroes */ }, +}; + +static struct intel_uncore_ops nhmex_uncore_rbox_ops = { + NHMEX_UNCORE_OPS_COMMON_INIT(), + .enable_event = nhmex_rbox_msr_enable_event, + .hw_config = nhmex_rbox_hw_config, + .get_constraint = nhmex_rbox_get_constraint, + .put_constraint = nhmex_rbox_put_constraint, +}; + +static struct intel_uncore_type nhmex_uncore_rbox = { + .name = "rbox", + .num_counters = 8, + .num_boxes = 2, + .perf_ctr_bits = 48, + .event_ctl = NHMEX_R_MSR_PMON_CTL0, + .perf_ctr = NHMEX_R_MSR_PMON_CNT0, + .event_mask = NHMEX_R_PMON_RAW_EVENT_MASK, + .box_ctl = NHMEX_R_MSR_GLOBAL_CTL, + .msr_offset = NHMEX_R_MSR_OFFSET, + .pair_ctr_ctl = 1, + .num_shared_regs = 20, + .event_descs = nhmex_uncore_rbox_events, + .ops = &nhmex_uncore_rbox_ops, + .format_group = &nhmex_uncore_rbox_format_group +}; + +static struct intel_uncore_type *nhmex_msr_uncores[] = { + &nhmex_uncore_ubox, + &nhmex_uncore_cbox, + &nhmex_uncore_bbox, + &nhmex_uncore_sbox, + &nhmex_uncore_mbox, + &nhmex_uncore_rbox, + &nhmex_uncore_wbox, + NULL, +}; +/* end of Nehalem-EX uncore support */ + +static void uncore_assign_hw_event(struct intel_uncore_box *box, struct perf_event *event, int idx) +{ + struct hw_perf_event *hwc = &event->hw; + + hwc->idx = idx; + hwc->last_tag = ++box->tags[idx]; + + if (hwc->idx == UNCORE_PMC_IDX_FIXED) { + hwc->event_base = uncore_fixed_ctr(box); + hwc->config_base = uncore_fixed_ctl(box); + return; + } + + hwc->config_base = uncore_event_ctl(box, hwc->idx); + hwc->event_base = uncore_perf_ctr(box, hwc->idx); +} + +static void uncore_perf_event_update(struct intel_uncore_box *box, struct perf_event *event) +{ + u64 prev_count, new_count, delta; + int shift; + + if (event->hw.idx >= UNCORE_PMC_IDX_FIXED) + shift = 64 - uncore_fixed_ctr_bits(box); + else + shift = 64 - uncore_perf_ctr_bits(box); + + /* the hrtimer might modify the previous event value */ +again: + prev_count = local64_read(&event->hw.prev_count); + new_count = uncore_read_counter(box, event); + if (local64_xchg(&event->hw.prev_count, new_count) != prev_count) + goto again; + + delta = (new_count << shift) - (prev_count << shift); + delta >>= shift; + + local64_add(delta, &event->count); +} + +/* + * The overflow interrupt is unavailable for SandyBridge-EP, is broken + * for SandyBridge. So we use hrtimer to periodically poll the counter + * to avoid overflow. + */ +static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) +{ + struct intel_uncore_box *box; + unsigned long flags; + int bit; + + box = container_of(hrtimer, struct intel_uncore_box, hrtimer); + if (!box->n_active || box->cpu != smp_processor_id()) + return HRTIMER_NORESTART; + /* + * disable local interrupt to prevent uncore_pmu_event_start/stop + * to interrupt the update process + */ + local_irq_save(flags); + + for_each_set_bit(bit, box->active_mask, UNCORE_PMC_IDX_MAX) + uncore_perf_event_update(box, box->events[bit]); + + local_irq_restore(flags); + + hrtimer_forward_now(hrtimer, ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL)); + return HRTIMER_RESTART; +} + +static void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) +{ + __hrtimer_start_range_ns(&box->hrtimer, + ns_to_ktime(UNCORE_PMU_HRTIMER_INTERVAL), 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_cancel(&box->hrtimer); +} + +static void uncore_pmu_init_hrtimer(struct intel_uncore_box *box) +{ + hrtimer_init(&box->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + box->hrtimer.function = uncore_pmu_hrtimer; +} + +struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cpu) +{ + struct intel_uncore_box *box; + int i, size; + + size = sizeof(*box) + type->num_shared_regs * sizeof(struct intel_uncore_extra_reg); + + box = kmalloc_node(size, GFP_KERNEL | __GFP_ZERO, cpu_to_node(cpu)); + if (!box) + return NULL; + + for (i = 0; i < type->num_shared_regs; i++) + spin_lock_init(&box->shared_regs[i].lock); + + uncore_pmu_init_hrtimer(box); + atomic_set(&box->refcnt, 1); + box->cpu = -1; + box->phys_id = -1; + + return box; +} + +static struct intel_uncore_box * +uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu) +{ + static struct intel_uncore_box *box; + + box = *per_cpu_ptr(pmu->box, cpu); + if (box) + return box; + + spin_lock(&uncore_box_lock); + list_for_each_entry(box, &pmu->box_list, list) { + if (box->phys_id == topology_physical_package_id(cpu)) { + atomic_inc(&box->refcnt); + *per_cpu_ptr(pmu->box, cpu) = box; + break; + } + } + spin_unlock(&uncore_box_lock); + + return *per_cpu_ptr(pmu->box, cpu); +} + +static struct intel_uncore_pmu *uncore_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct intel_uncore_pmu, pmu); +} + +static struct intel_uncore_box *uncore_event_to_box(struct perf_event *event) +{ + /* + * perf core schedules event on the basis of cpu, uncore events are + * collected by one of the cpus inside a physical package. + */ + return uncore_pmu_to_box(uncore_event_to_pmu(event), smp_processor_id()); +} + +static int +uncore_collect_events(struct intel_uncore_box *box, struct perf_event *leader, bool dogrp) +{ + struct perf_event *event; + int n, max_count; + + max_count = box->pmu->type->num_counters; + if (box->pmu->type->fixed_ctl) + max_count++; + + if (box->n_events >= max_count) + return -EINVAL; + + n = box->n_events; + box->event_list[n] = leader; + n++; + if (!dogrp) + return n; + + list_for_each_entry(event, &leader->sibling_list, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF) + continue; + + if (n >= max_count) + return -EINVAL; + + box->event_list[n] = event; + n++; + } + return n; +} + +static struct event_constraint * +uncore_get_event_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + struct intel_uncore_type *type = box->pmu->type; + struct event_constraint *c; + + if (type->ops->get_constraint) { + c = type->ops->get_constraint(box, event); + if (c) + return c; + } + + if (event->hw.config == ~0ULL) + return &constraint_fixed; + + if (type->constraints) { + for_each_event_constraint(c, type->constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &type->unconstrainted; +} + +static void uncore_put_event_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + if (box->pmu->type->ops->put_constraint) + box->pmu->type->ops->put_constraint(box, event); +} + +static int uncore_assign_events(struct intel_uncore_box *box, int assign[], int n) +{ + unsigned long used_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + struct event_constraint *c, *constraints[UNCORE_PMC_IDX_MAX]; + int i, wmin, wmax, ret = 0; + struct hw_perf_event *hwc; + + bitmap_zero(used_mask, UNCORE_PMC_IDX_MAX); + + for (i = 0, wmin = UNCORE_PMC_IDX_MAX, wmax = 0; i < n; i++) { + c = uncore_get_event_constraint(box, box->event_list[i]); + constraints[i] = c; + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); + } + + /* fastpath, try to reuse previous register */ + for (i = 0; i < n; i++) { + hwc = &box->event_list[i]->hw; + c = constraints[i]; + + /* never assigned */ + if (hwc->idx == -1) + break; + + /* constraint still honored */ + if (!test_bit(hwc->idx, c->idxmsk)) + break; + + /* not already used */ + if (test_bit(hwc->idx, used_mask)) + break; + + __set_bit(hwc->idx, used_mask); + if (assign) + assign[i] = hwc->idx; + } + /* slow path */ + if (i != n) + ret = perf_assign_events(constraints, n, wmin, wmax, assign); + + if (!assign || ret) { + for (i = 0; i < n; i++) + uncore_put_event_constraint(box, box->event_list[i]); + } + return ret ? -EINVAL : 0; +} + +static void uncore_pmu_event_start(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int idx = event->hw.idx; + + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) + return; + + if (WARN_ON_ONCE(idx == -1 || idx >= UNCORE_PMC_IDX_MAX)) + return; + + event->hw.state = 0; + box->events[idx] = event; + box->n_active++; + __set_bit(idx, box->active_mask); + + local64_set(&event->hw.prev_count, uncore_read_counter(box, event)); + uncore_enable_event(box, event); + + if (box->n_active == 1) { + uncore_enable_box(box); + uncore_pmu_start_hrtimer(box); + } +} + +static void uncore_pmu_event_stop(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + + if (__test_and_clear_bit(hwc->idx, box->active_mask)) { + uncore_disable_event(box, event); + box->n_active--; + box->events[hwc->idx] = NULL; + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + if (box->n_active == 0) { + uncore_disable_box(box); + uncore_pmu_cancel_hrtimer(box); + } + } + + if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + uncore_perf_event_update(box, event); + hwc->state |= PERF_HES_UPTODATE; + } +} + +static int uncore_pmu_event_add(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + struct hw_perf_event *hwc = &event->hw; + int assign[UNCORE_PMC_IDX_MAX]; + int i, n, ret; + + if (!box) + return -ENODEV; + + ret = n = uncore_collect_events(box, event, false); + if (ret < 0) + return ret; + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + if (!(flags & PERF_EF_START)) + hwc->state |= PERF_HES_ARCH; + + ret = uncore_assign_events(box, assign, n); + if (ret) + return ret; + + /* save events moving to new counters */ + for (i = 0; i < box->n_events; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx == assign[i] && + hwc->last_tag == box->tags[assign[i]]) + continue; + /* + * Ensure we don't accidentally enable a stopped + * counter simply because we rescheduled. + */ + if (hwc->state & PERF_HES_STOPPED) + hwc->state |= PERF_HES_ARCH; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + } + + /* reprogram moved events into new counters */ + for (i = 0; i < n; i++) { + event = box->event_list[i]; + hwc = &event->hw; + + if (hwc->idx != assign[i] || + hwc->last_tag != box->tags[assign[i]]) + uncore_assign_hw_event(box, event, assign[i]); + else if (i < box->n_events) + continue; + + if (hwc->state & PERF_HES_ARCH) + continue; + + uncore_pmu_event_start(event, 0); + } + box->n_events = n; + + return 0; +} + +static void uncore_pmu_event_del(struct perf_event *event, int flags) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + int i; + + uncore_pmu_event_stop(event, PERF_EF_UPDATE); + + for (i = 0; i < box->n_events; i++) { + if (event == box->event_list[i]) { + uncore_put_event_constraint(box, event); + + while (++i < box->n_events) + box->event_list[i - 1] = box->event_list[i]; + + --box->n_events; + break; + } + } + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; +} + +static void uncore_pmu_event_read(struct perf_event *event) +{ + struct intel_uncore_box *box = uncore_event_to_box(event); + uncore_perf_event_update(box, event); +} + +/* + * validation ensures the group can be loaded onto the + * PMU if it was the only group available. + */ +static int uncore_validate_group(struct intel_uncore_pmu *pmu, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader; + struct intel_uncore_box *fake_box; + int ret = -EINVAL, n; + + fake_box = uncore_alloc_box(pmu->type, smp_processor_id()); + if (!fake_box) + return -ENOMEM; + + fake_box->pmu = pmu; + /* + * the event is not yet connected with its + * siblings therefore we must first collect + * existing siblings, then add the new event + * before we can simulate the scheduling + */ + n = uncore_collect_events(fake_box, leader, true); + if (n < 0) + goto out; + + fake_box->n_events = n; + n = uncore_collect_events(fake_box, event, false); + if (n < 0) + goto out; + + fake_box->n_events = n; + + ret = uncore_assign_events(fake_box, NULL, n); +out: + kfree(fake_box); + return ret; +} + +int uncore_pmu_event_init(struct perf_event *event) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + struct hw_perf_event *hwc = &event->hw; + int ret; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + pmu = uncore_event_to_pmu(event); + /* no device found for this pmu */ + if (pmu->func_id < 0) + return -ENOENT; + + /* + * Uncore PMU does measure at all privilege level all the time. + * So it doesn't make sense to specify any exclude bits. + */ + if (event->attr.exclude_user || event->attr.exclude_kernel || + event->attr.exclude_hv || event->attr.exclude_idle) + return -EINVAL; + + /* Sampling not supported yet */ + if (hwc->sample_period) + return -EINVAL; + + /* + * Place all uncore events for a particular physical package + * onto a single cpu + */ + if (event->cpu < 0) + return -EINVAL; + box = uncore_pmu_to_box(pmu, event->cpu); + if (!box || box->cpu < 0) + return -EINVAL; + event->cpu = box->cpu; + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; + + event->hw.idx = -1; + event->hw.last_tag = ~0ULL; + event->hw.extra_reg.idx = EXTRA_REG_NONE; + + if (event->attr.config == UNCORE_FIXED_EVENT) { + /* no fixed counter */ + if (!pmu->type->fixed_ctl) + return -EINVAL; + /* + * if there is only one fixed counter, only the first pmu + * can access the fixed counter + */ + if (pmu->type->single_fixed && pmu->pmu_idx > 0) + return -EINVAL; + hwc->config = ~0ULL; + } else { + hwc->config = event->attr.config & pmu->type->event_mask; + if (pmu->type->ops->hw_config) { + ret = pmu->type->ops->hw_config(box, event); + if (ret) + return ret; + } + } + + if (event->group_leader != event) + ret = uncore_validate_group(pmu, event); + else + ret = 0; + + return ret; +} + +static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu) +{ + int ret; + + pmu->pmu = (struct pmu) { + .attr_groups = pmu->type->attr_groups, + .task_ctx_nr = perf_invalid_context, + .event_init = uncore_pmu_event_init, + .add = uncore_pmu_event_add, + .del = uncore_pmu_event_del, + .start = uncore_pmu_event_start, + .stop = uncore_pmu_event_stop, + .read = uncore_pmu_event_read, + }; + + if (pmu->type->num_boxes == 1) { + if (strlen(pmu->type->name) > 0) + sprintf(pmu->name, "uncore_%s", pmu->type->name); + else + sprintf(pmu->name, "uncore"); + } else { + sprintf(pmu->name, "uncore_%s_%d", pmu->type->name, + pmu->pmu_idx); + } + + ret = perf_pmu_register(&pmu->pmu, pmu->name, -1); + return ret; +} + +static void __init uncore_type_exit(struct intel_uncore_type *type) +{ + int i; + + for (i = 0; i < type->num_boxes; i++) + free_percpu(type->pmus[i].box); + kfree(type->pmus); + type->pmus = NULL; + kfree(type->attr_groups[1]); + type->attr_groups[1] = NULL; +} + +static void uncore_types_exit(struct intel_uncore_type **types) +{ + int i; + for (i = 0; types[i]; i++) + uncore_type_exit(types[i]); +} + +static int __init uncore_type_init(struct intel_uncore_type *type) +{ + struct intel_uncore_pmu *pmus; + struct attribute_group *events_group; + struct attribute **attrs; + int i, j; + + pmus = kzalloc(sizeof(*pmus) * type->num_boxes, GFP_KERNEL); + if (!pmus) + return -ENOMEM; + + type->unconstrainted = (struct event_constraint) + __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, + 0, type->num_counters, 0, 0); + + for (i = 0; i < type->num_boxes; i++) { + pmus[i].func_id = -1; + pmus[i].pmu_idx = i; + pmus[i].type = type; + INIT_LIST_HEAD(&pmus[i].box_list); + pmus[i].box = alloc_percpu(struct intel_uncore_box *); + if (!pmus[i].box) + goto fail; + } + + if (type->event_descs) { + i = 0; + while (type->event_descs[i].attr.attr.name) + i++; + + events_group = kzalloc(sizeof(struct attribute *) * (i + 1) + + sizeof(*events_group), GFP_KERNEL); + if (!events_group) + goto fail; + + attrs = (struct attribute **)(events_group + 1); + events_group->name = "events"; + events_group->attrs = attrs; + + for (j = 0; j < i; j++) + attrs[j] = &type->event_descs[j].attr.attr; + + type->attr_groups[1] = events_group; + } + + type->pmus = pmus; + return 0; +fail: + uncore_type_exit(type); + return -ENOMEM; +} + +static int __init uncore_types_init(struct intel_uncore_type **types) +{ + int i, ret; + + for (i = 0; types[i]; i++) { + ret = uncore_type_init(types[i]); + if (ret) + goto fail; + } + return 0; +fail: + while (--i >= 0) + uncore_type_exit(types[i]); + return ret; +} + +static struct pci_driver *uncore_pci_driver; +static bool pcidrv_registered; + +/* + * add a pci uncore device + */ +static int __devinit uncore_pci_add(struct intel_uncore_type *type, struct pci_dev *pdev) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, phys_id; + + phys_id = pcibus_to_physid[pdev->bus->number]; + if (phys_id < 0) + return -ENODEV; + + box = uncore_alloc_box(type, 0); + if (!box) + return -ENOMEM; + + /* + * for performance monitoring unit with multiple boxes, + * each box has a different function id. + */ + for (i = 0; i < type->num_boxes; i++) { + pmu = &type->pmus[i]; + if (pmu->func_id == pdev->devfn) + break; + if (pmu->func_id < 0) { + pmu->func_id = pdev->devfn; + break; + } + pmu = NULL; + } + + if (!pmu) { + kfree(box); + return -EINVAL; + } + + box->phys_id = phys_id; + box->pci_dev = pdev; + box->pmu = pmu; + uncore_box_init(box); + pci_set_drvdata(pdev, box); + + spin_lock(&uncore_box_lock); + list_add_tail(&box->list, &pmu->box_list); + spin_unlock(&uncore_box_lock); + + return 0; +} + +static void uncore_pci_remove(struct pci_dev *pdev) +{ + struct intel_uncore_box *box = pci_get_drvdata(pdev); + struct intel_uncore_pmu *pmu = box->pmu; + int cpu, phys_id = pcibus_to_physid[pdev->bus->number]; + + if (WARN_ON_ONCE(phys_id != box->phys_id)) + return; + + spin_lock(&uncore_box_lock); + list_del(&box->list); + spin_unlock(&uncore_box_lock); + + for_each_possible_cpu(cpu) { + if (*per_cpu_ptr(pmu->box, cpu) == box) { + *per_cpu_ptr(pmu->box, cpu) = NULL; + atomic_dec(&box->refcnt); + } + } + + WARN_ON_ONCE(atomic_read(&box->refcnt) != 1); + kfree(box); +} + +static int __devinit uncore_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct intel_uncore_type *type; + + type = (struct intel_uncore_type *)id->driver_data; + + return uncore_pci_add(type, pdev); +} + +static int __init uncore_pci_init(void) +{ + int ret; + + switch (boot_cpu_data.x86_model) { + case 45: /* Sandy Bridge-EP */ + pci_uncores = snbep_pci_uncores; + uncore_pci_driver = &snbep_uncore_pci_driver; + snbep_pci2phy_map_init(); + break; + default: + return 0; + } + + ret = uncore_types_init(pci_uncores); + if (ret) + return ret; + + uncore_pci_driver->probe = uncore_pci_probe; + uncore_pci_driver->remove = uncore_pci_remove; + + ret = pci_register_driver(uncore_pci_driver); + if (ret == 0) + pcidrv_registered = true; + else + uncore_types_exit(pci_uncores); + + return ret; +} + +static void __init uncore_pci_exit(void) +{ + if (pcidrv_registered) { + pcidrv_registered = false; + pci_unregister_driver(uncore_pci_driver); + uncore_types_exit(pci_uncores); + } +} + +static void __cpuinit uncore_cpu_dying(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + *per_cpu_ptr(pmu->box, cpu) = NULL; + if (box && atomic_dec_and_test(&box->refcnt)) + kfree(box); + } + } +} + +static int __cpuinit uncore_cpu_starting(int cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box, *exist; + int i, j, k, phys_id; + + phys_id = topology_physical_package_id(cpu); + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + box = *per_cpu_ptr(pmu->box, cpu); + /* called by uncore_cpu_init? */ + if (box && box->phys_id >= 0) { + uncore_box_init(box); + continue; + } + + for_each_online_cpu(k) { + exist = *per_cpu_ptr(pmu->box, k); + if (exist && exist->phys_id == phys_id) { + atomic_inc(&exist->refcnt); + *per_cpu_ptr(pmu->box, cpu) = exist; + kfree(box); + box = NULL; + break; + } + } + + if (box) { + box->phys_id = phys_id; + uncore_box_init(box); + } + } + } + return 0; +} + +static int __cpuinit uncore_cpu_prepare(int cpu, int phys_id) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (pmu->func_id < 0) + pmu->func_id = j; + + box = uncore_alloc_box(type, cpu); + if (!box) + return -ENOMEM; + + box->pmu = pmu; + box->phys_id = phys_id; + *per_cpu_ptr(pmu->box, cpu) = box; + } + } + return 0; +} + +static void __cpuinit +uncore_change_context(struct intel_uncore_type **uncores, int old_cpu, int new_cpu) +{ + struct intel_uncore_type *type; + struct intel_uncore_pmu *pmu; + struct intel_uncore_box *box; + int i, j; + + for (i = 0; uncores[i]; i++) { + type = uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + if (old_cpu < 0) + box = uncore_pmu_to_box(pmu, new_cpu); + else + box = uncore_pmu_to_box(pmu, old_cpu); + if (!box) + continue; + + if (old_cpu < 0) { + WARN_ON_ONCE(box->cpu != -1); + box->cpu = new_cpu; + continue; + } + + WARN_ON_ONCE(box->cpu != old_cpu); + if (new_cpu >= 0) { + uncore_pmu_cancel_hrtimer(box); + perf_pmu_migrate_context(&pmu->pmu, + old_cpu, new_cpu); + box->cpu = new_cpu; + } else { + box->cpu = -1; + } + } + } +} + +static void __cpuinit uncore_event_exit_cpu(int cpu) +{ + int i, phys_id, target; + + /* if exiting cpu is used for collecting uncore events */ + if (!cpumask_test_and_clear_cpu(cpu, &uncore_cpu_mask)) + return; + + /* find a new cpu to collect uncore events */ + phys_id = topology_physical_package_id(cpu); + target = -1; + for_each_online_cpu(i) { + if (i == cpu) + continue; + if (phys_id == topology_physical_package_id(i)) { + target = i; + break; + } + } + + /* migrate uncore events to the new cpu */ + if (target >= 0) + cpumask_set_cpu(target, &uncore_cpu_mask); + + uncore_change_context(msr_uncores, cpu, target); + uncore_change_context(pci_uncores, cpu, target); +} + +static void __cpuinit uncore_event_init_cpu(int cpu) +{ + int i, phys_id; + + phys_id = topology_physical_package_id(cpu); + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) + return; + } + + cpumask_set_cpu(cpu, &uncore_cpu_mask); + + uncore_change_context(msr_uncores, -1, cpu); + uncore_change_context(pci_uncores, -1, cpu); +} + +static int + __cpuinit uncore_cpu_notifier(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + /* allocate/free data structure for uncore box */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + uncore_cpu_prepare(cpu, -1); + break; + case CPU_STARTING: + uncore_cpu_starting(cpu); + break; + case CPU_UP_CANCELED: + case CPU_DYING: + uncore_cpu_dying(cpu); + break; + default: + break; + } + + /* select the cpu that collects uncore events */ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_FAILED: + case CPU_STARTING: + uncore_event_init_cpu(cpu); + break; + case CPU_DOWN_PREPARE: + uncore_event_exit_cpu(cpu); + break; + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block uncore_cpu_nb __cpuinitdata = { + .notifier_call = uncore_cpu_notifier, + /* + * to migrate uncore events, our notifier should be executed + * before perf core's notifier. + */ + .priority = CPU_PRI_PERF + 1, +}; + +static void __init uncore_cpu_setup(void *dummy) +{ + uncore_cpu_starting(smp_processor_id()); +} + +static int __init uncore_cpu_init(void) +{ + int ret, cpu, max_cores; + + max_cores = boot_cpu_data.x86_max_cores; + switch (boot_cpu_data.x86_model) { + case 26: /* Nehalem */ + case 30: + case 37: /* Westmere */ + case 44: + msr_uncores = nhm_msr_uncores; + break; + case 42: /* Sandy Bridge */ + if (snb_uncore_cbox.num_boxes > max_cores) + snb_uncore_cbox.num_boxes = max_cores; + msr_uncores = snb_msr_uncores; + break; + case 45: /* Sandy Birdge-EP */ + if (snbep_uncore_cbox.num_boxes > max_cores) + snbep_uncore_cbox.num_boxes = max_cores; + msr_uncores = snbep_msr_uncores; + break; + case 46: /* Nehalem-EX */ + uncore_nhmex = true; + case 47: /* Westmere-EX aka. Xeon E7 */ + if (!uncore_nhmex) + nhmex_uncore_mbox.event_descs = wsmex_uncore_mbox_events; + if (nhmex_uncore_cbox.num_boxes > max_cores) + nhmex_uncore_cbox.num_boxes = max_cores; + msr_uncores = nhmex_msr_uncores; + break; + default: + return 0; + } + + ret = uncore_types_init(msr_uncores); + if (ret) + return ret; + + get_online_cpus(); + + for_each_online_cpu(cpu) { + int i, phys_id = topology_physical_package_id(cpu); + + for_each_cpu(i, &uncore_cpu_mask) { + if (phys_id == topology_physical_package_id(i)) { + phys_id = -1; + break; + } + } + if (phys_id < 0) + continue; + + uncore_cpu_prepare(cpu, phys_id); + uncore_event_init_cpu(cpu); + } + on_each_cpu(uncore_cpu_setup, NULL, 1); + + register_cpu_notifier(&uncore_cpu_nb); + + put_online_cpus(); + + return 0; +} + +static int __init uncore_pmus_register(void) +{ + struct intel_uncore_pmu *pmu; + struct intel_uncore_type *type; + int i, j; + + for (i = 0; msr_uncores[i]; i++) { + type = msr_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + + for (i = 0; pci_uncores[i]; i++) { + type = pci_uncores[i]; + for (j = 0; j < type->num_boxes; j++) { + pmu = &type->pmus[j]; + uncore_pmu_register(pmu); + } + } + + return 0; +} + +static int __init intel_uncore_init(void) +{ + int ret; + + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return -ENODEV; + + if (cpu_has_hypervisor) + return -ENODEV; + + ret = uncore_pci_init(); + if (ret) + goto fail; + ret = uncore_cpu_init(); + if (ret) { + uncore_pci_exit(); + goto fail; + } + + uncore_pmus_register(); + return 0; +fail: + return ret; +} +device_initcall(intel_uncore_init); diff -uprN linux-2.6.32/arch/x86/kernel/cpu/perf_event_intel_uncore.h linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_intel_uncore.h --- linux-2.6.32/arch/x86/kernel/cpu/perf_event_intel_uncore.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_intel_uncore.h 2015-03-10 13:23:51.000000000 -0700 @@ -0,0 +1,622 @@ +#include +#include +#include +#include +#include "perf_event.h" + +#define UNCORE_PMU_NAME_LEN 32 +#define UNCORE_PMU_HRTIMER_INTERVAL (60LL * NSEC_PER_SEC) + +#define UNCORE_FIXED_EVENT 0xff +#define UNCORE_PMC_IDX_MAX_GENERIC 8 +#define UNCORE_PMC_IDX_FIXED UNCORE_PMC_IDX_MAX_GENERIC +#define UNCORE_PMC_IDX_MAX (UNCORE_PMC_IDX_FIXED + 1) + +#define UNCORE_EVENT_CONSTRAINT(c, n) EVENT_CONSTRAINT(c, n, 0xff) + +/* SNB event control */ +#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff +#define SNB_UNC_CTL_UMASK_MASK 0x0000ff00 +#define SNB_UNC_CTL_EDGE_DET (1 << 18) +#define SNB_UNC_CTL_EN (1 << 22) +#define SNB_UNC_CTL_INVERT (1 << 23) +#define SNB_UNC_CTL_CMASK_MASK 0x1f000000 +#define NHM_UNC_CTL_CMASK_MASK 0xff000000 +#define NHM_UNC_FIXED_CTR_CTL_EN (1 << 0) + +#define SNB_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + SNB_UNC_CTL_CMASK_MASK) + +#define NHM_UNC_RAW_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \ + SNB_UNC_CTL_UMASK_MASK | \ + SNB_UNC_CTL_EDGE_DET | \ + SNB_UNC_CTL_INVERT | \ + NHM_UNC_CTL_CMASK_MASK) + +/* SNB global control register */ +#define SNB_UNC_PERF_GLOBAL_CTL 0x391 +#define SNB_UNC_FIXED_CTR_CTRL 0x394 +#define SNB_UNC_FIXED_CTR 0x395 + +/* SNB uncore global control */ +#define SNB_UNC_GLOBAL_CTL_CORE_ALL ((1 << 4) - 1) +#define SNB_UNC_GLOBAL_CTL_EN (1 << 29) + +/* SNB Cbo register */ +#define SNB_UNC_CBO_0_PERFEVTSEL0 0x700 +#define SNB_UNC_CBO_0_PER_CTR0 0x706 +#define SNB_UNC_CBO_MSR_OFFSET 0x10 + +/* NHM global control register */ +#define NHM_UNC_PERF_GLOBAL_CTL 0x391 +#define NHM_UNC_FIXED_CTR 0x394 +#define NHM_UNC_FIXED_CTR_CTRL 0x395 + +/* NHM uncore global control */ +#define NHM_UNC_GLOBAL_CTL_EN_PC_ALL ((1ULL << 8) - 1) +#define NHM_UNC_GLOBAL_CTL_EN_FC (1ULL << 32) + +/* NHM uncore register */ +#define NHM_UNC_PERFEVTSEL0 0x3c0 +#define NHM_UNC_UNCORE_PMC0 0x3b0 + +/* SNB-EP Box level control */ +#define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) +#define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1) +#define SNBEP_PMON_BOX_CTL_FRZ (1 << 8) +#define SNBEP_PMON_BOX_CTL_FRZ_EN (1 << 16) +#define SNBEP_PMON_BOX_CTL_INT (SNBEP_PMON_BOX_CTL_RST_CTRL | \ + SNBEP_PMON_BOX_CTL_RST_CTRS | \ + SNBEP_PMON_BOX_CTL_FRZ_EN) +/* SNB-EP event control */ +#define SNBEP_PMON_CTL_EV_SEL_MASK 0x000000ff +#define SNBEP_PMON_CTL_UMASK_MASK 0x0000ff00 +#define SNBEP_PMON_CTL_RST (1 << 17) +#define SNBEP_PMON_CTL_EDGE_DET (1 << 18) +#define SNBEP_PMON_CTL_EV_SEL_EXT (1 << 21) /* only for QPI */ +#define SNBEP_PMON_CTL_EN (1 << 22) +#define SNBEP_PMON_CTL_INVERT (1 << 23) +#define SNBEP_PMON_CTL_TRESH_MASK 0xff000000 +#define SNBEP_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PMON_CTL_TRESH_MASK) + +/* SNB-EP Ubox event control */ +#define SNBEP_U_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_U_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_U_MSR_PMON_CTL_TRESH_MASK) + +#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19) +#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +#define SNBEP_CBO_PMON_CTL_TID_EN (1 << 19) +#define SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_CBO_PMON_CTL_TID_EN) + +/* SNB-EP PCU event control */ +#define SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK 0x0000c000 +#define SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK 0x1f000000 +#define SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT (1 << 30) +#define SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET (1 << 31) +#define SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_SEL_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_TRESH_MASK | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ + SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) + +#define SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK \ + (SNBEP_PMON_RAW_EVENT_MASK | \ + SNBEP_PMON_CTL_EV_SEL_EXT) + +/* SNB-EP pci control register */ +#define SNBEP_PCI_PMON_BOX_CTL 0xf4 +#define SNBEP_PCI_PMON_CTL0 0xd8 +/* SNB-EP pci counter register */ +#define SNBEP_PCI_PMON_CTR0 0xa0 + +/* SNB-EP home agent register */ +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH0 0x40 +#define SNBEP_HA_PCI_PMON_BOX_ADDRMATCH1 0x44 +#define SNBEP_HA_PCI_PMON_BOX_OPCODEMATCH 0x48 +/* SNB-EP memory controller register */ +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTL 0xf0 +#define SNBEP_MC_CHy_PCI_PMON_FIXED_CTR 0xd0 +/* SNB-EP QPI register */ +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH0 0x228 +#define SNBEP_Q_Py_PCI_PMON_PKT_MATCH1 0x22c +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK0 0x238 +#define SNBEP_Q_Py_PCI_PMON_PKT_MASK1 0x23c + +/* SNB-EP Ubox register */ +#define SNBEP_U_MSR_PMON_CTR0 0xc16 +#define SNBEP_U_MSR_PMON_CTL0 0xc10 + +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTL 0xc08 +#define SNBEP_U_MSR_PMON_UCLK_FIXED_CTR 0xc09 + +/* SNB-EP Cbo register */ +#define SNBEP_C0_MSR_PMON_CTR0 0xd16 +#define SNBEP_C0_MSR_PMON_CTL0 0xd10 +#define SNBEP_C0_MSR_PMON_BOX_CTL 0xd04 +#define SNBEP_C0_MSR_PMON_BOX_FILTER 0xd14 +#define SNBEP_CB0_MSR_PMON_BOX_FILTER_MASK 0xfffffc1f +#define SNBEP_CBO_MSR_OFFSET 0x20 + +/* SNB-EP PCU register */ +#define SNBEP_PCU_MSR_PMON_CTR0 0xc36 +#define SNBEP_PCU_MSR_PMON_CTL0 0xc30 +#define SNBEP_PCU_MSR_PMON_BOX_CTL 0xc24 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER 0xc34 +#define SNBEP_PCU_MSR_PMON_BOX_FILTER_MASK 0xffffffff +#define SNBEP_PCU_MSR_CORE_C3_CTR 0x3fc +#define SNBEP_PCU_MSR_CORE_C6_CTR 0x3fd + +/* NHM-EX event control */ +#define NHMEX_PMON_CTL_EV_SEL_MASK 0x000000ff +#define NHMEX_PMON_CTL_UMASK_MASK 0x0000ff00 +#define NHMEX_PMON_CTL_EN_BIT0 (1 << 0) +#define NHMEX_PMON_CTL_EDGE_DET (1 << 18) +#define NHMEX_PMON_CTL_PMI_EN (1 << 20) +#define NHMEX_PMON_CTL_EN_BIT22 (1 << 22) +#define NHMEX_PMON_CTL_INVERT (1 << 23) +#define NHMEX_PMON_CTL_TRESH_MASK 0xff000000 +#define NHMEX_PMON_RAW_EVENT_MASK (NHMEX_PMON_CTL_EV_SEL_MASK | \ + NHMEX_PMON_CTL_UMASK_MASK | \ + NHMEX_PMON_CTL_EDGE_DET | \ + NHMEX_PMON_CTL_INVERT | \ + NHMEX_PMON_CTL_TRESH_MASK) + +/* NHM-EX Ubox */ +#define NHMEX_U_MSR_PMON_GLOBAL_CTL 0xc00 +#define NHMEX_U_MSR_PMON_CTR 0xc11 +#define NHMEX_U_MSR_PMON_EV_SEL 0xc10 + +#define NHMEX_U_PMON_GLOBAL_EN (1 << 0) +#define NHMEX_U_PMON_GLOBAL_PMI_CORE_SEL 0x0000001e +#define NHMEX_U_PMON_GLOBAL_EN_ALL (1 << 28) +#define NHMEX_U_PMON_GLOBAL_RST_ALL (1 << 29) +#define NHMEX_U_PMON_GLOBAL_FRZ_ALL (1 << 31) + +#define NHMEX_U_PMON_RAW_EVENT_MASK \ + (NHMEX_PMON_CTL_EV_SEL_MASK | \ + NHMEX_PMON_CTL_EDGE_DET) + +/* NHM-EX Cbox */ +#define NHMEX_C0_MSR_PMON_GLOBAL_CTL 0xd00 +#define NHMEX_C0_MSR_PMON_CTR0 0xd11 +#define NHMEX_C0_MSR_PMON_EV_SEL0 0xd10 +#define NHMEX_C_MSR_OFFSET 0x20 + +/* NHM-EX Bbox */ +#define NHMEX_B0_MSR_PMON_GLOBAL_CTL 0xc20 +#define NHMEX_B0_MSR_PMON_CTR0 0xc31 +#define NHMEX_B0_MSR_PMON_CTL0 0xc30 +#define NHMEX_B_MSR_OFFSET 0x40 +#define NHMEX_B0_MSR_MATCH 0xe45 +#define NHMEX_B0_MSR_MASK 0xe46 +#define NHMEX_B1_MSR_MATCH 0xe4d +#define NHMEX_B1_MSR_MASK 0xe4e + +#define NHMEX_B_PMON_CTL_EN (1 << 0) +#define NHMEX_B_PMON_CTL_EV_SEL_SHIFT 1 +#define NHMEX_B_PMON_CTL_EV_SEL_MASK \ + (0x1f << NHMEX_B_PMON_CTL_EV_SEL_SHIFT) +#define NHMEX_B_PMON_CTR_SHIFT 6 +#define NHMEX_B_PMON_CTR_MASK \ + (0x3 << NHMEX_B_PMON_CTR_SHIFT) +#define NHMEX_B_PMON_RAW_EVENT_MASK \ + (NHMEX_B_PMON_CTL_EV_SEL_MASK | \ + NHMEX_B_PMON_CTR_MASK) + +/* NHM-EX Sbox */ +#define NHMEX_S0_MSR_PMON_GLOBAL_CTL 0xc40 +#define NHMEX_S0_MSR_PMON_CTR0 0xc51 +#define NHMEX_S0_MSR_PMON_CTL0 0xc50 +#define NHMEX_S_MSR_OFFSET 0x80 +#define NHMEX_S0_MSR_MM_CFG 0xe48 +#define NHMEX_S0_MSR_MATCH 0xe49 +#define NHMEX_S0_MSR_MASK 0xe4a +#define NHMEX_S1_MSR_MM_CFG 0xe58 +#define NHMEX_S1_MSR_MATCH 0xe59 +#define NHMEX_S1_MSR_MASK 0xe5a + +#define NHMEX_S_PMON_MM_CFG_EN (0x1ULL << 63) + +/* NHM-EX Mbox */ +#define NHMEX_M0_MSR_GLOBAL_CTL 0xca0 +#define NHMEX_M0_MSR_PMU_DSP 0xca5 +#define NHMEX_M0_MSR_PMU_ISS 0xca6 +#define NHMEX_M0_MSR_PMU_MAP 0xca7 +#define NHMEX_M0_MSR_PMU_MSC_THR 0xca8 +#define NHMEX_M0_MSR_PMU_PGT 0xca9 +#define NHMEX_M0_MSR_PMU_PLD 0xcaa +#define NHMEX_M0_MSR_PMU_ZDP_CTL_FVC 0xcab +#define NHMEX_M0_MSR_PMU_CTL0 0xcb0 +#define NHMEX_M0_MSR_PMU_CNT0 0xcb1 +#define NHMEX_M_MSR_OFFSET 0x40 +#define NHMEX_M0_MSR_PMU_MM_CFG 0xe54 +#define NHMEX_M1_MSR_PMU_MM_CFG 0xe5c + +#define NHMEX_M_PMON_MM_CFG_EN (1ULL << 63) +#define NHMEX_M_PMON_ADDR_MATCH_MASK 0x3ffffffffULL +#define NHMEX_M_PMON_ADDR_MASK_MASK 0x7ffffffULL +#define NHMEX_M_PMON_ADDR_MASK_SHIFT 34 + +#define NHMEX_M_PMON_CTL_EN (1 << 0) +#define NHMEX_M_PMON_CTL_PMI_EN (1 << 1) +#define NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT 2 +#define NHMEX_M_PMON_CTL_COUNT_MODE_MASK \ + (0x3 << NHMEX_M_PMON_CTL_COUNT_MODE_SHIFT) +#define NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT 4 +#define NHMEX_M_PMON_CTL_STORAGE_MODE_MASK \ + (0x3 << NHMEX_M_PMON_CTL_STORAGE_MODE_SHIFT) +#define NHMEX_M_PMON_CTL_WRAP_MODE (1 << 6) +#define NHMEX_M_PMON_CTL_FLAG_MODE (1 << 7) +#define NHMEX_M_PMON_CTL_INC_SEL_SHIFT 9 +#define NHMEX_M_PMON_CTL_INC_SEL_MASK \ + (0x1f << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) +#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT 19 +#define NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK \ + (0x7 << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) +#define NHMEX_M_PMON_RAW_EVENT_MASK \ + (NHMEX_M_PMON_CTL_COUNT_MODE_MASK | \ + NHMEX_M_PMON_CTL_STORAGE_MODE_MASK | \ + NHMEX_M_PMON_CTL_WRAP_MODE | \ + NHMEX_M_PMON_CTL_FLAG_MODE | \ + NHMEX_M_PMON_CTL_INC_SEL_MASK | \ + NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK) + +#define NHMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 11) - 1) | (1 << 23)) +#define NHMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (11 + 3 * (n))) + +#define WSMEX_M_PMON_ZDP_CTL_FVC_MASK (((1 << 12) - 1) | (1 << 24)) +#define WSMEX_M_PMON_ZDP_CTL_FVC_EVENT_MASK(n) (0x7 << (12 + 3 * (n))) + +/* + * use the 9~13 bits to select event If the 7th bit is not set, + * otherwise use the 19~21 bits to select event. + */ +#define MBOX_INC_SEL(x) ((x) << NHMEX_M_PMON_CTL_INC_SEL_SHIFT) +#define MBOX_SET_FLAG_SEL(x) (((x) << NHMEX_M_PMON_CTL_SET_FLAG_SEL_SHIFT) | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_INC_SEL_MASK (NHMEX_M_PMON_CTL_INC_SEL_MASK | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_SET_FLAG_SEL_MASK (NHMEX_M_PMON_CTL_SET_FLAG_SEL_MASK | \ + NHMEX_M_PMON_CTL_FLAG_MODE) +#define MBOX_INC_SEL_EXTAR_REG(c, r) \ + EVENT_EXTRA_REG(MBOX_INC_SEL(c), NHMEX_M0_MSR_PMU_##r, \ + MBOX_INC_SEL_MASK, (u64)-1, NHMEX_M_##r) +#define MBOX_SET_FLAG_SEL_EXTRA_REG(c, r) \ + EVENT_EXTRA_REG(MBOX_SET_FLAG_SEL(c), NHMEX_M0_MSR_PMU_##r, \ + MBOX_SET_FLAG_SEL_MASK, \ + (u64)-1, NHMEX_M_##r) + +/* NHM-EX Rbox */ +#define NHMEX_R_MSR_GLOBAL_CTL 0xe00 +#define NHMEX_R_MSR_PMON_CTL0 0xe10 +#define NHMEX_R_MSR_PMON_CNT0 0xe11 +#define NHMEX_R_MSR_OFFSET 0x20 + +#define NHMEX_R_MSR_PORTN_QLX_CFG(n) \ + ((n) < 4 ? (0xe0c + (n)) : (0xe2c + (n) - 4)) +#define NHMEX_R_MSR_PORTN_IPERF_CFG0(n) (0xe04 + (n)) +#define NHMEX_R_MSR_PORTN_IPERF_CFG1(n) (0xe24 + (n)) +#define NHMEX_R_MSR_PORTN_XBR_OFFSET(n) \ + (((n) < 4 ? 0 : 0x10) + (n) * 4) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) \ + (0xe60 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MATCH(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 1) +#define NHMEX_R_MSR_PORTN_XBR_SET1_MASK(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET1_MM_CFG(n) + 2) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) \ + (0xe70 + NHMEX_R_MSR_PORTN_XBR_OFFSET(n)) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MATCH(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 1) +#define NHMEX_R_MSR_PORTN_XBR_SET2_MASK(n) \ + (NHMEX_R_MSR_PORTN_XBR_SET2_MM_CFG(n) + 2) + +#define NHMEX_R_PMON_CTL_EN (1 << 0) +#define NHMEX_R_PMON_CTL_EV_SEL_SHIFT 1 +#define NHMEX_R_PMON_CTL_EV_SEL_MASK \ + (0x1f << NHMEX_R_PMON_CTL_EV_SEL_SHIFT) +#define NHMEX_R_PMON_CTL_PMI_EN (1 << 6) +#define NHMEX_R_PMON_RAW_EVENT_MASK NHMEX_R_PMON_CTL_EV_SEL_MASK + +/* NHM-EX Wbox */ +#define NHMEX_W_MSR_GLOBAL_CTL 0xc80 +#define NHMEX_W_MSR_PMON_CNT0 0xc90 +#define NHMEX_W_MSR_PMON_EVT_SEL0 0xc91 +#define NHMEX_W_MSR_PMON_FIXED_CTR 0x394 +#define NHMEX_W_MSR_PMON_FIXED_CTL 0x395 + +#define NHMEX_W_PMON_GLOBAL_FIXED_EN (1ULL << 31) + +struct intel_uncore_ops; +struct intel_uncore_pmu; +struct intel_uncore_box; +struct uncore_event_desc; + +struct intel_uncore_type { + const char *name; + int num_counters; + int num_boxes; + int perf_ctr_bits; + int fixed_ctr_bits; + unsigned perf_ctr; + unsigned event_ctl; + unsigned event_mask; + unsigned fixed_ctr; + unsigned fixed_ctl; + unsigned box_ctl; + unsigned msr_offset; + unsigned num_shared_regs:8; + unsigned single_fixed:1; + unsigned pair_ctr_ctl:1; + unsigned *msr_offsets; + struct event_constraint unconstrainted; + struct event_constraint *constraints; + struct intel_uncore_pmu *pmus; + struct intel_uncore_ops *ops; + struct uncore_event_desc *event_descs; + const struct attribute_group *attr_groups[3]; +}; + +#define format_group attr_groups[0] + +struct intel_uncore_ops { + void (*init_box)(struct intel_uncore_box *); + void (*disable_box)(struct intel_uncore_box *); + void (*enable_box)(struct intel_uncore_box *); + void (*disable_event)(struct intel_uncore_box *, struct perf_event *); + void (*enable_event)(struct intel_uncore_box *, struct perf_event *); + u64 (*read_counter)(struct intel_uncore_box *, struct perf_event *); + int (*hw_config)(struct intel_uncore_box *, struct perf_event *); + struct event_constraint *(*get_constraint)(struct intel_uncore_box *, + struct perf_event *); + void (*put_constraint)(struct intel_uncore_box *, struct perf_event *); +}; + +struct intel_uncore_pmu { + struct pmu pmu; + char name[UNCORE_PMU_NAME_LEN]; + int pmu_idx; + int func_id; + struct intel_uncore_type *type; + struct intel_uncore_box ** __percpu box; + struct list_head box_list; +}; + +struct intel_uncore_extra_reg { + spinlock_t lock; + u64 config, config1, config2; + atomic_t ref; +}; + +struct intel_uncore_box { + int phys_id; + int n_active; /* number of active events */ + int n_events; + int cpu; /* cpu to collect events */ + unsigned long flags; + atomic_t refcnt; + struct perf_event *events[UNCORE_PMC_IDX_MAX]; + struct perf_event *event_list[UNCORE_PMC_IDX_MAX]; + unsigned long active_mask[BITS_TO_LONGS(UNCORE_PMC_IDX_MAX)]; + u64 tags[UNCORE_PMC_IDX_MAX]; + struct pci_dev *pci_dev; + struct intel_uncore_pmu *pmu; + struct hrtimer hrtimer; + struct list_head list; + struct intel_uncore_extra_reg shared_regs[0]; +}; + +#define UNCORE_BOX_FLAG_INITIATED 0 + +struct uncore_event_desc { + struct kobj_attribute attr; + const char *config; +}; + +#define INTEL_UNCORE_EVENT_DESC(_name, _config) \ +{ \ + .attr = __ATTR(_name, 0444, uncore_event_show, NULL), \ + .config = _config, \ +} + +#define DEFINE_UNCORE_FORMAT_ATTR(_var, _name, _format) \ +static ssize_t __uncore_##_var##_show(struct kobject *kobj, \ + struct kobj_attribute *attr, \ + char *page) \ +{ \ + BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE); \ + return sprintf(page, _format "\n"); \ +} \ +static struct kobj_attribute format_attr_##_var = \ + __ATTR(_name, 0444, __uncore_##_var##_show, NULL) + + +static ssize_t uncore_event_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct uncore_event_desc *event = + container_of(attr, struct uncore_event_desc, attr); + return sprintf(buf, "%s", event->config); +} + +static inline unsigned uncore_pci_box_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->box_ctl; +} + +static inline unsigned uncore_pci_fixed_ctl(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctl; +} + +static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr; +} + +static inline +unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx) +{ + return idx * 4 + box->pmu->type->event_ctl; +} + +static inline +unsigned uncore_pci_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return idx * 8 + box->pmu->type->perf_ctr; +} + +static inline unsigned uncore_msr_box_offset(struct intel_uncore_box *box) +{ + struct intel_uncore_pmu *pmu = box->pmu; + return pmu->type->msr_offsets ? + pmu->type->msr_offsets[pmu->pmu_idx] : + pmu->type->msr_offset * pmu->pmu_idx; +} + +static inline unsigned uncore_msr_box_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->box_ctl) + return 0; + return box->pmu->type->box_ctl + uncore_msr_box_offset(box); +} + +static inline unsigned uncore_msr_fixed_ctl(struct intel_uncore_box *box) +{ + if (!box->pmu->type->fixed_ctl) + return 0; + return box->pmu->type->fixed_ctl + uncore_msr_box_offset(box); +} + +static inline unsigned uncore_msr_fixed_ctr(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr + uncore_msr_box_offset(box); +} + +static inline +unsigned uncore_msr_event_ctl(struct intel_uncore_box *box, int idx) +{ + return box->pmu->type->event_ctl + + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + + uncore_msr_box_offset(box); +} + +static inline +unsigned uncore_msr_perf_ctr(struct intel_uncore_box *box, int idx) +{ + return box->pmu->type->perf_ctr + + (box->pmu->type->pair_ctr_ctl ? 2 * idx : idx) + + uncore_msr_box_offset(box); +} + +static inline +unsigned uncore_fixed_ctl(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctl(box); + else + return uncore_msr_fixed_ctl(box); +} + +static inline +unsigned uncore_fixed_ctr(struct intel_uncore_box *box) +{ + if (box->pci_dev) + return uncore_pci_fixed_ctr(box); + else + return uncore_msr_fixed_ctr(box); +} + +static inline +unsigned uncore_event_ctl(struct intel_uncore_box *box, int idx) +{ + if (box->pci_dev) + return uncore_pci_event_ctl(box, idx); + else + return uncore_msr_event_ctl(box, idx); +} + +static inline +unsigned uncore_perf_ctr(struct intel_uncore_box *box, int idx) +{ + if (box->pci_dev) + return uncore_pci_perf_ctr(box, idx); + else + return uncore_msr_perf_ctr(box, idx); +} + +static inline int uncore_perf_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->perf_ctr_bits; +} + +static inline int uncore_fixed_ctr_bits(struct intel_uncore_box *box) +{ + return box->pmu->type->fixed_ctr_bits; +} + +static inline int uncore_num_counters(struct intel_uncore_box *box) +{ + return box->pmu->type->num_counters; +} + +static inline void uncore_disable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->disable_box) + box->pmu->type->ops->disable_box(box); +} + +static inline void uncore_enable_box(struct intel_uncore_box *box) +{ + if (box->pmu->type->ops->enable_box) + box->pmu->type->ops->enable_box(box); +} + +static inline void uncore_disable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->disable_event(box, event); +} + +static inline void uncore_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + box->pmu->type->ops->enable_event(box, event); +} + +static inline u64 uncore_read_counter(struct intel_uncore_box *box, + struct perf_event *event) +{ + return box->pmu->type->ops->read_counter(box, event); +} + +static inline void uncore_box_init(struct intel_uncore_box *box) +{ + if (!test_and_set_bit(UNCORE_BOX_FLAG_INITIATED, &box->flags)) { + if (box->pmu->type->ops->init_box) + box->pmu->type->ops->init_box(box); + } +} + +static inline bool uncore_box_is_fake(struct intel_uncore_box *box) +{ + return (box->phys_id < 0); +} diff -uprN linux-2.6.32/arch/x86/kernel/cpu/perf_event_p4.c linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_p4.c --- linux-2.6.32/arch/x86/kernel/cpu/perf_event_p4.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_p4.c 2015-03-10 13:24:08.000000000 -0700 @@ -0,0 +1,1343 @@ +/* + * Netburst Performance Events (P4, old Xeon) + * + * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov + * Copyright (C) 2010 Intel Corporation, Lin Ming + * + * For licencing details see kernel-base/COPYING + */ +#include +#include +#include +#include + +#include "perf_event.h" + +#define P4_CNTR_LIMIT 3 +/* + * array indices: 0,1 - HT threads, used with HT enabled cpu + */ +struct p4_event_bind { + unsigned int opcode; /* Event code and ESCR selector */ + unsigned int escr_msr[2]; /* ESCR MSR for this event */ + unsigned int escr_emask; /* valid ESCR EventMask bits */ + unsigned int shared; /* event is shared across threads */ + char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ +}; + +struct p4_pebs_bind { + unsigned int metric_pebs; + unsigned int metric_vert; +}; + +/* it sets P4_PEBS_ENABLE_UOP_TAG as well */ +#define P4_GEN_PEBS_BIND(name, pebs, vert) \ + [P4_PEBS_METRIC__##name] = { \ + .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \ + .metric_vert = vert, \ + } + +/* + * note we have P4_PEBS_ENABLE_UOP_TAG always set here + * + * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of + * event configuration to find out which values are to be + * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT + * resgisters + */ +static struct p4_pebs_bind p4_pebs_bind_map[] = { + P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), + P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001), + P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001), + P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002), + P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003), + P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010), + P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001), + P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001), + P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002), +}; + +/* + * Note that we don't use CCCR1 here, there is an + * exception for P4_BSQ_ALLOCATION but we just have + * no workaround + * + * consider this binding as resources which particular + * event may borrow, it doesn't contain EventMask, + * Tags and friends -- they are left to a caller + */ +static struct p4_event_bind p4_event_bind_map[] = { + [P4_EVENT_TC_DELIVER_MODE] = { + .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), + .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) | + P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID), + .shared = 1, + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_BPU_FETCH_REQUEST] = { + .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), + .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_ITLB_REFERENCE] = { + .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), + .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) | + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_MEMORY_CANCEL] = { + .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), + .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) | + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF), + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_MEMORY_COMPLETE] = { + .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), + .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) | + P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC), + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_LOAD_PORT_REPLAY] = { + .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), + .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD), + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_STORE_PORT_REPLAY] = { + .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), + .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST), + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_MOB_LOAD_REPLAY] = { + .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), + .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) | + P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_PAGE_WALK_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), + .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS), + .shared = 1, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_BSQ_CACHE_REFERENCE] = { + .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), + .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_IOQ_ALLOCATION] = { + .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ + .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), + .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH), + .cntr = { {2, -1, -1}, {3, -1, -1} }, + }, + [P4_EVENT_FSB_DATA_ACTIVITY] = { + .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER), + .shared = 1, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ + .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), + .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2), + .cntr = { {0, -1, -1}, {1, -1, -1} }, + }, + [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ + .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), + .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2), + .cntr = { {2, -1, -1}, {3, -1, -1} }, + }, + [P4_EVENT_SSE_INPUT_ASSIST] = { + .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_PACKED_SP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_PACKED_DP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_SCALAR_SP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_SCALAR_DP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_64BIT_MMX_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_128BIT_MMX_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_X87_FP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_TC_MISC] = { + .opcode = P4_OPCODE(P4_EVENT_TC_MISC), + .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_GLOBAL_POWER_EVENTS] = { + .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING), + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_TC_MS_XFER] = { + .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), + .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_UOP_QUEUE_WRITES] = { + .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), + .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), + .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_RETIRED_BRANCH_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), + .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT), + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_RESOURCE_STALL] = { + .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), + .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_WC_BUFFER] = { + .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), + .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS), + .shared = 1, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_B2B_CYCLES] = { + .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_BNR] = { + .opcode = P4_OPCODE(P4_EVENT_BNR), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_SNOOP] = { + .opcode = P4_OPCODE(P4_EVENT_SNOOP), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_RESPONSE] = { + .opcode = P4_OPCODE(P4_EVENT_RESPONSE), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .escr_emask = 0, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_FRONT_END_EVENT] = { + .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_EXECUTION_EVENT] = { + .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_REPLAY_EVENT] = { + .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_INSTR_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_UOPS_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_UOP_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), + .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) | + P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_BRANCH_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) | + P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_MISPRED_BRANCH_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_X87_ASSIST] = { + .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) | + P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_MACHINE_CLEAR] = { + .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) | + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) | + P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_INSTR_COMPLETED] = { + .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .escr_emask = + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS), + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, +}; + +#define P4_GEN_CACHE_EVENT(event, bit, metric) \ + p4_config_pack_escr(P4_ESCR_EVENT(event) | \ + P4_ESCR_EMASK_BIT(event, bit)) | \ + p4_config_pack_cccr(metric | \ + P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) + +static __initconst const u64 p4_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__1stl_cache_load_miss_retired), + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__2ndl_cache_load_miss_retired), + }, +}, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__dtlb_load_miss_retired), + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__dtlb_store_miss_retired), + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, + P4_PEBS_METRIC__none), + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, + P4_PEBS_METRIC__none), + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +/* + * Because of Netburst being quite restricted in how many + * identical events may run simultaneously, we introduce event aliases, + * ie the different events which have the same functionality but + * utilize non-intersected resources (ESCR/CCCR/counter registers). + * + * This allow us to relax restrictions a bit and run two or more + * identical events together. + * + * Never set any custom internal bits such as P4_CONFIG_HT, + * P4_CONFIG_ALIASABLE or bits for P4_PEBS_METRIC, they are + * either up to date automatically or not applicable at all. + */ +struct p4_event_alias { + u64 original; + u64 alternative; +} p4_event_aliases[] = { + { + /* + * Non-halted cycles can be substituted with non-sleeping cycles (see + * Intel SDM Vol3b for details). We need this alias to be able + * to run nmi-watchdog and 'perf top' (or any other user space tool + * which is interested in running PERF_COUNT_HW_CPU_CYCLES) + * simultaneously. + */ + .original = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), + .alternative = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_EXECUTION_EVENT) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3)| + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) | + P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3))| + p4_config_pack_cccr(P4_CCCR_THRESHOLD(15) | P4_CCCR_COMPLEMENT | + P4_CCCR_COMPARE), + }, +}; + +static u64 p4_get_alias_event(u64 config) +{ + u64 config_match; + int i; + + /* + * Only event with special mark is allowed, + * we're to be sure it didn't come as malformed + * RAW event. + */ + if (!(config & P4_CONFIG_ALIASABLE)) + return 0; + + config_match = config & P4_CONFIG_EVENT_ALIAS_MASK; + + for (i = 0; i < ARRAY_SIZE(p4_event_aliases); i++) { + if (config_match == p4_event_aliases[i].original) { + config_match = p4_event_aliases[i].alternative; + break; + } else if (config_match == p4_event_aliases[i].alternative) { + config_match = p4_event_aliases[i].original; + break; + } + } + + if (i >= ARRAY_SIZE(p4_event_aliases)) + return 0; + + return config_match | (config & P4_CONFIG_EVENT_ALIAS_IMMUTABLE_BITS); +} + +static u64 p4_general_events[PERF_COUNT_HW_MAX] = { + /* non-halted CPU clocks */ + [PERF_COUNT_HW_CPU_CYCLES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)) | + P4_CONFIG_ALIASABLE, + + /* + * retired instructions + * in a sake of simplicity we don't use the FSB tagging + */ + [PERF_COUNT_HW_INSTRUCTIONS] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)), + + /* cache hits */ + [PERF_COUNT_HW_CACHE_REFERENCES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)), + + /* cache misses */ + [PERF_COUNT_HW_CACHE_MISSES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)), + + /* branch instructions retired */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)), + + /* mispredicted branches retired */ + [PERF_COUNT_HW_BRANCH_MISSES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) | + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)), + + /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */ + [PERF_COUNT_HW_BUS_CYCLES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) | + p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE), +}; + +static struct p4_event_bind *p4_config_get_bind(u64 config) +{ + unsigned int evnt = p4_config_unpack_event(config); + struct p4_event_bind *bind = NULL; + + if (evnt < ARRAY_SIZE(p4_event_bind_map)) + bind = &p4_event_bind_map[evnt]; + + return bind; +} + +static u64 p4_pmu_event_map(int hw_event) +{ + struct p4_event_bind *bind; + unsigned int esel; + u64 config; + + config = p4_general_events[hw_event]; + bind = p4_config_get_bind(config); + esel = P4_OPCODE_ESEL(bind->opcode); + config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); + + return config; +} + +/* check cpu model specifics */ +static bool p4_event_match_cpu_model(unsigned int event_idx) +{ + /* INSTR_COMPLETED event only exist for model 3, 4, 6 (Prescott) */ + if (event_idx == P4_EVENT_INSTR_COMPLETED) { + if (boot_cpu_data.x86_model != 3 && + boot_cpu_data.x86_model != 4 && + boot_cpu_data.x86_model != 6) + return false; + } + + /* + * For info + * - IQ_ESCR0, IQ_ESCR1 only for models 1 and 2 + */ + + return true; +} + +static int p4_validate_raw_event(struct perf_event *event) +{ + unsigned int v, emask; + + /* User data may have out-of-bound event index */ + v = p4_config_unpack_event(event->attr.config); + if (v >= ARRAY_SIZE(p4_event_bind_map)) + return -EINVAL; + + /* It may be unsupported: */ + if (!p4_event_match_cpu_model(v)) + return -EINVAL; + + /* + * NOTE: P4_CCCR_THREAD_ANY has not the same meaning as + * in Architectural Performance Monitoring, it means not + * on _which_ logical cpu to count but rather _when_, ie it + * depends on logical cpu state -- count event if one cpu active, + * none, both or any, so we just allow user to pass any value + * desired. + * + * In turn we always set Tx_OS/Tx_USR bits bound to logical + * cpu without their propagation to another cpu + */ + + /* + * if an event is shared across the logical threads + * the user needs special permissions to be able to use it + */ + if (p4_ht_active() && p4_event_bind_map[v].shared) { + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + /* ESCR EventMask bits may be invalid */ + emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK; + if (emask & ~p4_event_bind_map[v].escr_emask) + return -EINVAL; + + /* + * it may have some invalid PEBS bits + */ + if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) + return -EINVAL; + + v = p4_config_unpack_metric(event->attr.config); + if (v >= ARRAY_SIZE(p4_pebs_bind_map)) + return -EINVAL; + + return 0; +} + +static int p4_hw_config(struct perf_event *event) +{ + int cpu = get_cpu(); + int rc = 0; + u32 escr, cccr; + + /* + * the reason we use cpu that early is that: if we get scheduled + * first time on the same cpu -- we will not need swap thread + * specific flags in config (and will save some cpu cycles) + */ + + cccr = p4_default_cccr_conf(cpu); + escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel, + event->attr.exclude_user); + event->hw.config = p4_config_pack_escr(escr) | + p4_config_pack_cccr(cccr); + + if (p4_ht_active() && p4_ht_thread(cpu)) + event->hw.config = p4_set_ht_bit(event->hw.config); + + if (event->attr.type == PERF_TYPE_RAW) { + struct p4_event_bind *bind; + unsigned int esel; + /* + * Clear bits we reserve to be managed by kernel itself + * and never allowed from a user space + */ + event->attr.config &= P4_CONFIG_MASK; + + rc = p4_validate_raw_event(event); + if (rc) + goto out; + + /* + * Note that for RAW events we allow user to use P4_CCCR_RESERVED + * bits since we keep additional info here (for cache events and etc) + */ + event->hw.config |= event->attr.config; + bind = p4_config_get_bind(event->attr.config); + if (!bind) { + rc = -EINVAL; + goto out; + } + esel = P4_OPCODE_ESEL(bind->opcode); + event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); + } + + rc = x86_setup_perfctr(event); +out: + put_cpu(); + return rc; +} + +static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) +{ + u64 v; + + /* an official way for overflow indication */ + rdmsrl(hwc->config_base, v); + if (v & P4_CCCR_OVF) { + wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF); + return 1; + } + + /* + * In some circumstances the overflow might issue an NMI but did + * not set P4_CCCR_OVF bit. Because a counter holds a negative value + * we simply check for high bit being set, if it's cleared it means + * the counter has reached zero value and continued counting before + * real NMI signal was received: + */ + rdmsrl(hwc->event_base, v); + if (!(v & ARCH_P4_UNFLAGGED_BIT)) + return 1; + + return 0; +} + +static void p4_pmu_disable_pebs(void) +{ + /* + * FIXME + * + * It's still allowed that two threads setup same cache + * events so we can't simply clear metrics until we knew + * no one is depending on us, so we need kind of counter + * for "ReplayEvent" users. + * + * What is more complex -- RAW events, if user (for some + * reason) will pass some cache event metric with improper + * event opcode -- it's fine from hardware point of view + * but completely nonsense from "meaning" of such action. + * + * So at moment let leave metrics turned on forever -- it's + * ok for now but need to be revisited! + * + * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); + * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); + */ +} + +static inline void p4_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * If event gets disabled while counter is in overflowed + * state we need to clear P4_CCCR_OVF, otherwise interrupt get + * asserted again and again + */ + (void)checking_wrmsrl(hwc->config_base, + (u64)(p4_config_unpack_cccr(hwc->config)) & + ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); +} + +static void p4_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + if (!test_bit(idx, cpuc->active_mask)) + continue; + p4_pmu_disable_event(event); + } + + p4_pmu_disable_pebs(); +} + +/* configuration must be valid */ +static void p4_pmu_enable_pebs(u64 config) +{ + struct p4_pebs_bind *bind; + unsigned int idx; + + BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK); + + idx = p4_config_unpack_metric(config); + if (idx == P4_PEBS_METRIC__none) + return; + + bind = &p4_pebs_bind_map[idx]; + + (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); + (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); +} + +static void p4_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int thread = p4_ht_config_thread(hwc->config); + u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); + unsigned int idx = p4_config_unpack_event(hwc->config); + struct p4_event_bind *bind; + u64 escr_addr, cccr; + + bind = &p4_event_bind_map[idx]; + escr_addr = (u64)bind->escr_msr[thread]; + + /* + * - we dont support cascaded counters yet + * - and counter 1 is broken (erratum) + */ + WARN_ON_ONCE(p4_is_event_cascaded(hwc->config)); + WARN_ON_ONCE(hwc->idx == 1); + + /* we need a real Event value */ + escr_conf &= ~P4_ESCR_EVENT_MASK; + escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode)); + + cccr = p4_config_unpack_cccr(hwc->config); + + /* + * it could be Cache event so we need to write metrics + * into additional MSRs + */ + p4_pmu_enable_pebs(hwc->config); + + (void)checking_wrmsrl(escr_addr, escr_conf); + (void)checking_wrmsrl(hwc->config_base, + (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); +} + +static void p4_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + if (!test_bit(idx, cpuc->active_mask)) + continue; + p4_pmu_enable_event(event); + } +} + +static int p4_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; + int idx, handled = 0; + u64 val; + + cpuc = &__get_cpu_var(cpu_hw_events); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + int overflow; + + if (!test_bit(idx, cpuc->active_mask)) { + /* catch in-flight IRQs */ + if (__test_and_clear_bit(idx, cpuc->running)) + handled++; + continue; + } + + event = cpuc->events[idx]; + hwc = &event->hw; + + WARN_ON_ONCE(hwc->idx != idx); + + /* it might be unflagged overflow */ + overflow = p4_pmu_clear_cccr_ovf(hwc); + + val = x86_perf_event_update(event); + if (!overflow && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) + continue; + + handled += overflow; + + /* event overflow for sure */ + perf_sample_data_init(&data, 0, hwc->last_period); + + if (!x86_perf_event_set_period(event)) + continue; + + + if (perf_event_overflow(event, &data, regs)) + x86_pmu_stop(event, 0); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + /* + * When dealing with the unmasking of the LVTPC on P4 perf hw, it has + * been observed that the OVF bit flag has to be cleared first _before_ + * the LVTPC can be unmasked. + * + * The reason is the NMI line will continue to be asserted while the OVF + * bit is set. This causes a second NMI to generate if the LVTPC is + * unmasked before the OVF bit is cleared, leading to unknown NMI + * messages. + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); + + return handled; +} + +/* + * swap thread specific fields according to a thread + * we are going to run on + */ +static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu) +{ + u32 escr, cccr; + + /* + * we either lucky and continue on same cpu or no HT support + */ + if (!p4_should_swap_ts(hwc->config, cpu)) + return; + + /* + * the event is migrated from an another logical + * cpu, so we need to swap thread specific flags + */ + + escr = p4_config_unpack_escr(hwc->config); + cccr = p4_config_unpack_cccr(hwc->config); + + if (p4_ht_thread(cpu)) { + cccr &= ~P4_CCCR_OVF_PMI_T0; + cccr |= P4_CCCR_OVF_PMI_T1; + if (escr & P4_ESCR_T0_OS) { + escr &= ~P4_ESCR_T0_OS; + escr |= P4_ESCR_T1_OS; + } + if (escr & P4_ESCR_T0_USR) { + escr &= ~P4_ESCR_T0_USR; + escr |= P4_ESCR_T1_USR; + } + hwc->config = p4_config_pack_escr(escr); + hwc->config |= p4_config_pack_cccr(cccr); + hwc->config |= P4_CONFIG_HT; + } else { + cccr &= ~P4_CCCR_OVF_PMI_T1; + cccr |= P4_CCCR_OVF_PMI_T0; + if (escr & P4_ESCR_T1_OS) { + escr &= ~P4_ESCR_T1_OS; + escr |= P4_ESCR_T0_OS; + } + if (escr & P4_ESCR_T1_USR) { + escr &= ~P4_ESCR_T1_USR; + escr |= P4_ESCR_T0_USR; + } + hwc->config = p4_config_pack_escr(escr); + hwc->config |= p4_config_pack_cccr(cccr); + hwc->config &= ~P4_CONFIG_HT; + } +} + +/* + * ESCR address hashing is tricky, ESCRs are not sequential + * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and + * the metric between any ESCRs is laid in range [0xa0,0xe1] + * + * so we make ~70% filled hashtable + */ + +#define P4_ESCR_MSR_BASE 0x000003a0 +#define P4_ESCR_MSR_MAX 0x000003e1 +#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1) +#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE) +#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr + +static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = { + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1), +}; + +static int p4_get_escr_idx(unsigned int addr) +{ + unsigned int idx = P4_ESCR_MSR_IDX(addr); + + if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE || + !p4_escr_table[idx] || + p4_escr_table[idx] != addr)) { + WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr); + return -1; + } + + return idx; +} + +static int p4_next_cntr(int thread, unsigned long *used_mask, + struct p4_event_bind *bind) +{ + int i, j; + + for (i = 0; i < P4_CNTR_LIMIT; i++) { + j = bind->cntr[thread][i]; + if (j != -1 && !test_bit(j, used_mask)) + return j; + } + + return -1; +} + +static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) +{ + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)]; + int cpu = smp_processor_id(); + struct hw_perf_event *hwc; + struct p4_event_bind *bind; + unsigned int i, thread, num; + int cntr_idx, escr_idx; + u64 config_alias; + int pass; + + bitmap_zero(used_mask, X86_PMC_IDX_MAX); + bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); + + for (i = 0, num = n; i < n; i++, num--) { + + hwc = &cpuc->event_list[i]->hw; + thread = p4_ht_thread(cpu); + pass = 0; + +again: + /* + * It's possible to hit a circular lock + * between original and alternative events + * if both are scheduled already. + */ + if (pass > 2) + goto done; + + bind = p4_config_get_bind(hwc->config); + escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); + if (unlikely(escr_idx == -1)) + goto done; + + if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) { + cntr_idx = hwc->idx; + if (assign) + assign[i] = hwc->idx; + goto reserve; + } + + cntr_idx = p4_next_cntr(thread, used_mask, bind); + if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) { + /* + * Check whether an event alias is still available. + */ + config_alias = p4_get_alias_event(hwc->config); + if (!config_alias) + goto done; + hwc->config = config_alias; + pass++; + goto again; + } + + p4_pmu_swap_config_ts(hwc, cpu); + if (assign) + assign[i] = cntr_idx; +reserve: + set_bit(cntr_idx, used_mask); + set_bit(escr_idx, escr_mask); + } + +done: + return num ? -ENOSPC : 0; +} + +PMU_FORMAT_ATTR(cccr, "config:0-31" ); +PMU_FORMAT_ATTR(escr, "config:32-62"); +PMU_FORMAT_ATTR(ht, "config:63" ); + +static struct attribute *intel_p4_formats_attr[] = { + &format_attr_cccr.attr, + &format_attr_escr.attr, + &format_attr_ht.attr, + NULL, +}; + +static __initconst const struct x86_pmu p4_pmu = { + .name = "Netburst P4/Xeon", + .handle_irq = p4_pmu_handle_irq, + .disable_all = p4_pmu_disable_all, + .enable_all = p4_pmu_enable_all, + .enable = p4_pmu_enable_event, + .disable = p4_pmu_disable_event, + .eventsel = MSR_P4_BPU_CCCR0, + .perfctr = MSR_P4_BPU_PERFCTR0, + .event_map = p4_pmu_event_map, + .max_events = ARRAY_SIZE(p4_general_events), + .get_event_constraints = x86_get_event_constraints, + /* + * IF HT disabled we may need to use all + * ARCH_P4_MAX_CCCR counters simulaneously + * though leave it restricted at moment assuming + * HT is on + */ + .num_counters = ARCH_P4_MAX_CCCR, + .apic = 1, + .cntval_bits = ARCH_P4_CNTRVAL_BITS, + .cntval_mask = ARCH_P4_CNTRVAL_MASK, + .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1, + .hw_config = p4_hw_config, + .schedule_events = p4_pmu_schedule_events, + /* + * This handles erratum N15 in intel doc 249199-029, + * the counter may not be updated correctly on write + * so we need a second write operation to do the trick + * (the official workaround didn't work) + * + * the former idea is taken from OProfile code + */ + .perfctr_second_write = 1, + + .format_attrs = intel_p4_formats_attr, +}; + +__init int p4_pmu_init(void) +{ + unsigned int low, high; + + /* If we get stripped -- indexing fails */ + BUILD_BUG_ON(ARCH_P4_MAX_CCCR > INTEL_PMC_MAX_GENERIC); + + rdmsr(MSR_IA32_MISC_ENABLE, low, high); + if (!(low & (1 << 7))) { + pr_cont("unsupported Netburst CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + memcpy(hw_cache_event_ids, p4_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Netburst events, "); + + x86_pmu = p4_pmu; + + return 0; +} diff -uprN linux-2.6.32/arch/x86/kernel/cpu/perf_event_p6.c linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_p6.c --- linux-2.6.32/arch/x86/kernel/cpu/perf_event_p6.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/perf_event_p6.c 2015-03-10 13:24:07.000000000 -0700 @@ -0,0 +1,164 @@ +#include +#include + +#include "perf_event.h" + +/* + * Not sure about some of these + */ +static const u64 p6_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, +}; + +static u64 p6_pmu_event_map(int hw_event) +{ + return p6_perfmon_event_map[hw_event]; +} + +/* + * Event setting that is specified not to count anything. + * We use this to effectively disable a counter. + * + * L2_RQSTS with 0 MESI unit mask. + */ +#define P6_NOP_EVENT 0x0000002EULL + +static struct event_constraint p6_event_constraints[] = +{ + INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ + INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ + INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */ + INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ + INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */ + INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */ + EVENT_CONSTRAINT_END +}; + +static void p6_pmu_disable_all(void) +{ + u64 val; + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static void p6_pmu_enable_all(int added) +{ + unsigned long val; + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static inline void +p6_pmu_disable_event(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + u64 val = P6_NOP_EVENT; + + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + + (void)checking_wrmsrl(hwc->config_base, val); +} + +static void p6_pmu_enable_event(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + u64 val; + + val = hwc->config; + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL_ENABLE; + + (void)checking_wrmsrl(hwc->config_base, val); +} + +PMU_FORMAT_ATTR(event, "config:0-7" ); +PMU_FORMAT_ATTR(umask, "config:8-15" ); +PMU_FORMAT_ATTR(edge, "config:18" ); +PMU_FORMAT_ATTR(pc, "config:19" ); +PMU_FORMAT_ATTR(inv, "config:23" ); +PMU_FORMAT_ATTR(cmask, "config:24-31" ); + +static struct attribute *intel_p6_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_pc.attr, + &format_attr_inv.attr, + &format_attr_cmask.attr, + NULL, +}; + +static __initconst const struct x86_pmu p6_pmu = { + .name = "p6", + .handle_irq = x86_pmu_handle_irq, + .disable_all = p6_pmu_disable_all, + .enable_all = p6_pmu_enable_all, + .enable = p6_pmu_enable_event, + .disable = p6_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, + .eventsel = MSR_P6_EVNTSEL0, + .perfctr = MSR_P6_PERFCTR0, + .event_map = p6_pmu_event_map, + .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 31) - 1, + .version = 0, + .num_counters = 2, + /* + * Events have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a event for P6-like PMU is 32 bits only. + * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ + .cntval_bits = 32, + .cntval_mask = (1ULL << 32) - 1, + .get_event_constraints = x86_get_event_constraints, + .event_constraints = p6_event_constraints, + + .format_attrs = intel_p6_formats_attr, + .events_sysfs_show = intel_event_sysfs_show, + +}; + +__init int p6_pmu_init(void) +{ + switch (boot_cpu_data.x86_model) { + case 1: + case 3: /* Pentium Pro */ + case 5: + case 6: /* Pentium II */ + case 7: + case 8: + case 11: /* Pentium III */ + case 9: + case 13: + /* Pentium M */ + break; + default: + pr_cont("unsupported p6 CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + x86_pmu = p6_pmu; + + return 0; +} diff -uprN linux-2.6.32/arch/x86/kernel/cpu/powerflags.c linux-2.6.32.ovz/arch/x86/kernel/cpu/powerflags.c --- linux-2.6.32/arch/x86/kernel/cpu/powerflags.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/powerflags.c 2015-03-10 13:22:55.000000000 -0700 @@ -16,5 +16,6 @@ const char *const x86_power_flags[32] = "100mhzsteps", "hwpstate", "", /* tsc invariant mapped to constant_tsc */ - /* nothing */ + "cpb", /* core performance boost */ + "eff_freq_ro", /* Readonly aperf/mperf */ }; diff -uprN linux-2.6.32/arch/x86/kernel/cpu/proc.c linux-2.6.32.ovz/arch/x86/kernel/cpu/proc.c --- linux-2.6.32/arch/x86/kernel/cpu/proc.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/proc.c 2015-06-23 04:16:16.000000000 -0700 @@ -3,6 +3,7 @@ #include #include #include +#include /* * Get CPU information for use by the procfs. @@ -11,15 +12,12 @@ static void show_cpuinfo_core(struct seq unsigned int cpu) { #ifdef CONFIG_SMP - if (c->x86_max_cores * smp_num_siblings > 1) { - seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); - seq_printf(m, "siblings\t: %d\n", - cpumask_weight(cpu_core_mask(cpu))); - seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); - seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); - seq_printf(m, "apicid\t\t: %d\n", c->apicid); - seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid); - } + seq_printf(m, "physical id\t: %d\n", c->phys_proc_id); + seq_printf(m, "siblings\t: %d\n", cpumask_weight(cpu_core_mask(cpu))); + seq_printf(m, "core id\t\t: %d\n", c->cpu_core_id); + seq_printf(m, "cpu cores\t: %d\n", c->booted_cores); + seq_printf(m, "apicid\t\t: %d\n", c->apicid); + seq_printf(m, "initial apicid\t: %d\n", c->initial_apicid); #endif } @@ -61,10 +59,30 @@ static void show_cpuinfo_misc(struct seq } #endif +extern void __do_cpuid_fault(unsigned int op, unsigned int count, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx); + +struct cpu_flags { + u32 val[RHNCAPINTS]; +}; + +static DEFINE_PER_CPU(struct cpu_flags, cpu_flags); + +static void init_cpu_flags(void *dummy) +{ + struct cpu_flags *flags; + + flags = &get_cpu_var(cpu_flags); + get_cpu_cap_masked(flags->val); + put_cpu_var(cpu_flags); +} + static int show_cpuinfo(struct seq_file *m, void *v) { struct cpuinfo_x86 *c = v; unsigned int cpu = 0; + int is_super = ve_is_super(get_exec_env()); int i; #ifdef CONFIG_SMP @@ -85,12 +103,15 @@ static int show_cpuinfo(struct seq_file seq_printf(m, "stepping\t: %d\n", c->x86_mask); else seq_printf(m, "stepping\t: unknown\n"); + if (c->microcode) + seq_printf(m, "microcode\t: %u\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { unsigned int freq = cpufreq_quick_get(cpu); if (!freq) freq = cpu_khz; + freq = sched_cpulimit_scale_cpufreq(freq); seq_printf(m, "cpu MHz\t\t: %u.%03u\n", freq / 1000, (freq % 1000)); } @@ -103,8 +124,11 @@ static int show_cpuinfo(struct seq_file show_cpuinfo_misc(m, c); seq_printf(m, "flags\t\t:"); - for (i = 0; i < 32*NCAPINTS; i++) - if (cpu_has(c, i) && x86_cap_flags[i] != NULL) + for (i = 0; i < 32*RHNCAPINTS; i++) + if (x86_cap_flags[i] != NULL && + ((is_super && cpu_has(c, i)) || + (!is_super && test_bit(i, (unsigned long *) + &per_cpu(cpu_flags, cpu))))) seq_printf(m, " %s", x86_cap_flags[i]); seq_printf(m, "\nbogomips\t: %lu.%02lu\n", @@ -138,21 +162,28 @@ static int show_cpuinfo(struct seq_file return 0; } -static void *c_start(struct seq_file *m, loff_t *pos) +static void *__c_start(struct seq_file *m, loff_t *pos) { if (*pos == 0) /* just in case, cpu 0 is not the first */ *pos = cpumask_first(cpu_online_mask); else *pos = cpumask_next(*pos - 1, cpu_online_mask); - if ((*pos) < nr_cpu_ids) + if (__cpus_weight(cpu_online_mask, *pos) < num_online_vcpus()) return &cpu_data(*pos); return NULL; } +static void *c_start(struct seq_file *m, loff_t *pos) +{ + init_cpu_flags(NULL); + smp_call_function(init_cpu_flags, NULL, 1); + return __c_start(m, pos); +} + static void *c_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; - return c_start(m, pos); + return __c_start(m, pos); } static void c_stop(struct seq_file *m, void *v) diff -uprN linux-2.6.32/arch/x86/kernel/cpu/rdrand.c linux-2.6.32.ovz/arch/x86/kernel/cpu/rdrand.c --- linux-2.6.32/arch/x86/kernel/cpu/rdrand.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/rdrand.c 2015-03-10 13:23:10.000000000 -0700 @@ -0,0 +1,73 @@ +/* + * This file is part of the Linux kernel. + * + * Copyright (c) 2011, Intel Corporation + * Authors: Fenghua Yu , + * H. Peter Anvin + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include +#include +#include + +static int __init x86_rdrand_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_RDRAND); + return 1; +} +__setup("nordrand", x86_rdrand_setup); + +/* We can't use arch_get_random_long() here since alternatives haven't run */ +static inline int rdrand_long(unsigned long *v) +{ + int ok; + asm volatile("1: " RDRAND_LONG "\n\t" + "jc 2f\n\t" + "decl %0\n\t" + "jnz 1b\n\t" + "2:" + : "=r" (ok), "=a" (*v) + : "0" (RDRAND_RETRY_LOOPS)); + return ok; +} + +/* + * Force a reseed cycle; we are architecturally guaranteed a reseed + * after no more than 512 128-bit chunks of random data. This also + * acts as a test of the CPU capability. + */ +#define RESEED_LOOP ((512*128)/sizeof(unsigned long)) + +void __cpuinit x86_init_rdrand(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_ARCH_RANDOM + unsigned long tmp; + int i, count, ok; + + if (!cpu_has(c, X86_FEATURE_RDRAND)) + return; /* Nothing to do */ + + for (count = i = 0; i < RESEED_LOOP; i++) { + ok = rdrand_long(&tmp); + if (ok) + count++; + } + + if (count != RESEED_LOOP) + clear_cpu_cap(c, X86_FEATURE_RDRAND); +#endif +} diff -uprN linux-2.6.32/arch/x86/kernel/cpu/sched.c linux-2.6.32.ovz/arch/x86/kernel/cpu/sched.c --- linux-2.6.32/arch/x86/kernel/cpu/sched.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/sched.c 2015-06-23 04:16:16.000000000 -0700 @@ -47,7 +47,7 @@ unsigned long arch_scale_smt_power(struc * aperf/mperf already includes the smt gain */ if (boot_cpu_has(X86_FEATURE_APERFMPERF)) - return SCHED_LOAD_SCALE; + return SCHED_POWER_SCALE; return default_scale_smt_power(sd, cpu); } diff -uprN linux-2.6.32/arch/x86/kernel/cpu/transmeta.c linux-2.6.32.ovz/arch/x86/kernel/cpu/transmeta.c --- linux-2.6.32/arch/x86/kernel/cpu/transmeta.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/transmeta.c 2015-06-23 04:16:16.000000000 -0700 @@ -1,6 +1,8 @@ #include #include #include +#include +#include #include #include #include "cpu.h" @@ -26,7 +28,7 @@ static void __cpuinit init_transmeta(str early_init_transmeta(c); - display_cacheinfo(c); + cpu_detect_cache_sizes(c); /* Print CMS and CPU revision */ max = cpuid_eax(0x80860000); diff -uprN linux-2.6.32/arch/x86/kernel/cpu/vmware.c linux-2.6.32.ovz/arch/x86/kernel/cpu/vmware.c --- linux-2.6.32/arch/x86/kernel/cpu/vmware.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpu/vmware.c 2015-03-10 13:23:54.000000000 -0700 @@ -22,9 +22,11 @@ */ #include +#include #include -#include #include +#include +#include #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -32,6 +34,9 @@ #define VMWARE_PORT_CMD_GETVERSION 10 #define VMWARE_PORT_CMD_GETHZ 45 +#define VMWARE_PORT_CMD_GETVCPU_INFO 68 +#define VMWARE_PORT_CMD_LEGACY_X2APIC 3 +#define VMWARE_PORT_CMD_VCPU_RESERVED 31 #define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \ __asm__("inl (%%dx)" : \ @@ -50,7 +55,7 @@ static inline int __vmware_platform(void static unsigned long vmware_get_tsc_khz(void) { - uint64_t tsc_hz; + uint64_t tsc_hz, lpj; uint32_t eax, ebx, ecx, edx; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); @@ -61,10 +66,17 @@ static unsigned long vmware_get_tsc_khz( printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", (unsigned long) tsc_hz / 1000, (unsigned long) tsc_hz % 1000); + + if (!preset_lpj) { + lpj = ((u64)tsc_hz * 1000); + do_div(lpj, HZ); + preset_lpj = lpj; + } + return tsc_hz; } -void __init vmware_platform_setup(void) +static void __init vmware_platform_setup(void) { uint32_t eax, ebx, ecx, edx; @@ -75,6 +87,8 @@ void __init vmware_platform_setup(void) else printk(KERN_WARNING "Failed to get TSC freq from the hypervisor\n"); + /* NMI watchdog uses perfctrs, but they are not present. */ + nmi_watchdog = NMI_NONE; } /* @@ -82,24 +96,21 @@ void __init vmware_platform_setup(void) * serial key should be enough, as this will always have a VMware * specific string when running under VMware hypervisor. */ -int vmware_platform(void) +static bool __init vmware_platform(void) { if (cpu_has_hypervisor) { - unsigned int eax, ebx, ecx, edx; - char hyper_vendor_id[13]; + unsigned int eax; + unsigned int hyper_vendor_id[3]; - cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx); - memcpy(hyper_vendor_id + 0, &ebx, 4); - memcpy(hyper_vendor_id + 4, &ecx, 4); - memcpy(hyper_vendor_id + 8, &edx, 4); - hyper_vendor_id[12] = '\0'; - if (!strcmp(hyper_vendor_id, "VMwareVMware")) - return 1; + cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0], + &hyper_vendor_id[1], &hyper_vendor_id[2]); + if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) + return true; } else if (dmi_available && dmi_name_in_serial("VMware") && __vmware_platform()) - return 1; + return true; - return 0; + return false; } /* @@ -114,8 +125,26 @@ int vmware_platform(void) * so that the kernel could just trust the hypervisor with providing a * reliable virtual TSC that is suitable for timekeeping. */ -void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c) +static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); } + +/* Checks if hypervisor supports x2apic without VT-D interrupt remapping. */ +static bool __init vmware_legacy_x2apic_available(void) +{ + uint32_t eax, ebx, ecx, edx; + VMWARE_PORT(GETVCPU_INFO, eax, ebx, ecx, edx); + return (eax & (1 << VMWARE_PORT_CMD_VCPU_RESERVED)) == 0 && + (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0; +} + +const __refconst struct hypervisor_x86 x86_hyper_vmware = { + .name = "VMware", + .detect = vmware_platform, + .set_cpu_features = vmware_set_cpu_features, + .init_platform = vmware_platform_setup, + .x2apic_available = vmware_legacy_x2apic_available, +}; +EXPORT_SYMBOL(x86_hyper_vmware); diff -uprN linux-2.6.32/arch/x86/kernel/cpuid.c linux-2.6.32.ovz/arch/x86/kernel/cpuid.c --- linux-2.6.32/arch/x86/kernel/cpuid.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpuid.c 2015-03-10 13:24:00.000000000 -0700 @@ -40,6 +40,7 @@ #include #include #include +#include #include #include @@ -143,13 +144,57 @@ static const struct file_operations cpui .open = cpuid_open, }; +static ssize_t print_cpu_modalias(struct device *dev, + struct device_attribute *attr, + char *bufptr) +{ + int size = PAGE_SIZE; + int i, n; + char *buf = bufptr; + + n = snprintf(buf, size, "x86cpu:vendor:%04X:family:" + "%04X:model:%04X:feature:", + boot_cpu_data.x86_vendor, + boot_cpu_data.x86, + boot_cpu_data.x86_model); + size -= n; + buf += n; + size -= 2; + for (i = 0; i < NCAPINTS*32; i++) { + if (boot_cpu_has(i)) { + n = snprintf(buf, size, ",%04X", i); + if (n < 0) { + WARN(1, "x86 features overflow page\n"); + break; + } + size -= n; + buf += n; + } + } + *buf++ = ','; + *buf++ = '\n'; + return buf - bufptr; +} + +static DEVICE_ATTR(modalias, 0444, print_cpu_modalias, NULL); + static __cpuinit int cpuid_device_create(int cpu) { struct device *dev; + int err; dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, "cpu%d", cpu); - return IS_ERR(dev) ? PTR_ERR(dev) : 0; + if (IS_ERR(dev)) + return PTR_ERR(dev); + + err = device_create_file(dev, &dev_attr_modalias); + if (err) { + /* keep device around on error. attribute is optional. */ + err = 0; + } + + return 0; } static void cpuid_device_destroy(int cpu) @@ -174,7 +219,7 @@ static int __cpuinit cpuid_class_cpu_cal cpuid_device_destroy(cpu); break; } - return err ? NOTIFY_BAD : NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block __refdata cpuid_class_cpu_notifier = @@ -187,12 +232,24 @@ static char *cpuid_devnode(struct device return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); } +static int cpuid_dev_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + char *buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (buf) { + print_cpu_modalias(NULL, NULL, buf); + add_uevent_var(env, "MODALIAS=%s", buf); + kfree(buf); + } + return 0; +} + static int __init cpuid_init(void) { int i, err = 0; i = 0; - if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) { + if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS, + "cpu/cpuid", &cpuid_fops)) { printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", CPUID_MAJOR); err = -EBUSY; @@ -204,6 +261,7 @@ static int __init cpuid_init(void) goto out_chrdev; } cpuid_class->devnode = cpuid_devnode; + cpuid_class->dev_uevent = cpuid_dev_uevent; for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) @@ -221,7 +279,7 @@ out_class: } class_destroy(cpuid_class); out_chrdev: - unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); out: return err; } @@ -233,7 +291,7 @@ static void __exit cpuid_exit(void) for_each_online_cpu(cpu) cpuid_device_destroy(cpu); class_destroy(cpuid_class); - unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); + __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); } diff -uprN linux-2.6.32/arch/x86/kernel/cpuid_fault.c linux-2.6.32.ovz/arch/x86/kernel/cpuid_fault.c --- linux-2.6.32/arch/x86/kernel/cpuid_fault.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/cpuid_fault.c 2015-06-23 04:16:16.000000000 -0700 @@ -0,0 +1,346 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct cpuid_override_entry { + unsigned int op; + unsigned int count; + bool has_count; + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; +}; + +#define MAX_CPUID_OVERRIDE_ENTRIES 16 + +struct cpuid_override_table { + struct rcu_head rcu_head; + int size; + struct cpuid_override_entry entries[MAX_CPUID_OVERRIDE_ENTRIES]; +}; + +static struct cpuid_override_table __rcu *cpuid_override __read_mostly; +static DEFINE_SPINLOCK(cpuid_override_lock); + +#define cpuid_override_active (!!rcu_access_pointer(cpuid_override)) + +void (*set_cpuid_faulting_cb)(bool enable); +static DEFINE_PER_CPU(bool, cpuid_faulting_enabled); + +void set_cpuid_faulting(bool enable) +{ + bool *enabled; + + if (!cpu_has_cpuid_faulting) + return; + if (!cpuid_override_active) + enable = false; + + enabled = &get_cpu_var(cpuid_faulting_enabled); + if (*enabled != enable) { + set_cpuid_faulting_cb(enable); + *enabled = enable; + } + put_cpu_var(cpuid_faulting_enabled); +} +EXPORT_SYMBOL(set_cpuid_faulting); + +static void cpuid_override_update(struct cpuid_override_table *new_table) +{ + struct cpuid_override_table *old_table; + + spin_lock(&cpuid_override_lock); + old_table = rcu_access_pointer(cpuid_override); + rcu_assign_pointer(cpuid_override, new_table); + spin_unlock(&cpuid_override_lock); + + if (old_table) + kfree_rcu(old_table, rcu_head); +} + +static bool cpuid_override_match(unsigned int op, unsigned int count, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + bool ret = false; + struct cpuid_override_table *t; + struct cpuid_override_entry *e; + int i; + + rcu_read_lock(); + t = rcu_dereference(cpuid_override); + if (!t) + goto out; + + for (i = 0; i < t->size; i++) { + e = &t->entries[i]; + if (e->op != op) + continue; + if (e->has_count && e->count != count) + continue; + *eax = e->eax; + *ebx = e->ebx; + *ecx = e->ecx; + *edx = e->edx; + ret = true; + break; + } +out: + rcu_read_unlock(); + return ret; +} + +static void __do_cpuid_fault(unsigned int op, unsigned int count, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* check if op is overridden */ + if (cpuid_override_match(op, count, eax, ebx, ecx, edx)) + return; + + /* fallback to real cpuid */ + cpuid_count(op, count, eax, ebx, ecx, edx); +} + +void do_cpuid_fault(struct pt_regs *regs) +{ + unsigned int eax, ebx, ecx, edx; + + __do_cpuid_fault(regs->ax, regs->cx, &eax, &ebx, &ecx, &edx); + + regs->ax = eax; + regs->bx = ebx; + regs->cx = ecx; + regs->dx = edx; +} + +void get_cpu_cap_masked(u32 *val) +{ + struct cpuinfo_x86 *c = ¤t_cpu_data; + struct cpuinfo_x86_rh *c_rh = ¤t_cpu_data_rh; + unsigned int eax, ebx, ecx, edx; + + memcpy(val, c->x86_capability, NCAPINTS * sizeof(u32)); + memcpy(val + NCAPINTS, c_rh->x86_capability, + RH_EXT_NCAPINTS * sizeof(u32)); + + /* + * Clear feature bits masked using cpuid masking/faulting. + */ + + if (c->cpuid_level >= 0x00000001) { + __do_cpuid_fault(0x00000001, 0, &eax, &ebx, &ecx, &edx); + val[4] &= ecx; + val[0] &= edx; + } + + if (c->cpuid_level >= 0x00000007) { + __do_cpuid_fault(0x00000007, 0, &eax, &ebx, &ecx, &edx); + val[9] &= ebx; + } + + if ((c->extended_cpuid_level & 0xffff0000) == 0x80000000 && + c->extended_cpuid_level >= 0x80000001) { + __do_cpuid_fault(0x80000001, 0, &eax, &ebx, &ecx, &edx); + val[6] &= ecx; + val[1] &= edx; + } + + if (c->cpuid_level >= 0x0000000d) { + __do_cpuid_fault(0x0000000d, 1, &eax, &ebx, &ecx, &edx); + val[10] &= eax; + } +} +EXPORT_SYMBOL(get_cpu_cap_masked); + +/* + * CPUID override entry format: + * + * op[ count]: eax ebx ecx edx + * + * All values are in HEX. + */ +static int cpuid_override_entry_parse(const char *s, char **endp, + struct cpuid_override_entry *e) +{ + int taken; + char *end; + + if (sscanf(s, "%x %x: %x %x %x %x%n", + &e->op, &e->count, &e->eax, &e->ebx, &e->ecx, &e->edx, + &taken) == 6) + e->has_count = true; + else if (sscanf(s, "%x: %x %x %x %x%n", + &e->op, &e->eax, &e->ebx, &e->ecx, &e->edx, + &taken) == 5) + e->has_count = false; + else + return -EINVAL; + + end = (char *)s + taken; + if (*end) { + if (*end != '\n') + return -EINVAL; + ++end; + } + *endp = end; + return 0; +} + +static ssize_t cpuid_override_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct cpuid_override_table *t = NULL; + void *page = NULL; + char *s; + int err; + + err = -E2BIG; + if (count >= PAGE_SIZE) + goto out; + + err = -ENOMEM; + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (!t) + goto out; + + page = (void *)__get_free_page(GFP_KERNEL); + if (!page) + goto out; + + err = copy_from_user(page, buf, count); + if (err) + goto out; + + s = page; + s[count] = '\0'; + t->size = 0; + while (*(s = skip_spaces(s))) { + err = -E2BIG; + if (t->size == MAX_CPUID_OVERRIDE_ENTRIES) + goto out; + err = -EINVAL; + if (cpuid_override_entry_parse(s, &s, &t->entries[t->size++])) + goto out; + } + if (!t->size) { + kfree(t); + t = NULL; + } + err = 0; +out: + free_page((unsigned long)page); + + if (!err) + cpuid_override_update(t); + else + kfree(t); + + return err ?: count; +} + +static void *__cpuid_override_seq_start(loff_t pos) +{ + struct cpuid_override_table *t = rcu_dereference(cpuid_override); + return t && pos < t->size ? &t->entries[pos] : NULL; +} + +static void *cpuid_override_seq_start(struct seq_file *seq, loff_t *ppos) +{ + rcu_read_lock(); + return __cpuid_override_seq_start(*ppos); +} + +static void *cpuid_override_seq_next(struct seq_file *seq, + void *v, loff_t *ppos) +{ + ++*ppos; + return __cpuid_override_seq_start(*ppos); +} + +static void cpuid_override_seq_stop(struct seq_file *s, void *v) +{ + rcu_read_unlock(); +} + +static int cpuid_override_seq_show(struct seq_file *s, void *v) +{ + struct cpuid_override_entry *e = v; + + seq_printf(s, "0x%08x", e->op); + if (e->has_count) + seq_printf(s, " 0x%08x", e->count); + seq_printf(s, ": 0x%08x 0x%08x 0x%08x 0x%08x\n", + e->eax, e->ebx, e->ecx, e->edx); + return 0; +} + +static struct seq_operations cpuid_override_seq_ops = { + .start = cpuid_override_seq_start, + .next = cpuid_override_seq_next, + .stop = cpuid_override_seq_stop, + .show = cpuid_override_seq_show, +}; + +static int cpuid_override_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &cpuid_override_seq_ops); +} + +static struct file_operations proc_cpuid_override_ops = { + .owner = THIS_MODULE, + .open = cpuid_override_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .write = cpuid_override_write, +}; + +static void disable_cpuid_faulting_fn(void *unused) +{ + set_cpuid_faulting(false); +} + +static int cpuid_faulting_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + if (code == SYS_RESTART) { + /* + * Disable cpuid faulting before loading a new kernel by kexec + * in case the new kernel does not support this feature. + */ + cpuid_override_update(NULL); + on_each_cpu(disable_cpuid_faulting_fn, NULL, 1); + } + return NOTIFY_DONE; +} + +static struct notifier_block cpuid_faulting_reboot_nb = { + .notifier_call = cpuid_faulting_reboot_notify, +}; + +static int __init cpuid_fault_init(void) +{ + struct proc_dir_entry *proc; + + if (!cpu_has_cpuid_faulting) + return 0; + + register_reboot_notifier(&cpuid_faulting_reboot_nb); + + proc = proc_create("cpuid_override", 0644, proc_vz_dir, + &proc_cpuid_override_ops); + if (!proc) + return -ENOMEM; + + return 0; +} +module_init(cpuid_fault_init); diff -uprN linux-2.6.32/arch/x86/kernel/crash.c linux-2.6.32.ovz/arch/x86/kernel/crash.c --- linux-2.6.32/arch/x86/kernel/crash.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/crash.c 2015-03-10 13:23:52.000000000 -0700 @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include #include @@ -30,6 +32,29 @@ #include +int in_crash_kexec; + +/* + * This is used to VMCLEAR all VMCSs loaded on the + * processor. And when loading kvm_intel module, the + * callback function pointer will be assigned. + * + * protected by rcu. + */ +crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; +EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); + +static inline void cpu_crash_vmclear_loaded_vmcss(void) +{ + crash_vmclear_fn *do_vmclear_operation = NULL; + + rcu_read_lock(); + do_vmclear_operation = rcu_dereference(crash_vmclear_loaded_vmcss); + if (do_vmclear_operation) + do_vmclear_operation(); + rcu_read_unlock(); +} + #if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) static void kdump_nmi_callback(int cpu, struct die_args *args) @@ -49,6 +74,11 @@ static void kdump_nmi_callback(int cpu, #endif crash_save_cpu(regs, cpu); + /* + * VMCLEAR VMCSs loaded on all cpus if needed. + */ + cpu_crash_vmclear_loaded_vmcss(); + /* Disable VMX or SVM if needed. * * We need to disable virtualization on all CPUs. @@ -63,6 +93,7 @@ static void kdump_nmi_callback(int cpu, static void kdump_nmi_shootdown_cpus(void) { + in_crash_kexec = 1; nmi_shootdown_cpus(kdump_nmi_callback); disable_local_APIC(); @@ -75,6 +106,7 @@ static void kdump_nmi_shootdown_cpus(voi } #endif +extern struct pci_dev *mcp55_rewrite; void native_machine_crash_shutdown(struct pt_regs *regs) { /* This function is only called after the system @@ -90,6 +122,11 @@ void native_machine_crash_shutdown(struc kdump_nmi_shootdown_cpus(); + /* + * VMCLEAR VMCSs loaded on this cpu if needed. + */ + cpu_crash_vmclear_loaded_vmcss(); + /* Booting kdump kernel with VMX or SVM enabled won't work, * because (among other limitations) we can't disable paging * with the virt flags. @@ -99,15 +136,31 @@ void native_machine_crash_shutdown(struc lapic_shutdown(); #if defined(CONFIG_X86_IO_APIC) - disable_IO_APIC(); + disable_IO_APIC(1); #endif + if (mcp55_rewrite) { + u32 cfg; + printk(KERN_CRIT "REWRITING MCP55 CFG REG\n"); + /* + * We have a mcp55 chip on board which has been + * flagged as only sending legacy interrupts + * to the BSP, and we are crashing on an AP + * This is obviously bad, and we need to + * fix it up. To do this we write to the + * flagged device, to the register at offset 0x74 + * and we make sure that bit 2 and bit 15 are clear + * This forces legacy interrupts to be broadcast + * to all cpus + */ + pci_read_config_dword(mcp55_rewrite, 0x74, &cfg); + cfg &= ~((1 << 2) | (1 << 15)); + printk(KERN_CRIT "CFG = %x\n", cfg); + pci_write_config_dword(mcp55_rewrite, 0x74, cfg); + } + #ifdef CONFIG_HPET_TIMER hpet_disable(); #endif -#ifdef CONFIG_X86_64 - pci_iommu_shutdown(); -#endif - crash_save_cpu(regs, safe_smp_processor_id()); } diff -uprN linux-2.6.32/arch/x86/kernel/crash_dump_64.c linux-2.6.32.ovz/arch/x86/kernel/crash_dump_64.c --- linux-2.6.32/arch/x86/kernel/crash_dump_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/crash_dump_64.c 2015-03-10 13:22:13.000000000 -0700 @@ -34,7 +34,7 @@ ssize_t copy_oldmem_page(unsigned long p if (!csize) return 0; - vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); + vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE); if (!vaddr) return -ENOMEM; @@ -46,6 +46,7 @@ ssize_t copy_oldmem_page(unsigned long p } else memcpy(buf, vaddr + offset, csize); + set_iounmap_nonlazy(); iounmap(vaddr); return csize; } diff -uprN linux-2.6.32/arch/x86/kernel/ds_selftest.c linux-2.6.32.ovz/arch/x86/kernel/ds_selftest.c --- linux-2.6.32/arch/x86/kernel/ds_selftest.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/ds_selftest.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,408 +0,0 @@ -/* - * Debug Store support - selftest - * - * - * Copyright (C) 2009 Intel Corporation. - * Markus Metzger , 2009 - */ - -#include "ds_selftest.h" - -#include -#include -#include -#include - -#include - - -#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */ -#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */ - -struct ds_selftest_bts_conf { - struct bts_tracer *tracer; - int error; - int (*suspend)(struct bts_tracer *); - int (*resume)(struct bts_tracer *); -}; - -static int ds_selftest_bts_consistency(const struct bts_trace *trace) -{ - int error = 0; - - if (!trace) { - printk(KERN_CONT "failed to access trace..."); - /* Bail out. Other tests are pointless. */ - return -1; - } - - if (!trace->read) { - printk(KERN_CONT "bts read not available..."); - error = -1; - } - - /* Do some sanity checks on the trace configuration. */ - if (!trace->ds.n) { - printk(KERN_CONT "empty bts buffer..."); - error = -1; - } - if (!trace->ds.size) { - printk(KERN_CONT "bad bts trace setup..."); - error = -1; - } - if (trace->ds.end != - (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) { - printk(KERN_CONT "bad bts buffer setup..."); - error = -1; - } - /* - * We allow top in [begin; end], since its not clear when the - * overflow adjustment happens: after the increment or before the - * write. - */ - if ((trace->ds.top < trace->ds.begin) || - (trace->ds.end < trace->ds.top)) { - printk(KERN_CONT "bts top out of bounds..."); - error = -1; - } - - return error; -} - -static int ds_selftest_bts_read(struct bts_tracer *tracer, - const struct bts_trace *trace, - const void *from, const void *to) -{ - const unsigned char *at; - - /* - * Check a few things which do not belong to this test. - * They should be covered by other tests. - */ - if (!trace) - return -1; - - if (!trace->read) - return -1; - - if (to < from) - return -1; - - if (from < trace->ds.begin) - return -1; - - if (trace->ds.end < to) - return -1; - - if (!trace->ds.size) - return -1; - - /* Now to the test itself. */ - for (at = from; (void *)at < to; at += trace->ds.size) { - struct bts_struct bts; - unsigned long index; - int error; - - if (((void *)at - trace->ds.begin) % trace->ds.size) { - printk(KERN_CONT - "read from non-integer index..."); - return -1; - } - index = ((void *)at - trace->ds.begin) / trace->ds.size; - - memset(&bts, 0, sizeof(bts)); - error = trace->read(tracer, at, &bts); - if (error < 0) { - printk(KERN_CONT - "error reading bts trace at [%lu] (0x%p)...", - index, at); - return error; - } - - switch (bts.qualifier) { - case BTS_BRANCH: - break; - default: - printk(KERN_CONT - "unexpected bts entry %llu at [%lu] (0x%p)...", - bts.qualifier, index, at); - return -1; - } - } - - return 0; -} - -static void ds_selftest_bts_cpu(void *arg) -{ - struct ds_selftest_bts_conf *conf = arg; - const struct bts_trace *trace; - void *top; - - if (IS_ERR(conf->tracer)) { - conf->error = PTR_ERR(conf->tracer); - conf->tracer = NULL; - - printk(KERN_CONT - "initialization failed (err: %d)...", conf->error); - return; - } - - /* We should meanwhile have enough trace. */ - conf->error = conf->suspend(conf->tracer); - if (conf->error < 0) - return; - - /* Let's see if we can access the trace. */ - trace = ds_read_bts(conf->tracer); - - conf->error = ds_selftest_bts_consistency(trace); - if (conf->error < 0) - return; - - /* If everything went well, we should have a few trace entries. */ - if (trace->ds.top == trace->ds.begin) { - /* - * It is possible but highly unlikely that we got a - * buffer overflow and end up at exactly the same - * position we started from. - * Let's issue a warning, but continue. - */ - printk(KERN_CONT "no trace/overflow..."); - } - - /* Let's try to read the trace we collected. */ - conf->error = - ds_selftest_bts_read(conf->tracer, trace, - trace->ds.begin, trace->ds.top); - if (conf->error < 0) - return; - - /* - * Let's read the trace again. - * Since we suspended tracing, we should get the same result. - */ - top = trace->ds.top; - - trace = ds_read_bts(conf->tracer); - conf->error = ds_selftest_bts_consistency(trace); - if (conf->error < 0) - return; - - if (top != trace->ds.top) { - printk(KERN_CONT "suspend not working..."); - conf->error = -1; - return; - } - - /* Let's collect some more trace - see if resume is working. */ - conf->error = conf->resume(conf->tracer); - if (conf->error < 0) - return; - - conf->error = conf->suspend(conf->tracer); - if (conf->error < 0) - return; - - trace = ds_read_bts(conf->tracer); - - conf->error = ds_selftest_bts_consistency(trace); - if (conf->error < 0) - return; - - if (trace->ds.top == top) { - /* - * It is possible but highly unlikely that we got a - * buffer overflow and end up at exactly the same - * position we started from. - * Let's issue a warning and check the full trace. - */ - printk(KERN_CONT - "no resume progress/overflow..."); - - conf->error = - ds_selftest_bts_read(conf->tracer, trace, - trace->ds.begin, trace->ds.end); - } else if (trace->ds.top < top) { - /* - * We had a buffer overflow - the entire buffer should - * contain trace records. - */ - conf->error = - ds_selftest_bts_read(conf->tracer, trace, - trace->ds.begin, trace->ds.end); - } else { - /* - * It is quite likely that the buffer did not overflow. - * Let's just check the delta trace. - */ - conf->error = - ds_selftest_bts_read(conf->tracer, trace, top, - trace->ds.top); - } - if (conf->error < 0) - return; - - conf->error = 0; -} - -static int ds_suspend_bts_wrap(struct bts_tracer *tracer) -{ - ds_suspend_bts(tracer); - return 0; -} - -static int ds_resume_bts_wrap(struct bts_tracer *tracer) -{ - ds_resume_bts(tracer); - return 0; -} - -static void ds_release_bts_noirq_wrap(void *tracer) -{ - (void)ds_release_bts_noirq(tracer); -} - -static int ds_selftest_bts_bad_release_noirq(int cpu, - struct bts_tracer *tracer) -{ - int error = -EPERM; - - /* Try to release the tracer on the wrong cpu. */ - get_cpu(); - if (cpu != smp_processor_id()) { - error = ds_release_bts_noirq(tracer); - if (error != -EPERM) - printk(KERN_CONT "release on wrong cpu..."); - } - put_cpu(); - - return error ? 0 : -1; -} - -static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer) -{ - struct bts_tracer *tracer; - int error; - - /* Try to request cpu tracing while task tracing is active. */ - tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL, - (size_t)-1, BTS_KERNEL); - error = PTR_ERR(tracer); - if (!IS_ERR(tracer)) { - ds_release_bts(tracer); - error = 0; - } - - if (error != -EPERM) - printk(KERN_CONT "cpu/task tracing overlap..."); - - return error ? 0 : -1; -} - -static int ds_selftest_bts_bad_request_task(void *buffer) -{ - struct bts_tracer *tracer; - int error; - - /* Try to request cpu tracing while task tracing is active. */ - tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL, - (size_t)-1, BTS_KERNEL); - error = PTR_ERR(tracer); - if (!IS_ERR(tracer)) { - error = 0; - ds_release_bts(tracer); - } - - if (error != -EPERM) - printk(KERN_CONT "task/cpu tracing overlap..."); - - return error ? 0 : -1; -} - -int ds_selftest_bts(void) -{ - struct ds_selftest_bts_conf conf; - unsigned char buffer[BUFFER_SIZE], *small_buffer; - unsigned long irq; - int cpu; - - printk(KERN_INFO "[ds] bts selftest..."); - conf.error = 0; - - small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8; - - get_online_cpus(); - for_each_online_cpu(cpu) { - conf.suspend = ds_suspend_bts_wrap; - conf.resume = ds_resume_bts_wrap; - conf.tracer = - ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - ds_selftest_bts_cpu(&conf); - if (conf.error >= 0) - conf.error = ds_selftest_bts_bad_request_task(buffer); - ds_release_bts(conf.tracer); - if (conf.error < 0) - goto out; - - conf.suspend = ds_suspend_bts_noirq; - conf.resume = ds_resume_bts_noirq; - conf.tracer = - ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1); - if (conf.error >= 0) { - conf.error = - ds_selftest_bts_bad_release_noirq(cpu, - conf.tracer); - /* We must not release the tracer twice. */ - if (conf.error < 0) - conf.tracer = NULL; - } - if (conf.error >= 0) - conf.error = ds_selftest_bts_bad_request_task(buffer); - smp_call_function_single(cpu, ds_release_bts_noirq_wrap, - conf.tracer, 1); - if (conf.error < 0) - goto out; - } - - conf.suspend = ds_suspend_bts_wrap; - conf.resume = ds_resume_bts_wrap; - conf.tracer = - ds_request_bts_task(current, buffer, BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - ds_selftest_bts_cpu(&conf); - if (conf.error >= 0) - conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); - ds_release_bts(conf.tracer); - if (conf.error < 0) - goto out; - - conf.suspend = ds_suspend_bts_noirq; - conf.resume = ds_resume_bts_noirq; - conf.tracer = - ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - local_irq_save(irq); - ds_selftest_bts_cpu(&conf); - if (conf.error >= 0) - conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); - ds_release_bts_noirq(conf.tracer); - local_irq_restore(irq); - if (conf.error < 0) - goto out; - - conf.error = 0; - out: - put_online_cpus(); - printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed")); - - return conf.error; -} - -int ds_selftest_pebs(void) -{ - return 0; -} diff -uprN linux-2.6.32/arch/x86/kernel/ds_selftest.h linux-2.6.32.ovz/arch/x86/kernel/ds_selftest.h --- linux-2.6.32/arch/x86/kernel/ds_selftest.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/ds_selftest.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,15 +0,0 @@ -/* - * Debug Store support - selftest - * - * - * Copyright (C) 2009 Intel Corporation. - * Markus Metzger , 2009 - */ - -#ifdef CONFIG_X86_DS_SELFTEST -extern int ds_selftest_bts(void); -extern int ds_selftest_pebs(void); -#else -static inline int ds_selftest_bts(void) { return 0; } -static inline int ds_selftest_pebs(void) { return 0; } -#endif diff -uprN linux-2.6.32/arch/x86/kernel/dumpstack_32.c linux-2.6.32.ovz/arch/x86/kernel/dumpstack_32.c --- linux-2.6.32/arch/x86/kernel/dumpstack_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/dumpstack_32.c 2015-06-23 04:16:16.000000000 -0700 @@ -10,55 +10,38 @@ #include #include #include +#include #include #include -#include #include -#include "dumpstack.h" - -/* Just a stub for now */ -int x86_is_stack_id(int id, char *name) -{ - return 0; -} -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, +void dump_trace(struct task_struct *task, + struct pt_regs *regs, unsigned long *stack, const struct stacktrace_ops *ops, void *data) { int graph = 0; + unsigned long bp; if (!task) task = current; if (!stack) { unsigned long dummy; + stack = &dummy; if (task && task != current) stack = (unsigned long *)task->thread.sp; } -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - get_bp(bp); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - + bp = stack_frame(task, regs); for (;;) { struct thread_info *context; context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, - data, NULL, &graph); + bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph); stack = (unsigned long *)context->previous_esp; if (!stack) @@ -72,7 +55,7 @@ EXPORT_SYMBOL(dump_trace); void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) + unsigned long *sp, char *log_lvl) { unsigned long *stack; int i; @@ -94,7 +77,7 @@ show_stack_log_lvl(struct task_struct *t touch_nmi_watchdog(); } printk("\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + show_trace_log_lvl(task, regs, sp, log_lvl); } @@ -105,8 +88,9 @@ void show_registers(struct pt_regs *regs print_modules(); __show_regs(regs, 0); - printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", + printk(KERN_EMERG "Process %.*s (pid: %d, veid: %u, ti=%p task=%p task.ti=%p)\n", TASK_COMM_LEN, current->comm, task_pid_nr(current), + task_veid(current), current_thread_info(), current, task_thread_info(current)); /* * When in-kernel, we also print out the stack and code at the @@ -119,8 +103,7 @@ void show_registers(struct pt_regs *regs u8 *ip; printk(KERN_EMERG "Stack:\n"); - show_stack_log_lvl(NULL, regs, ®s->sp, - 0, KERN_EMERG); + show_stack_log_lvl(NULL, regs, ®s->sp, KERN_EMERG); printk(KERN_EMERG "Code: "); @@ -156,4 +139,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } - diff -uprN linux-2.6.32/arch/x86/kernel/dumpstack_64.c linux-2.6.32.ovz/arch/x86/kernel/dumpstack_64.c --- linux-2.6.32/arch/x86/kernel/dumpstack_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/dumpstack_64.c 2015-06-23 04:16:16.000000000 -0700 @@ -10,34 +10,29 @@ #include #include #include +#include #include #include -#include #include -#include "dumpstack.h" +#define N_EXCEPTION_STACKS_END \ + (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) static char x86_stack_ids[][8] = { - [DEBUG_STACK - 1] = "#DB", - [NMI_STACK - 1] = "NMI", - [DOUBLEFAULT_STACK - 1] = "#DF", - [STACKFAULT_STACK - 1] = "#SS", - [MCE_STACK - 1] = "#MC", + [ DEBUG_STACK-1 ] = "#DB", + [ NMI_STACK-1 ] = "NMI", + [ DOUBLEFAULT_STACK-1 ] = "#DF", + [ MCE_STACK-1 ] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ - [N_EXCEPTION_STACKS ... - N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" + [ N_EXCEPTION_STACKS ... + N_EXCEPTION_STACKS_END ] = "#DB[?]" #endif - }; - -int x86_is_stack_id(int id, char *name) -{ - return x86_stack_ids[id - 1] == name; -} +}; static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) + unsigned *usedp, char **idp) { unsigned k; @@ -101,6 +96,41 @@ static unsigned long *in_exception_stack return NULL; } +static inline int +in_irq_stack(unsigned long *stack, unsigned long *irq_stack, + unsigned long *irq_stack_end) +{ + return (stack >= irq_stack && stack < irq_stack_end); +} + +/* + * We are returning from the irq stack and go to the previous one. + * If the previous stack is also in the irq stack, then bp in the first + * frame of the irq stack points to the previous, interrupted one. + * Otherwise we have another level of indirection: We first save + * the bp of the previous stack, then we switch the stack to the irq one + * and save a new bp that links to the previous one. + * (See save_args()) + */ +static inline unsigned long +fixup_bp_irq_link(unsigned long bp, unsigned long *stack, + unsigned long *irq_stack, unsigned long *irq_stack_end) +{ +#ifdef CONFIG_FRAME_POINTER + struct stack_frame *frame = (struct stack_frame *)bp; + unsigned long next; + + if (!in_irq_stack(stack, irq_stack, irq_stack_end)) { + if (!probe_kernel_address(&frame->next_frame, next)) + return next; + else + WARN_ONCE(1, "Perf: bad frame pointer = %p in " + "callchain\n", &frame->next_frame); + } +#endif + return bp; +} + /* * x86-64 can have up to three kernel stacks: * process stack @@ -109,7 +139,7 @@ static unsigned long *in_exception_stack */ void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, + unsigned long *stack, const struct stacktrace_ops *ops, void *data) { const unsigned cpu = get_cpu(); @@ -118,6 +148,7 @@ void dump_trace(struct task_struct *task unsigned used = 0; struct thread_info *tinfo; int graph = 0; + unsigned long bp; if (!task) task = current; @@ -129,18 +160,7 @@ void dump_trace(struct task_struct *task stack = (unsigned long *)task->thread.sp; } -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - get_bp(bp); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - + bp = stack_frame(task, regs); /* * Print function call entries in all stacks, starting at the * current stack address. If the stacks consist of nested @@ -157,8 +177,8 @@ void dump_trace(struct task_struct *task if (ops->stack(data, id) < 0) break; - bp = print_context_stack(tinfo, stack, bp, ops, - data, estack_end, &graph); + bp = ops->walk_stack(tinfo, stack, bp, ops, + data, estack_end, &graph); ops->stack(data, ""); /* * We link to the next stack via the @@ -173,10 +193,10 @@ void dump_trace(struct task_struct *task irq_stack = irq_stack_end - (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); - if (stack >= irq_stack && stack < irq_stack_end) { + if (in_irq_stack(stack, irq_stack, irq_stack_end)) { if (ops->stack(data, "IRQ") < 0) break; - bp = print_context_stack(tinfo, stack, bp, + bp = ops->walk_stack(tinfo, stack, bp, ops, data, irq_stack_end, &graph); /* * We link to the next stack (which would be @@ -184,6 +204,8 @@ void dump_trace(struct task_struct *task * pointer (index -1 to end) in the IRQ stack: */ stack = (unsigned long *) (irq_stack_end[-1]); + bp = fixup_bp_irq_link(bp, stack, irq_stack, + irq_stack_end); irq_stack_end = NULL; ops->stack(data, "EOI"); continue; @@ -195,28 +217,31 @@ void dump_trace(struct task_struct *task /* * This handles the process stack: */ - bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph); + bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph); put_cpu(); } EXPORT_SYMBOL(dump_trace); void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) + unsigned long *sp, char *log_lvl) { + unsigned long *irq_stack_end; + unsigned long *irq_stack; unsigned long *stack; + int cpu; int i; - const int cpu = smp_processor_id(); - unsigned long *irq_stack_end = - (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); - unsigned long *irq_stack = - (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); + + preempt_disable(); + cpu = smp_processor_id(); + + irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); + irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); /* - * debugging aid: "show_stack(NULL, NULL);" prints the - * back trace for this cpu. + * Debugging aid: "show_stack(NULL, NULL);" prints the + * back trace for this cpu: */ - if (sp == NULL) { if (task) sp = (unsigned long *)task->thread.sp; @@ -240,8 +265,10 @@ show_stack_log_lvl(struct task_struct *t printk(" %016lx", *stack++); touch_nmi_watchdog(); } + preempt_enable(); + printk("\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + show_trace_log_lvl(task, regs, sp, log_lvl); } void show_registers(struct pt_regs *regs) @@ -253,9 +280,11 @@ void show_registers(struct pt_regs *regs sp = regs->sp; printk("CPU %d ", cpu); + print_modules(); __show_regs(regs, 1); - printk("Process %s (pid: %d, threadinfo %p, task %p)\n", - cur->comm, cur->pid, task_thread_info(cur), cur); + printk("Process %s (pid: %d, veid: %d, threadinfo %p, task %p)\n", + cur->comm, cur->pid, task_veid(cur), + task_thread_info(cur), cur); /* * When in-kernel, we also print out the stack and code at the @@ -267,11 +296,11 @@ void show_registers(struct pt_regs *regs unsigned char c; u8 *ip; - printk(KERN_EMERG "Stack:\n"); + printk(KERN_DEFAULT "Stack:\n"); show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - regs->bp, KERN_EMERG); + KERN_DEFAULT); - printk(KERN_EMERG "Code: "); + printk(KERN_DEFAULT "Code: "); ip = (u8 *)regs->ip - code_prologue; if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { @@ -303,4 +332,3 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } - diff -uprN linux-2.6.32/arch/x86/kernel/dumpstack.c linux-2.6.32.ovz/arch/x86/kernel/dumpstack.c --- linux-2.6.32/arch/x86/kernel/dumpstack.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/dumpstack.c 2015-06-23 04:16:16.000000000 -0700 @@ -18,7 +18,6 @@ #include -#include "dumpstack.h" int panic_on_unrecovered_nmi; int panic_on_io_nmi; @@ -38,13 +37,16 @@ print_ftrace_graph_addr(unsigned long ad const struct stacktrace_ops *ops, struct thread_info *tinfo, int *graph) { - struct task_struct *task = tinfo->task; + struct task_struct *task; unsigned long ret_addr; - int index = task->curr_ret_stack; + int index; if (addr != (unsigned long)return_to_handler) return; + task = tinfo->task; + index = task->curr_ret_stack; + if (!task->ret_stack || index < *graph) return; @@ -109,6 +111,32 @@ print_context_stack(struct thread_info * } return bp; } +EXPORT_SYMBOL_GPL(print_context_stack); + +unsigned long +print_context_stack_bp(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end, int *graph) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + unsigned long *ret_addr = &frame->return_address; + + while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) { + unsigned long addr = *ret_addr; + + if (!__kernel_text_address(addr)) + break; + + ops->address(data, addr, 1); + frame = frame->next_frame; + ret_addr = &frame->return_address; + print_ftrace_graph_addr(addr, data, ops, tinfo, graph); + } + + return (unsigned long)frame; +} +EXPORT_SYMBOL_GPL(print_context_stack_bp); static void @@ -141,29 +169,30 @@ static void print_trace_address(void *da } static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, + .walk_stack = print_context_stack, }; void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) + unsigned long *stack, char *log_lvl) { printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); + dump_trace(task, regs, stack, &print_trace_ops, log_lvl); } void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) + unsigned long *stack) { - show_trace_log_lvl(task, regs, stack, bp, ""); + show_trace_log_lvl(task, regs, stack, ""); } void show_stack(struct task_struct *task, unsigned long *sp) { - show_stack_log_lvl(task, NULL, sp, 0, ""); + show_stack_log_lvl(task, NULL, sp, ""); } /* @@ -171,20 +200,15 @@ void show_stack(struct task_struct *task */ void dump_stack(void) { - unsigned long bp = 0; unsigned long stack; -#ifdef CONFIG_FRAME_POINTER - if (!bp) - get_bp(bp); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), + printk("Pid: %d, comm: %.20s veid: %u %s %s %.*s\n", + current->pid, current->comm, + task_veid(current), print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); + show_trace(NULL, NULL, &stack); } EXPORT_SYMBOL(dump_stack); @@ -197,11 +221,6 @@ unsigned __kprobes long oops_begin(void) int cpu; unsigned long flags; - /* notify the hw-branch tracer so it may disable tracing and - add the last trace to the trace buffer - - the earlier this happens, the more useful the trace. */ - trace_hw_branch_oops(); - oops_enter(); /* racy, but better than risking deadlock. */ @@ -219,6 +238,7 @@ unsigned __kprobes long oops_begin(void) bust_spinlocks(1); return flags; } +EXPORT_SYMBOL_GPL(oops_begin); void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) { @@ -250,7 +270,8 @@ int __kprobes __die(const char *str, str unsigned short ss; unsigned long sp; #endif - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); + printk(KERN_DEFAULT + "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT printk("PREEMPT "); #endif @@ -268,11 +289,12 @@ int __kprobes __die(const char *str, str show_registers(regs); #ifdef CONFIG_X86_32 - sp = (unsigned long) (®s->sp); - savesegment(ss, ss); - if (user_mode(regs)) { + if (user_mode_vm(regs)) { sp = regs->sp; ss = regs->ss & 0xffff; + } else { + sp = kernel_stack_pointer(regs); + savesegment(ss, ss); } printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); print_symbol("%s", regs->ip); @@ -320,6 +342,7 @@ die_nmi(char *str, struct pt_regs *regs, printk(" on CPU%d, ip %08lx, registers:\n", smp_processor_id(), regs->ip); show_registers(regs); + nmi_show_regs(regs, 1); oops_end(flags, regs, 0); if (do_panic || panic_on_oops) panic("Non maskable interrupt"); diff -uprN linux-2.6.32/arch/x86/kernel/dumpstack.h linux-2.6.32.ovz/arch/x86/kernel/dumpstack.h --- linux-2.6.32/arch/x86/kernel/dumpstack.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/dumpstack.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,38 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - */ - -#ifndef DUMPSTACK_H -#define DUMPSTACK_H - -#ifdef CONFIG_X86_32 -#define STACKSLOTS_PER_LINE 8 -#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) -#else -#define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) -#endif - -extern unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph); - -extern void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl); - -extern void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl); - -extern unsigned int code_bytes; - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; -#endif diff -uprN linux-2.6.32/arch/x86/kernel/e820.c linux-2.6.32.ovz/arch/x86/kernel/e820.c --- linux-2.6.32/arch/x86/kernel/e820.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/e820.c 2015-03-10 13:22:59.000000000 -0700 @@ -33,9 +33,9 @@ * and that is also registered with modifications in the kernel resource tree * with the iomem_resource as parent. * - * The e820_saved is directly saved after the BIOS-provided memory map is - * copied. It doesn't get modified afterwards. It's registered for the - * /sys/firmware/memmap interface. + * The e820_saved is saved after the BIOS-provided memory map is copied as + * well as the optional add_efi_memmap entries are processed. It doesn't get + * modified afterwards. It's registered for the /sys/firmware/memmap interface. * * That memory map is not modified and is used as base for kexec. The kexec'd * kernel should get the same memory map as the firmware provides. Then the @@ -131,6 +131,11 @@ void __init e820_add_region(u64 start, u __e820_add_region(&e820, start, size, type); } +void __init e820_saved_add_region(u64 start, u64 size, int type) +{ + __e820_add_region(&e820_saved, start, size, type); +} + static void __init e820_print_type(u32 type) { switch (type) { @@ -517,11 +522,19 @@ u64 __init e820_remove_range(u64 start, int checktype) { int i; + u64 end; u64 real_removed_size = 0; if (size > (ULLONG_MAX - start)) size = ULLONG_MAX - start; + end = start + size; + printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ", + (unsigned long long) start, + (unsigned long long) end); + e820_print_type(old_type); + printk(KERN_CONT "\n"); + for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; u64 final_start, final_end; @@ -1097,7 +1110,7 @@ u64 __init early_reserve_e820(u64 startt #ifdef CONFIG_X86_32 # ifdef CONFIG_X86_PAE -# define MAX_ARCH_PFN (1ULL<<(36-PAGE_SHIFT)) +# define MAX_ARCH_PFN (1ULL<<(34-PAGE_SHIFT)) # else # define MAX_ARCH_PFN (1ULL<<(32-PAGE_SHIFT)) # endif @@ -1350,7 +1363,9 @@ void __init e820_reserve_resources(void) * pci device BAR resource and insert them later in * pcibios_resource_survey() */ - if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) { + if ((e820.map[i].type != E820_RESERVED + && e820.map[i].type != E820_NVS) || + res->start < (1ULL<<20)) { res->flags |= IORESOURCE_BUSY; insert_resource(&iomem_resource, res); } diff -uprN linux-2.6.32/arch/x86/kernel/early_printk.c linux-2.6.32.ovz/arch/x86/kernel/early_printk.c --- linux-2.6.32/arch/x86/kernel/early_printk.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/early_printk.c 2015-03-10 13:24:10.000000000 -0700 @@ -16,6 +16,8 @@ #include #include #include +#include +#include /* Simple VGA output */ #define VGABASE (__ISA_IO_base + 0xb8000) @@ -231,6 +233,11 @@ static int __init setup_early_printk(cha if (!strncmp(buf, "xen", 3)) early_console_register(&xenboot_console, keep); #endif +#ifdef CONFIG_EARLY_PRINTK_EFI + if (!strncmp(buf, "efi", 3)) + early_console_register(&early_efi_console, keep); +#endif + buf++; } return 0; diff -uprN linux-2.6.32/arch/x86/kernel/efi_32.c linux-2.6.32.ovz/arch/x86/kernel/efi_32.c --- linux-2.6.32/arch/x86/kernel/efi_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/efi_32.c 2015-03-10 13:21:44.000000000 -0700 @@ -110,3 +110,7 @@ void efi_call_phys_epilog(void) local_irq_restore(efi_rt_eflags); } + +void efi_call_phys_prelog_in_physmode(void) { /* Not supported */ } +void efi_call_phys_epilog_in_physmode(void) { /* Not supported */ } + diff -uprN linux-2.6.32/arch/x86/kernel/efi_64.c linux-2.6.32.ovz/arch/x86/kernel/efi_64.c --- linux-2.6.32/arch/x86/kernel/efi_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/efi_64.c 2015-03-10 13:24:11.000000000 -0700 @@ -24,9 +24,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -38,25 +40,12 @@ #include #include -static pgd_t save_pgd __initdata; -static unsigned long efi_flags __initdata; +static pgd_t *save_pgd __initdata; +static DEFINE_PER_CPU(unsigned long, efi_flags); +static DEFINE_PER_CPU(unsigned long, save_cr3); +static pgd_t efi_pgd[PTRS_PER_PGD] __page_aligned_bss; -static void __init early_mapping_set_exec(unsigned long start, - unsigned long end, - int executable) -{ - unsigned long num_pages; - - start &= PMD_MASK; - end = (end + PMD_SIZE - 1) & PMD_MASK; - num_pages = (end - start) >> PAGE_SHIFT; - if (executable) - set_memory_x((unsigned long)__va(start), num_pages); - else - set_memory_nx((unsigned long)__va(start), num_pages); -} - -static void __init early_runtime_code_mapping_set_exec(int executable) +static void __init early_code_mapping_set_exec(int executable) { efi_memory_desc_t *md; void *p; @@ -67,10 +56,9 @@ static void __init early_runtime_code_ma /* Make EFI runtime service code area executable */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; - if (md->type == EFI_RUNTIME_SERVICES_CODE) { - unsigned long end; - end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); - early_mapping_set_exec(md->phys_addr, end, executable); + if (md->type == EFI_RUNTIME_SERVICES_CODE || + md->type == EFI_BOOT_SERVICES_CODE) { + efi_set_executable(md, executable); } } } @@ -78,12 +66,20 @@ static void __init early_runtime_code_ma void __init efi_call_phys_prelog(void) { unsigned long vaddress; + int pgd; + int n_pgds; - early_runtime_code_mapping_set_exec(1); - local_irq_save(efi_flags); - vaddress = (unsigned long)__va(0x0UL); - save_pgd = *pgd_offset_k(0x0UL); - set_pgd(pgd_offset_k(0x0UL), *pgd_offset_k(vaddress)); + early_code_mapping_set_exec(1); + local_irq_save(get_cpu_var(efi_flags)); + + n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE); + save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL); + + for (pgd = 0; pgd < n_pgds; pgd++) { + save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE); + vaddress = (unsigned long)__va(pgd * PGDIR_SIZE); + set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress)); + } __flush_tlb_all(); } @@ -92,10 +88,27 @@ void __init efi_call_phys_epilog(void) /* * After the lock is released, the original page table is restored. */ - set_pgd(pgd_offset_k(0x0UL), save_pgd); + int pgd; + int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE); + for (pgd = 0; pgd < n_pgds; pgd++) + set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]); + kfree(save_pgd); __flush_tlb_all(); - local_irq_restore(efi_flags); - early_runtime_code_mapping_set_exec(0); + local_irq_restore(get_cpu_var(efi_flags)); + early_code_mapping_set_exec(0); +} + +void efi_call_phys_prelog_in_physmode(void) +{ + local_irq_save(get_cpu_var(efi_flags)); + get_cpu_var(save_cr3)= read_cr3(); + write_cr3(virt_to_phys(efi_pgd)); +} + +void efi_call_phys_epilog_in_physmode(void) +{ + write_cr3(get_cpu_var(save_cr3)); + local_irq_restore(get_cpu_var(efi_flags)); } void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, @@ -107,8 +120,149 @@ void __iomem *__init efi_ioremap(unsigne return ioremap(phys_addr, size); last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); - if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) - return NULL; + if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) { + unsigned long top = last_map_pfn << PAGE_SHIFT; + efi_ioremap(top, size - (top - phys_addr), type); + } return (void __iomem *)__va(phys_addr); } + +static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) +{ + if (pgd_none(*pgd)) { + pud_t *pud = (pud_t *)get_zeroed_page(GFP_ATOMIC); + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); + if (pud != pud_offset(pgd, 0)) + printk(KERN_ERR "EFI PAGETABLE BUG #00! %p <-> %p\n", + pud, pud_offset(pgd, 0)); + } + return pud_offset(pgd, vaddr); +} + +static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) +{ + if (pud_none(*pud)) { + pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_ATOMIC); + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); + if (pmd != pmd_offset(pud, 0)) + printk(KERN_ERR "EFI PAGETABLE BUG #01! %p <-> %p\n", + pmd, pmd_offset(pud, 0)); + } + return pmd_offset(pud, vaddr); +} + +static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) +{ + if (pmd_none(*pmd)) { + pte_t *pte = (pte_t *)get_zeroed_page(GFP_ATOMIC); + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))); + if (pte != pte_offset_kernel(pmd, 0)) + printk(KERN_ERR "EFI PAGETABLE BUG #02!\n"); + } + return pte_offset_kernel(pmd, vaddr); +} + +static int __init dell_efi_quirk(const struct dmi_system_id *d) +{ + u64 vaddr; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* + * Some UEFI run time implementations (DELL) require physical page + * zero to be mapped. This location is used during EfiResetSystem + * when ResetType is EfiResetWarm (reboot=warm). UEFI writes to + * a BIOS physical address of 0x472 for the reboot mode. The reason + * for this hasn't been revealed by the UEFI developers. + */ + printk(KERN_INFO + "%s series board detected. Applying quirk for" + " page 0 UEFI firmware access.\n", d->ident); + vaddr = 0UL; + pgd = efi_pgd + pgd_index(vaddr); + pud = fill_pud(pgd, vaddr); + pmd = fill_pmd(pud, vaddr); + pte = fill_pte(pmd, vaddr); + set_pte(pte, pfn_pte(0UL, PAGE_KERNEL)); + return 0; +} + +static struct dmi_system_id __initdata efi_quirk_table[] = { + { /* Dell systems in EFI mode need special handling for resets */ + .callback = dell_efi_quirk, + .ident = "Dell PowerEdge", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"), + }, + }, + { } +}; + +#define EFI_CALLBACK_PRI 3 /* lower than node and SLAB */ +#ifdef CONFIG_MEMORY_HOTPLUG +static int efi_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + switch (action) { + case MEM_GOING_ONLINE: /* fall through */ + case MEM_GOING_OFFLINE: + clone_pgd_range(efi_pgd + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); + break; + default: + break; + } + + return NOTIFY_OK; +} +#endif + +void __init efi_pagetable_init(void) +{ + efi_memory_desc_t *md; + unsigned long size; + u64 start_pfn, end_pfn, pfn, vaddr; + void *p; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + memset(efi_pgd, 0, sizeof(efi_pgd)); + + dmi_check_system(efi_quirk_table); + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + md = p; + if (!(md->attribute & EFI_MEMORY_RUNTIME)) + continue; + + start_pfn = md->phys_addr >> PAGE_SHIFT; + size = md->num_pages << EFI_PAGE_SHIFT; + end_pfn = PFN_UP(md->phys_addr + size); + + for (pfn = start_pfn; pfn <= end_pfn; pfn++) { + unsigned long val; + + vaddr = pfn << PAGE_SHIFT; + pgd = efi_pgd + pgd_index(vaddr); + pud = fill_pud(pgd, vaddr); + pmd = fill_pmd(pud, vaddr); + pte = fill_pte(pmd, vaddr); + if (md->type == EFI_RUNTIME_SERVICES_CODE) + val = __PAGE_KERNEL_EXEC; + else + val = __PAGE_KERNEL; + + if (!(md->attribute & EFI_MEMORY_WB)) + val |= _PAGE_CACHE_UC_MINUS; + set_pte(pte, pfn_pte(pfn, __pgprot(val))); + } + } + clone_pgd_range(efi_pgd + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); + hotplug_memory_notifier(efi_memory_callback, EFI_CALLBACK_PRI); +} diff -uprN linux-2.6.32/arch/x86/kernel/efi.c linux-2.6.32.ovz/arch/x86/kernel/efi.c --- linux-2.6.32/arch/x86/kernel/efi.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/efi.c 2015-03-10 13:24:14.000000000 -0700 @@ -50,13 +50,24 @@ int efi_enabled; EXPORT_SYMBOL(efi_enabled); -struct efi efi; +struct efi __read_mostly efi = { + .mps = EFI_INVALID_TABLE_ADDR, + .acpi = EFI_INVALID_TABLE_ADDR, + .acpi20 = EFI_INVALID_TABLE_ADDR, + .smbios = EFI_INVALID_TABLE_ADDR, + .sal_systab = EFI_INVALID_TABLE_ADDR, + .boot_info = EFI_INVALID_TABLE_ADDR, + .hcdp = EFI_INVALID_TABLE_ADDR, + .uga = EFI_INVALID_TABLE_ADDR, + .uv_systab = EFI_INVALID_TABLE_ADDR, +}; EXPORT_SYMBOL(efi); struct efi_memory_map memmap; static struct efi efi_phys __initdata; static efi_system_table_t efi_systab __initdata; +static efi_runtime_services_t phys_runtime; static int __init setup_noefi(char *arg) { @@ -121,7 +132,7 @@ static efi_status_t virt_efi_get_next_va static efi_status_t virt_efi_set_variable(efi_char16_t *name, efi_guid_t *vendor, - unsigned long attr, + u32 attr, unsigned long data_size, void *data) { @@ -130,6 +141,18 @@ static efi_status_t virt_efi_set_variabl data_size, data); } +static efi_status_t virt_efi_query_variable_info(u32 attr, + u64 *storage_space, + u64 *remaining_space, + u64 *max_variable_size) +{ + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) + return EFI_UNSUPPORTED; + + return efi_call_virt4(query_variable_info, attr, storage_space, + remaining_space, max_variable_size); +} + static efi_status_t virt_efi_get_next_high_mono_count(u32 *count) { return efi_call_virt1(get_next_high_mono_count, count); @@ -144,15 +167,26 @@ static void virt_efi_reset_system(int re data_size, data); } -static efi_status_t virt_efi_set_virtual_address_map( - unsigned long memory_map_size, - unsigned long descriptor_size, - u32 descriptor_version, - efi_memory_desc_t *virtual_map) +static efi_status_t virt_efi_update_capsule(efi_capsule_header_t **capsules, + unsigned long count, + unsigned long sg_list) +{ + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) + return EFI_UNSUPPORTED; + + return efi_call_virt3(update_capsule, capsules, count, sg_list); +} + +static efi_status_t virt_efi_query_capsule_caps(efi_capsule_header_t **capsules, + unsigned long count, + u64 *max_size, + int *reset_type) { - return efi_call_virt4(set_virtual_address_map, - memory_map_size, descriptor_size, - descriptor_version, virtual_map); + if (efi.runtime_version < EFI_2_00_SYSTEM_TABLE_REVISION) + return EFI_UNSUPPORTED; + + return efi_call_virt4(query_capsule_caps, capsules, count, max_size, + reset_type); } static efi_status_t __init phys_efi_set_virtual_address_map( @@ -171,7 +205,7 @@ static efi_status_t __init phys_efi_set_ return status; } -static efi_status_t __init phys_efi_get_time(efi_time_t *tm, +static efi_status_t __init phys_efi_get_time_early(efi_time_t *tm, efi_time_cap_t *tc) { efi_status_t status; @@ -182,6 +216,112 @@ static efi_status_t __init phys_efi_get_ return status; } +static efi_status_t phys_efi_get_time(efi_time_t *tm, + efi_time_cap_t *tc) +{ + efi_status_t status; + + efi_call_phys_prelog_in_physmode(); + status = efi_call_phys2((void*)phys_runtime.get_time, tm, tc); + efi_call_phys_epilog_in_physmode(); + return status; +} + +static efi_status_t __init phys_efi_set_time(efi_time_t *tm) +{ + efi_status_t status; + + efi_call_phys_prelog_in_physmode(); + status = efi_call_phys1((void*)phys_runtime.set_time, tm); + efi_call_phys_epilog_in_physmode(); + return status; +} + +static efi_status_t phys_efi_get_wakeup_time(efi_bool_t *enabled, + efi_bool_t *pending, + efi_time_t *tm) +{ + efi_status_t status; + + efi_call_phys_prelog_in_physmode(); + status = efi_call_phys3((void*)phys_runtime.get_wakeup_time, enabled, + pending, tm); + efi_call_phys_epilog_in_physmode(); + return status; +} + +static efi_status_t phys_efi_set_wakeup_time(efi_bool_t enabled, efi_time_t *tm) +{ + efi_status_t status; + efi_call_phys_prelog_in_physmode(); + status = efi_call_phys2((void*)phys_runtime.set_wakeup_time, enabled, + tm); + efi_call_phys_epilog_in_physmode(); + return status; +} + +static efi_status_t phys_efi_get_variable(efi_char16_t *name, + efi_guid_t *vendor, + u32 *attr, + unsigned long *data_size, + void *data) +{ + efi_status_t status; + efi_call_phys_prelog_in_physmode(); + status = efi_call_phys5((void*)phys_runtime.get_variable, name, vendor, + attr, data_size, data); + efi_call_phys_epilog_in_physmode(); + return status; +} + +static efi_status_t phys_efi_get_next_variable(unsigned long *name_size, + efi_char16_t *name, + efi_guid_t *vendor) +{ + efi_status_t status; + + efi_call_phys_prelog_in_physmode(); + status = efi_call_phys3((void*)phys_runtime.get_next_variable, + name_size, name, vendor); + efi_call_phys_epilog_in_physmode(); + return status; +} + +static efi_status_t phys_efi_set_variable(efi_char16_t *name, + efi_guid_t *vendor, + u32 attr, + unsigned long data_size, + void *data) +{ + efi_status_t status; + efi_call_phys_prelog_in_physmode(); + status = efi_call_phys5((void*)phys_runtime.set_variable, name, + vendor, attr, data_size, data); + efi_call_phys_epilog_in_physmode(); + return status; +} + +static efi_status_t phys_efi_get_next_high_mono_count(u32 *count) +{ + efi_status_t status; + efi_call_phys_prelog_in_physmode(); + status = efi_call_phys1((void*)phys_runtime.get_next_high_mono_count, + count); + efi_call_phys_epilog_in_physmode(); + return status; +} + +static void phys_efi_reset_system(int reset_type, + efi_status_t status, + unsigned long data_size, + efi_char16_t *data) +{ + efi_call_phys_prelog_in_physmode(); + efi_call_phys4((void*)phys_runtime.reset_system, reset_type, status, + data_size, data); + efi_call_phys_epilog_in_physmode(); +} + int efi_set_rtc_mmss(unsigned long nowtime) { int real_seconds, real_minutes; @@ -271,6 +411,7 @@ static void __init do_add_efi_memmap(voi break; } e820_add_region(start, size, e820_type); + e820_saved_add_region(start, size, e820_type); } sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); } @@ -314,6 +455,57 @@ static void __init print_efi_memmap(void } #endif /* EFI_DEBUG */ +void __init efi_reserve_boot_services(void) +{ + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + efi_memory_desc_t *md = p; + u64 start = md->phys_addr; + u64 size = md->num_pages << EFI_PAGE_SHIFT; + + if (md->type != EFI_BOOT_SERVICES_CODE && + md->type != EFI_BOOT_SERVICES_DATA) + continue; + + /* Only reserve where possible: + * - Not within any already allocated areas + * - Not over any memory area (really needed, if above?) + * - Not within any part of the kernel + * - Not the bios reserved area + */ + + if ((start+size >= virt_to_phys(_text) && + start <= virt_to_phys(_end)) || + !e820_all_mapped(start, start+size, E820_RAM)) { + /* Could not reserve, skip it */ + md->num_pages = 0; + } else { + if (reserve_bootmem_generic(start, size, + BOOTMEM_EXCLUSIVE)) { + md->num_pages = 0; + } + } + } +} + +static void __init efi_free_boot_services(void) +{ + void *p; + + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + efi_memory_desc_t *md = p; + unsigned long long start = md->phys_addr; + unsigned long long size = md->num_pages << EFI_PAGE_SHIFT; + + if (md->type != EFI_BOOT_SERVICES_CODE && + md->type != EFI_BOOT_SERVICES_DATA) + continue; + + free_bootmem_late(start, size); + } +} + void __init efi_init(void) { efi_config_table_t *config_tables; @@ -362,7 +554,7 @@ void __init efi_init(void) printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); early_iounmap(tmp, 2); - printk(KERN_INFO "EFI v%u.%.02u by %s \n", + printk(KERN_INFO "EFI v%u.%.02u by %s\n", efi.systab->hdr.revision >> 16, efi.systab->hdr.revision & 0xffff, vendor); @@ -434,7 +626,9 @@ void __init efi_init(void) * Make efi_get_time can be called before entering * virtual mode. */ - efi.get_time = phys_efi_get_time; + efi.get_time = phys_efi_get_time_early; + + memcpy(&phys_runtime, runtime, sizeof(efi_runtime_services_t)); } else printk(KERN_ERR "Could not map the EFI runtime service " "table!\n"); @@ -447,10 +641,6 @@ void __init efi_init(void) printk(KERN_ERR "Could not map the EFI memory map!\n"); memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); - if (memmap.desc_size != sizeof(efi_memory_desc_t)) - printk(KERN_WARNING - "Kernel-defined memdesc doesn't match the one from EFI!\n"); - if (add_efi_memmap) do_add_efi_memmap(); @@ -459,19 +649,41 @@ void __init efi_init(void) x86_platform.set_wallclock = efi_set_rtc_mmss; #endif - /* Setup for EFI runtime service */ - reboot_type = BOOT_EFI; - #if EFI_DEBUG print_efi_memmap(); #endif + +#ifndef CONFIG_X86_64 + /* + * Only x86_64 supports physical mode as of now. Use virtual mode + * forcibly. + */ + usevirtefi = 1; +#endif +} + +void __init efi_set_executable(efi_memory_desc_t *md, bool executable) +{ + u64 addr, npages; + + if (md->phys_addr == 0) + return; + + addr = md->virt_addr; + npages = md->num_pages; + + memrange_efi_to_native(&addr, &npages); + + if (executable) + set_memory_x(addr, npages); + else + set_memory_nx(addr, npages); } static void __init runtime_code_page_mkexec(void) { efi_memory_desc_t *md; void *p; - u64 addr, npages; /* Make EFI runtime service code area executable */ for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { @@ -480,10 +692,7 @@ static void __init runtime_code_page_mke if (md->type != EFI_RUNTIME_SERVICES_CODE) continue; - addr = md->virt_addr; - npages = md->num_pages; - memrange_efi_to_native(&addr, &npages); - set_memory_x(addr, npages); + efi_set_executable(md, true); } } @@ -497,16 +706,46 @@ static void __init runtime_code_page_mke */ void __init efi_enter_virtual_mode(void) { - efi_memory_desc_t *md; + efi_memory_desc_t *md, *prev_md = NULL; efi_status_t status; unsigned long size; u64 end, systab, addr, npages, end_pfn; void *p, *va; efi.systab = NULL; + + /* Merge contiguous regions of the same type and attribute */ + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { + u64 prev_size; + md = p; + + if (!prev_md) { + prev_md = md; + continue; + } + + if (prev_md->type != md->type || + prev_md->attribute != md->attribute) { + prev_md = md; + continue; + } + + prev_size = prev_md->num_pages << EFI_PAGE_SHIFT; + + if (md->phys_addr == (prev_md->phys_addr + prev_size)) { + prev_md->num_pages += md->num_pages; + md->type = EFI_RESERVED_TYPE; + md->attribute = 0; + continue; + } + prev_md = md; + } + for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { md = p; - if (!(md->attribute & EFI_MEMORY_RUNTIME)) + if (!(md->attribute & EFI_MEMORY_RUNTIME) && + md->type != EFI_BOOT_SERVICES_CODE && + md->type != EFI_BOOT_SERVICES_DATA) continue; size = md->num_pages << EFI_PAGE_SHIFT; @@ -528,7 +767,8 @@ void __init efi_enter_virtual_mode(void) continue; } - if (!(md->attribute & EFI_MEMORY_WB)) { + if ((md->attribute & EFI_MEMORY_UC) && + !(md->attribute & EFI_MEMORY_WB)) { addr = md->virt_addr; npages = md->num_pages; memrange_efi_to_native(&addr, &npages); @@ -557,11 +797,19 @@ void __init efi_enter_virtual_mode(void) } /* + * Thankfully, it does seem that no runtime services other than + * SetVirtualAddressMap() will touch boot services code, so we can + * get rid of it all at this point + */ + efi_free_boot_services(); + + /* * Now that EFI is in virtual mode, update the function * pointers in the runtime service table to the new virtual addresses. * * Call EFI services through wrapper functions. */ + efi.runtime_version = efi_systab.hdr.revision; efi.get_time = virt_efi_get_time; efi.set_time = virt_efi_set_time; efi.get_wakeup_time = virt_efi_get_wakeup_time; @@ -571,13 +819,37 @@ void __init efi_enter_virtual_mode(void) efi.set_variable = virt_efi_set_variable; efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; efi.reset_system = virt_efi_reset_system; - efi.set_virtual_address_map = virt_efi_set_virtual_address_map; + efi.set_virtual_address_map = NULL; + efi.query_variable_info = virt_efi_query_variable_info; + efi.update_capsule = virt_efi_update_capsule; + efi.query_capsule_caps = virt_efi_query_capsule_caps; if (__supported_pte_mask & _PAGE_NX) runtime_code_page_mkexec(); early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); memmap.map = NULL; } +void __init efi_setup_physical_mode(void) +{ +#ifdef CONFIG_X86_64 + efi_pagetable_init(); +#endif + efi.get_time = phys_efi_get_time; + efi.set_time = phys_efi_set_time; + efi.get_wakeup_time = phys_efi_get_wakeup_time; + efi.set_wakeup_time = phys_efi_set_wakeup_time; + efi.get_variable = phys_efi_get_variable; + efi.get_next_variable = phys_efi_get_next_variable; + efi.set_variable = phys_efi_set_variable; + efi.get_next_high_mono_count = + phys_efi_get_next_high_mono_count; + efi.reset_system = phys_efi_reset_system; + efi.set_virtual_address_map = NULL; /* Not needed */ + + early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); + memmap.map = NULL; +} + /* * Convenience functions to obtain memory types and attributes */ diff -uprN linux-2.6.32/arch/x86/kernel/efi_early_printk.c linux-2.6.32.ovz/arch/x86/kernel/efi_early_printk.c --- linux-2.6.32/arch/x86/kernel/efi_early_printk.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/efi_early_printk.c 2015-03-10 13:24:10.000000000 -0700 @@ -0,0 +1,236 @@ +/* + * Copyright (C) 2013 Intel Corporation; author Matt Fleming + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2. + */ + +#include +#include +#include +#include +#include +#include + +static const struct font_desc *font; +static u32 efi_x, efi_y; +static void *efi_fb; +static bool early_efi_keep; + +/* + * efi earlyprintk need use early_ioremap to map the framebuffer. + * But early_ioremap is not usable for earlyprintk=efi,keep, ioremap should + * be used instead. ioremap will be available after paging_init() which is + * earlier than initcall callbacks. Thus adding this early initcall function + * early_efi_map_fb to map the whole efi framebuffer. + */ +static __init int early_efi_map_fb(void) +{ + unsigned long base, size; + + if (!early_efi_keep) + return 0; + + base = boot_params.screen_info.lfb_base; + size = boot_params.screen_info.lfb_size; + efi_fb = ioremap(base, size); + + return efi_fb ? 0 : -ENOMEM; +} +early_initcall(early_efi_map_fb); + +/* + * early_efi_map maps efi framebuffer region [start, start + len -1] + * In case earlyprintk=efi,keep we have the whole framebuffer mapped already + * so just return the offset efi_fb + start. + */ +static __init_refok void *early_efi_map(unsigned long start, unsigned long len) +{ + unsigned long base; + + base = boot_params.screen_info.lfb_base; + + if (efi_fb) + return (efi_fb + start); + else + return early_ioremap(base + start, len); +} + +static __init_refok void early_efi_unmap(void *addr, unsigned long len) +{ + if (!efi_fb) + early_iounmap(addr, len); +} + +static void early_efi_clear_scanline(unsigned int y) +{ + unsigned long *dst; + u16 len; + + len = boot_params.screen_info.lfb_linelength; + dst = early_efi_map(y*len, len); + if (!dst) + return; + + memset(dst, 0, len); + early_efi_unmap(dst, len); +} + +static void early_efi_scroll_up(void) +{ + unsigned long *dst, *src; + u16 len; + u32 i, height; + + len = boot_params.screen_info.lfb_linelength; + height = boot_params.screen_info.lfb_height; + + for (i = 0; i < height - font->height; i++) { + dst = early_efi_map(i*len, len); + if (!dst) + return; + + src = early_efi_map((i + font->height) * len, len); + if (!src) { + early_efi_unmap(dst, len); + return; + } + + memmove(dst, src, len); + + early_efi_unmap(src, len); + early_efi_unmap(dst, len); + } +} + +static void early_efi_write_char(u32 *dst, unsigned char c, unsigned int h) +{ + const u32 color_black = 0x00000000; + const u32 color_white = 0x00ffffff; + const u8 *src; + u8 s8; + int m; + + src = font->data + c * font->height; + s8 = *(src + h); + + for (m = 0; m < 8; m++) { + if ((s8 >> (7 - m)) & 1) + *dst = color_white; + else + *dst = color_black; + dst++; + } +} + +static void +early_efi_write(struct console *con, const char *str, unsigned int num) +{ + struct screen_info *si; + unsigned int len; + const char *s; + void *dst; + + si = &boot_params.screen_info; + len = si->lfb_linelength; + + while (num) { + unsigned int linemax; + unsigned int h, count = 0; + + for (s = str; *s && *s != '\n'; s++) { + if (count == num) + break; + count++; + } + + linemax = (si->lfb_width - efi_x) / font->width; + if (count > linemax) + count = linemax; + + for (h = 0; h < font->height; h++) { + unsigned int n, x; + + dst = early_efi_map((efi_y + h) * len, len); + if (!dst) + return; + + s = str; + n = count; + x = efi_x; + + while (n-- > 0) { + early_efi_write_char(dst + x*4, *s, h); + x += font->width; + s++; + } + + early_efi_unmap(dst, len); + } + + num -= count; + efi_x += count * font->width; + str += count; + + if (num > 0 && *s == '\n') { + efi_x = 0; + efi_y += font->height; + str++; + num--; + } + + if (efi_x >= si->lfb_width) { + efi_x = 0; + efi_y += font->height; + } + + if (efi_y + font->height > si->lfb_height) { + u32 i; + + efi_y -= font->height; + early_efi_scroll_up(); + + for (i = 0; i < font->height; i++) + early_efi_clear_scanline(efi_y + i); + } + } +} + +static __init int early_efi_setup(struct console *con, char *options) +{ + struct screen_info *si; + u16 xres, yres; + u32 i; + + si = &boot_params.screen_info; + xres = si->lfb_width; + yres = si->lfb_height; + + /* + * early_efi_write_char() implicitly assumes a framebuffer with + * 32-bits per pixel. + */ + if (si->lfb_depth != 32) + return -ENODEV; + + font = get_default_font(xres, yres, -1, -1); + if (!font) + return -ENODEV; + + efi_y = rounddown(yres, font->height) - font->height; + for (i = 0; i < (yres - efi_y) / font->height; i++) + early_efi_scroll_up(); + + /* early_console_register will unset CON_BOOT in case ,keep */ + if (!(con->flags & CON_BOOT)) + early_efi_keep = true; + return 0; +} + +struct console early_efi_console = { + .name = "earlyefi", + .write = early_efi_write, + .setup = early_efi_setup, + .flags = CON_PRINTBUFFER, + .index = -1, +}; diff -uprN linux-2.6.32/arch/x86/kernel/entry_32.S linux-2.6.32.ovz/arch/x86/kernel/entry_32.S --- linux-2.6.32/arch/x86/kernel/entry_32.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/entry_32.S 2015-06-23 04:16:16.000000000 -0700 @@ -53,6 +53,8 @@ #include #include #include +#include +#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -325,6 +327,7 @@ ENTRY(ret_from_fork) GET_THREAD_INFO(%ebp) popl %eax CFI_ADJUST_CFA_OFFSET -4 +ret_from_fork_tail: pushl $0x0202 # Reset kernel eflags CFI_ADJUST_CFA_OFFSET 4 popfl @@ -333,6 +336,29 @@ ENTRY(ret_from_fork) CFI_ENDPROC END(ret_from_fork) +ENTRY(i386_ret_from_resume) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + call schedule_tail + GET_THREAD_INFO(%ebp) + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + movl (%esp),%eax + testl %eax,%eax + jz 1f + pushl %esp + call *%eax + addl $4,%esp +1: + addl $256,%esp + jmp ret_from_fork_tail + CFI_ENDPROC + +/* + * Interrupt exit functions should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" /* * Return to user mode is not as complex as all this looks, * but we want the default path for a system call return to @@ -383,6 +409,10 @@ need_resched: END(resume_kernel) #endif CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection /* SYSENTER_RETURN points to after the "sysenter" instruction in the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ @@ -488,11 +518,10 @@ sysexit_audit: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) movl %eax,%edx /* second arg, syscall return value */ - cmpl $0,%eax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%eax /* zero-extend that */ - inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx @@ -513,6 +542,10 @@ sysexit_audit: PTGS_TO_GS_EX ENDPROC(ia32_sysenter_target) +/* + * syscall stub including irq exit should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" # system call handler stub ENTRY(system_call) RING0_INT_FRAME # can't unwind into user space anyway @@ -705,6 +738,10 @@ syscall_badsys: jmp resume_userspace END(syscall_badsys) CFI_ENDPROC +/* + * End of kprobes section + */ + .popsection /* * System calls that need a pt_regs pointer. @@ -814,6 +851,10 @@ common_interrupt: ENDPROC(common_interrupt) CFI_ENDPROC +/* + * Irq entries should be protected against kprobes + */ + .pushsection .kprobes.text, "ax" #define BUILD_INTERRUPT3(name, nr, fn) \ ENTRY(name) \ RING0_INT_FRAME; \ @@ -827,7 +868,17 @@ ENTRY(name) \ CFI_ENDPROC; \ ENDPROC(name) -#define BUILD_INTERRUPT(name, nr) BUILD_INTERRUPT3(name, nr, smp_##name) + +#ifdef CONFIG_TRACING +#define TRACE_BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(trace_##name, nr, smp_trace_##name) +#else +#define TRACE_BUILD_INTERRUPT(name, nr) +#endif + +#define BUILD_INTERRUPT(name, nr) \ + BUILD_INTERRUPT3(name, nr, smp_##name); \ + TRACE_BUILD_INTERRUPT(name, nr) /* The include is where all of the SMP etc. interrupts come from */ #include @@ -846,7 +897,25 @@ ENTRY(simd_coprocessor_error) RING0_INT_FRAME pushl $0 CFI_ADJUST_CFA_OFFSET 4 +#ifdef CONFIG_X86_INVD_BUG + /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ +661: pushl $do_general_protection +662: +.section .altinstructions,"a" + .balign 4 + .long 661b + .long 663f + .word X86_FEATURE_XMM + .byte 662b-661b + .byte 664f-663f +.previous +.section .altinstr_replacement,"ax" +663: pushl $do_simd_coprocessor_error +664: +.previous +#else pushl $do_simd_coprocessor_error +#endif CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC @@ -980,6 +1049,10 @@ ENTRY(spurious_interrupt_bug) jmp error_code CFI_ENDPROC END(spurious_interrupt_bug) +/* + * End of kprobes section + */ + .popsection ENTRY(kernel_thread_helper) pushl $0 # fake return address for unwinder @@ -1057,7 +1130,6 @@ ENTRY(xen_failsafe_callback) lea 16(%esp),%esp CFI_ADJUST_CFA_OFFSET -16 jz 5f - addl $16,%esp jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) 5: pushl $0 # EAX == 0 => Category 1 (Bad segment) CFI_ADJUST_CFA_OFFSET 4 @@ -1088,8 +1160,18 @@ ENTRY(xen_failsafe_callback) .previous ENDPROC(xen_failsafe_callback) +BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + xen_evtchn_do_upcall) + #endif /* CONFIG_XEN */ +#if defined(CONFIG_HYPERV) || defined(CONFIG_HYPERV_MODULE) + +BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, + hyperv_vector_handler) + +#endif /* CONFIG_HYPERV */ + #ifdef CONFIG_FUNCTION_TRACER #ifdef CONFIG_DYNAMIC_FTRACE diff -uprN linux-2.6.32/arch/x86/kernel/entry_64.S linux-2.6.32.ovz/arch/x86/kernel/entry_64.S --- linux-2.6.32/arch/x86/kernel/entry_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/entry_64.S 2015-06-23 04:16:16.000000000 -0700 @@ -53,6 +53,7 @@ #include #include #include +#include /* Avoid __ASSEMBLER__'ifying just for this. */ #include @@ -405,8 +406,12 @@ ENTRY(ret_from_fork) call schedule_tail # rdi: 'prev' task parameter +ret_from_fork_tail: GET_THREAD_INFO(%rcx) + LOCK ; btr $TIF_RESUME,TI_flags(%rcx) + jc x86_64_ret_from_resume +ret_from_fork_check: RESTORE_REST testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? @@ -418,6 +423,18 @@ ENTRY(ret_from_fork) RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET jmp ret_from_sys_call # go to the SYSRET fastpath +x86_64_ret_from_resume: + movq (%rsp),%rax + testq %rax,%rax + jz 1f + movq %rsp,%rdi + call *%rax +1: + addq $256,%rsp + cmpq $0,ORIG_RAX(%rsp) + jge ret_from_fork_tail + RESTORE_REST + jmp int_ret_from_sys_call CFI_ENDPROC END(ret_from_fork) @@ -566,17 +583,16 @@ auditsys: jmp system_call_fastpath /* - * Return fast path for syscall audit. Call audit_syscall_exit() + * Return fast path for syscall audit. Call __audit_syscall_exit() * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT * masked off. */ sysret_audit: - movq %rax,%rsi /* second arg, syscall return value */ - cmpq $0,%rax /* is it < 0? */ - setl %al /* 1 if so, 0 if not */ + movq RAX-ARGOFFSET(%rsp), %rsi /* second arg, syscall return value */ + cmpq $-MAX_ERRNO,%rsi /* is it < -MAX_ERRNO? */ + setbe %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ - inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ - call audit_syscall_exit + call __audit_syscall_exit movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi jmp sysret_check #endif /* CONFIG_AUDITSYSCALL */ @@ -945,7 +961,7 @@ END(common_interrupt) /* * APIC interrupts. */ -.macro apicinterrupt num sym do_sym +.macro apicinterrupt3 num sym do_sym ENTRY(\sym) INTR_FRAME pushq $~(\num) @@ -956,48 +972,70 @@ ENTRY(\sym) END(\sym) .endm +#ifdef CONFIG_TRACING +#define trace(sym) trace_##sym +#define smp_trace(sym) smp_trace_##sym + +.macro trace_apicinterrupt num sym +apicinterrupt3 \num trace(\sym) smp_trace(\sym) +.endm +#else +.macro trace_apicinterrupt num sym do_sym +.endm +#endif + +.macro apicinterrupt num sym do_sym +apicinterrupt3 \num \sym \do_sym +trace_apicinterrupt \num \sym +.endm + #ifdef CONFIG_SMP -apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \ +apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR \ irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt -apicinterrupt REBOOT_VECTOR \ +apicinterrupt3 REBOOT_VECTOR \ reboot_interrupt smp_reboot_interrupt #endif #ifdef CONFIG_X86_UV -apicinterrupt UV_BAU_MESSAGE \ +apicinterrupt3 UV_BAU_MESSAGE \ uv_bau_message_intr1 uv_bau_message_interrupt #endif apicinterrupt LOCAL_TIMER_VECTOR \ apic_timer_interrupt smp_apic_timer_interrupt -apicinterrupt GENERIC_INTERRUPT_VECTOR \ - generic_interrupt smp_generic_interrupt +apicinterrupt X86_PLATFORM_IPI_VECTOR \ + x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_SMP -apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ +apicinterrupt3 INVALIDATE_TLB_VECTOR_START+0 \ invalidate_interrupt0 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \ +apicinterrupt3 INVALIDATE_TLB_VECTOR_START+1 \ invalidate_interrupt1 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \ +apicinterrupt3 INVALIDATE_TLB_VECTOR_START+2 \ invalidate_interrupt2 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \ +apicinterrupt3 INVALIDATE_TLB_VECTOR_START+3 \ invalidate_interrupt3 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \ +apicinterrupt3 INVALIDATE_TLB_VECTOR_START+4 \ invalidate_interrupt4 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \ +apicinterrupt3 INVALIDATE_TLB_VECTOR_START+5 \ invalidate_interrupt5 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \ +apicinterrupt3 INVALIDATE_TLB_VECTOR_START+6 \ invalidate_interrupt6 smp_invalidate_interrupt -apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \ +apicinterrupt3 INVALIDATE_TLB_VECTOR_START+7 \ invalidate_interrupt7 smp_invalidate_interrupt #endif +#ifdef CONFIG_X86_MCE_THRESHOLD apicinterrupt THRESHOLD_APIC_VECTOR \ threshold_interrupt smp_threshold_interrupt +#endif + +#ifdef CONFIG_X86_THERMAL_VECTOR apicinterrupt THERMAL_APIC_VECTOR \ thermal_interrupt smp_thermal_interrupt +#endif #ifdef CONFIG_X86_MCE -apicinterrupt MCE_SELF_VECTOR \ +apicinterrupt3 MCE_SELF_VECTOR \ mce_self_interrupt smp_mce_self_interrupt #endif @@ -1015,11 +1053,17 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt -#ifdef CONFIG_PERF_EVENTS -apicinterrupt LOCAL_PENDING_VECTOR \ - perf_pending_interrupt smp_perf_pending_interrupt +#ifdef CONFIG_IRQ_WORK +apicinterrupt IRQ_WORK_VECTOR \ + irq_work_interrupt smp_irq_work_interrupt #endif +apicinterrupt MONITOR_IPI_VECTOR \ + monitor_ipi smp_monitor_ipi + +apicinterrupt MONITOR_POSTED_INTERRUPT_VECTOR \ + monitor_posted_interrupt smp_monitor_posted_interrupt + /* * Exception entry points. */ @@ -1182,7 +1226,7 @@ ENTRY(kernel_thread) xorl %r9d,%r9d # clone now - call do_fork + call do_fork_kthread movq %rax,RAX(%rsp) xorl %edi,%edi @@ -1303,7 +1347,7 @@ ENTRY(xen_do_hypervisor_callback) # do decl PER_CPU_VAR(irq_count) jmp error_exit CFI_ENDPROC -END(do_hypervisor_callback) +END(xen_do_hypervisor_callback) /* * Hypervisor uses this for application faults while it executes. @@ -1364,8 +1408,16 @@ ENTRY(xen_failsafe_callback) CFI_ENDPROC END(xen_failsafe_callback) +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ + xen_hvm_callback_vector xen_evtchn_do_upcall + #endif /* CONFIG_XEN */ +#if defined(CONFIG_HYPERV) || defined(CONFIG_HYPERV_MODULE) +apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ + hyperv_callback_vector hyperv_vector_handler +#endif /* CONFIG_HYPERV */ + /* * Some functions should be protected against kprobes */ @@ -1373,7 +1425,7 @@ END(xen_failsafe_callback) paranoidzeroentry_ist debug do_debug DEBUG_STACK paranoidzeroentry_ist int3 do_int3 DEBUG_STACK -paranoiderrorentry stack_segment do_stack_segment +errorentry stack_segment do_stack_segment #ifdef CONFIG_XEN zeroentry xen_debug do_debug zeroentry xen_int3 do_int3 diff -uprN linux-2.6.32/arch/x86/kernel/ftrace.c linux-2.6.32.ovz/arch/x86/kernel/ftrace.c --- linux-2.6.32/arch/x86/kernel/ftrace.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/ftrace.c 2015-03-10 13:24:02.000000000 -0700 @@ -25,6 +25,15 @@ #include #include +#if defined(CONFIG_FTRACE_SYSCALLS) && defined(CONFIG_IA32_EMULATION) +#include +bool arch_trace_is_compat_syscall(struct pt_regs *regs) +{ + if (is_compat_task()) + return true; + return false; +} +#endif /* CONFIG_FTRACE_SYSCALLS && CONFIG_IA32_EMULATION */ #ifdef CONFIG_DYNAMIC_FTRACE @@ -187,9 +196,26 @@ static void wait_for_nmi(void) nmi_wait_count++; } +static inline int +within(unsigned long addr, unsigned long start, unsigned long end) +{ + return addr >= start && addr < end; +} + static int do_ftrace_mod_code(unsigned long ip, void *new_code) { + /* + * On x86_64, kernel text mappings are mapped read-only with + * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead + * of the kernel text mapping to modify the kernel text. + * + * For 32bit kernels, these mappings are same and we can use + * kernel identity mapping to modify code. + */ + if (within(ip, (unsigned long)_text, (unsigned long)_etext)) + ip = (unsigned long)__va(__pa(ip)); + mod_code_ip = (void *)ip; mod_code_newcode = new_code; @@ -468,26 +494,26 @@ void prepare_ftrace_return(unsigned long #ifdef CONFIG_FTRACE_SYSCALLS -extern unsigned long __start_syscalls_metadata[]; -extern unsigned long __stop_syscalls_metadata[]; +extern struct syscall_metadata *__start_syscalls_metadata[]; +extern struct syscall_metadata *__stop_syscalls_metadata[]; extern unsigned long *sys_call_table; static struct syscall_metadata **syscalls_metadata; static struct syscall_metadata *find_syscall_meta(unsigned long *syscall) { - struct syscall_metadata *start; - struct syscall_metadata *stop; + struct syscall_metadata **start; + struct syscall_metadata **stop; char str[KSYM_SYMBOL_LEN]; - start = (struct syscall_metadata *)__start_syscalls_metadata; - stop = (struct syscall_metadata *)__stop_syscalls_metadata; + start = __start_syscalls_metadata; + stop = __stop_syscalls_metadata; kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str); for ( ; start < stop; start++) { - if (start->name && !strcmp(start->name, str)) - return start; + if ((*start)->name && !strcmp((*start)->name, str)) + return *start; } return NULL; } @@ -500,7 +526,7 @@ struct syscall_metadata *syscall_nr_to_m return syscalls_metadata[nr]; } -int syscall_name_to_nr(char *name) +int syscall_name_to_nr(const char *name) { int i; diff -uprN linux-2.6.32/arch/x86/kernel/head32.c linux-2.6.32.ovz/arch/x86/kernel/head32.c --- linux-2.6.32/arch/x86/kernel/head32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/head32.c 2015-03-10 13:22:14.000000000 -0700 @@ -20,7 +20,6 @@ static void __init i386_default_early_setup(void) { /* Initilize 32bit specific setup functions */ - x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; diff -uprN linux-2.6.32/arch/x86/kernel/head_32.S linux-2.6.32.ovz/arch/x86/kernel/head_32.S --- linux-2.6.32/arch/x86/kernel/head_32.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/head_32.S 2015-03-10 13:22:14.000000000 -0700 @@ -324,7 +324,7 @@ ENTRY(startup_32_smp) /* * Enable paging */ - movl $pa(swapper_pg_dir),%eax + movl pa(initial_page_table), %eax movl %eax,%cr3 /* set the page table pointer.. */ movl %cr0,%eax orl $X86_CR0_PG,%eax @@ -604,6 +604,8 @@ ignore_int: .align 4 ENTRY(initial_code) .long i386_start_kernel +ENTRY(initial_page_table) + .long pa(swapper_pg_dir) /* * BSS section @@ -619,6 +621,10 @@ ENTRY(swapper_pg_dir) #endif swapper_pg_fixmap: .fill 1024,4,0 +#ifdef CONFIG_X86_TRAMPOLINE +ENTRY(trampoline_pg_dir) + .fill 1024,4,0 +#endif ENTRY(empty_zero_page) .fill 4096,1,0 diff -uprN linux-2.6.32/arch/x86/kernel/head_64.S linux-2.6.32.ovz/arch/x86/kernel/head_64.S --- linux-2.6.32/arch/x86/kernel/head_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/head_64.S 2015-03-10 13:24:02.000000000 -0700 @@ -262,11 +262,11 @@ ENTRY(secondary_startup_64) .quad x86_64_start_kernel ENTRY(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) - __FINITDATA ENTRY(stack_start) .quad init_thread_union+THREAD_SIZE-8 .word 0 + __FINITDATA bad_address: jmp bad_address @@ -340,6 +340,7 @@ ENTRY(name) i = i + 1 ; \ .endr + .data /* * This default setting generates an ident mapping at address 0x100000 * and a mapping for the kernel that precisely maps virtual address @@ -418,6 +419,12 @@ ENTRY(phys_base) ENTRY(idt_table) .skip IDT_ENTRIES * 16 +#ifdef CONFIG_TRACING + .align L1_CACHE_BYTES +ENTRY(trace_idt_table) + .skip IDT_ENTRIES * 16 +#endif + __PAGE_ALIGNED_BSS .align PAGE_SIZE ENTRY(empty_zero_page) diff -uprN linux-2.6.32/arch/x86/kernel/hpet.c linux-2.6.32.ovz/arch/x86/kernel/hpet.c --- linux-2.6.32/arch/x86/kernel/hpet.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/hpet.c 2015-03-10 13:24:13.000000000 -0700 @@ -27,12 +27,18 @@ #define HPET_DEV_FSB_CAP 0x1000 #define HPET_DEV_PERI_CAP 0x2000 +#define HPET_MIN_CYCLES 128 +#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) + #define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) /* * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; +u8 hpet_blockid; /* OS timer block num */ +u8 hpet_msi_disable; + #ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; #endif @@ -296,9 +302,9 @@ static void hpet_legacy_clockevent_regis /* Calculate the min / max delta */ hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &hpet_clockevent); - /* 5 usec minimum reprogramming delta. */ - hpet_clockevent.min_delta_ns = 5000; - + /* Setup minimum reprogramming delta. */ + hpet_clockevent.min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA, + &hpet_clockevent); /* * Start hpet with the boot cpu mask and make it * global after the IO_APIC has been initialized. @@ -377,19 +383,37 @@ static int hpet_next_event(unsigned long struct clock_event_device *evt, int timer) { u32 cnt; + s32 res; cnt = hpet_readl(HPET_COUNTER); cnt += (u32) delta; hpet_writel(cnt, HPET_Tn_CMP(timer)); /* - * We need to read back the CMP register to make sure that - * what we wrote hit the chip before we compare it to the - * counter. + * HPETs are a complete disaster. The compare register is + * based on a equal comparison and neither provides a less + * than or equal functionality (which would require to take + * the wraparound into account) nor a simple count down event + * mode. Further the write to the comparator register is + * delayed internally up to two HPET clock cycles in certain + * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even + * longer delays. We worked around that by reading back the + * compare register, but that required another workaround for + * ICH9,10 chips where the first readout after write can + * return the old stale value. We already had a minimum + * programming delta of 5us enforced, but a NMI or SMI hitting + * between the counter readout and the comparator write can + * move us behind that point easily. Now instead of reading + * the compare register back several times, we make the ETIME + * decision based on the following: Return ETIME if the + * counter value after the write is less than HPET_MIN_CYCLES + * away from the event or if the counter is already ahead of + * the event. The minimum programming delta for the generic + * clockevents code is set to 1.5 * HPET_MIN_CYCLES. */ - WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt); + res = (s32)(cnt - (u32)hpet_readl(HPET_COUNTER)); - return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; + return res < HPET_MIN_CYCLES ? -ETIME : 0; } static void hpet_legacy_set_mode(enum clock_event_mode mode, @@ -467,7 +491,7 @@ static int hpet_msi_next_event(unsigned static int hpet_setup_msi_irq(unsigned int irq) { - if (arch_setup_hpet_msi(irq)) { + if (arch_setup_hpet_msi(irq, hpet_blockid)) { destroy_irq(irq); return -EINVAL; } @@ -478,7 +502,7 @@ static int hpet_assign_irq(struct hpet_d { unsigned int irq; - irq = create_irq(); + irq = create_irq_nr(0, -1); if (!irq) return -EINVAL; @@ -563,8 +587,8 @@ static void init_one_hpet_msi_clockevent NSEC_PER_SEC, evt->shift); /* Calculate the max delta */ evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt); - /* 5 usec minimum reprogramming delta. */ - evt->min_delta_ns = 5000; + /* Setup minimum reprogramming delta. */ + evt->min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA, evt); evt->cpumask = cpumask_of(hdev->cpu); clockevents_register_device(evt); @@ -584,6 +608,9 @@ static void hpet_msi_capability_lookup(u unsigned int num_timers_used = 0; int i; + if (hpet_msi_disable) + return; + id = hpet_readl(HPET_ID); num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); @@ -872,10 +899,8 @@ int __init hpet_enable(void) if (id & HPET_ID_LEGSUP) { hpet_legacy_clockevent_register(); - hpet_msi_capability_lookup(2); return 1; } - hpet_msi_capability_lookup(0); return 0; out_nohpet: @@ -908,9 +933,17 @@ static __init int hpet_late_init(void) if (!hpet_virt_address) return -ENODEV; + if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP) + hpet_msi_capability_lookup(2); + else + hpet_msi_capability_lookup(0); + hpet_reserve_platform_timers(hpet_readl(HPET_ID)); hpet_print_config(); + if (hpet_msi_disable) + return 0; + for_each_online_cpu(cpu) { hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); } @@ -924,16 +957,18 @@ fs_initcall(hpet_late_init); void hpet_disable(void) { - if (is_hpet_capable()) { - unsigned long cfg = hpet_readl(HPET_CFG); + unsigned long cfg; - if (hpet_legacy_int_enabled) { - cfg &= ~HPET_CFG_LEGACY; - hpet_legacy_int_enabled = 0; - } - cfg &= ~HPET_CFG_ENABLE; - hpet_writel(cfg, HPET_CFG); + if (!is_hpet_capable() || !hpet_address || !hpet_virt_address) + return; + + cfg = hpet_readl(HPET_CFG); + if (hpet_legacy_int_enabled) { + cfg &= ~HPET_CFG_LEGACY; + hpet_legacy_int_enabled = 0; } + cfg &= ~HPET_CFG_ENABLE; + hpet_writel(cfg, HPET_CFG); } #ifdef CONFIG_HPET_EMULATE_RTC @@ -1052,6 +1087,14 @@ int hpet_rtc_timer_init(void) } EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); +static void hpet_disable_rtc_channel(void) +{ + unsigned long cfg; + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T1_CFG); +} + /* * The functions below are called from rtc driver. * Return 0 if HPET is not being used. @@ -1063,6 +1106,9 @@ int hpet_mask_rtc_irq_bit(unsigned long return 0; hpet_rtc_flags &= ~bit_mask; + if (unlikely(!hpet_rtc_flags)) + hpet_disable_rtc_channel(); + return 1; } EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit); @@ -1127,15 +1173,11 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq); static void hpet_rtc_timer_reinit(void) { - unsigned long cfg, delta; + unsigned long delta; int lost_ints = -1; - if (unlikely(!hpet_rtc_flags)) { - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_T1_CFG); - return; - } + if (unlikely(!hpet_rtc_flags)) + hpet_disable_rtc_channel(); if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) delta = hpet_default_delta; diff -uprN linux-2.6.32/arch/x86/kernel/i387.c linux-2.6.32.ovz/arch/x86/kernel/i387.c --- linux-2.6.32/arch/x86/kernel/i387.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/i387.c 2015-06-23 04:16:16.000000000 -0700 @@ -39,6 +39,7 @@ static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; unsigned int xstate_size; +EXPORT_SYMBOL_GPL(xstate_size); unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); static struct i387_fxsave_struct fx_scratch __cpuinitdata; @@ -101,69 +102,73 @@ void __cpuinit fpu_init(void) mxcsr_feature_mask_init(); /* clean state in init */ - if (cpu_has_xsave) - current_thread_info()->status = TS_XSAVE; - else - current_thread_info()->status = 0; + current_thread_info()->status = 0; clear_used_math(); } #endif /* CONFIG_X86_64 */ -/* - * The _current_ task is using the FPU for the first time - * so initialize it and set the mxcsr to its default - * value at reset if we support XMM instructions and then - * remeber the current task has used the FPU. - */ -int init_fpu(struct task_struct *tsk) +void fpu_finit(struct fpu *fpu) { - if (tsk_used_math(tsk)) { - if (HAVE_HWFP && tsk == current) - unlazy_fpu(tsk); - return 0; - } - - /* - * Memory allocation at the first usage of the FPU and other state. - */ - if (!tsk->thread.xstate) { - tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep, - GFP_KERNEL); - if (!tsk->thread.xstate) - return -ENOMEM; - } - #ifdef CONFIG_X86_32 if (!HAVE_HWFP) { - memset(tsk->thread.xstate, 0, xstate_size); - finit_task(tsk); - set_stopped_child_used_math(tsk); - return 0; + finit_soft_fpu(&fpu->state->soft); + return; } #endif if (cpu_has_fxsr) { - struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; + struct i387_fxsave_struct *fx = &fpu->state->fxsave; memset(fx, 0, xstate_size); fx->cwd = 0x37f; if (cpu_has_xmm) fx->mxcsr = MXCSR_DEFAULT; } else { - struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; + struct i387_fsave_struct *fp = &fpu->state->fsave; memset(fp, 0, xstate_size); fp->cwd = 0xffff037fu; fp->swd = 0xffff0000u; fp->twd = 0xffffffffu; fp->fos = 0xffff0000u; } +} +EXPORT_SYMBOL_GPL(fpu_finit); + +/* + * The _current_ task is using the FPU for the first time + * so initialize it and set the mxcsr to its default + * value at reset if we support XMM instructions and then + * remeber the current task has used the FPU. + */ +int init_fpu(struct task_struct *tsk) +{ + int ret; + + if (tsk_used_math(tsk)) { + if (HAVE_HWFP && tsk == current) + unlazy_fpu(tsk); + return 0; + } + /* - * Only the device not available exception or ptrace can call init_fpu. + * Memory allocation at the first usage of the FPU and other state. */ + ret = fpu_alloc((struct fpu *)&tsk->thread.xstate); + if (ret) + return ret; + + fpu_finit((struct fpu *)&tsk->thread.xstate); + set_stopped_child_used_math(tsk); return 0; } +EXPORT_SYMBOL(init_fpu); +/* + * The xstateregs_active() routine is the same as the fpregs_active() routine, + * as the "regset->n" for the xstate regset will be updated based on the feature + * capabilites supported by the xsave. + */ int fpregs_active(struct task_struct *target, const struct user_regset *regset) { return tsk_used_math(target) ? regset->n : 0; @@ -187,6 +192,8 @@ int xfpregs_get(struct task_struct *targ if (ret) return ret; + sanitize_i387_state(target); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &target->thread.xstate->fxsave, 0, -1); } @@ -204,7 +211,7 @@ int xfpregs_set(struct task_struct *targ if (ret) return ret; - set_stopped_child_used_math(target); + sanitize_i387_state(target); ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &target->thread.xstate->fxsave, 0, -1); @@ -224,6 +231,68 @@ int xfpregs_set(struct task_struct *targ return ret; } +int xstateregs_get(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + int ret; + + if (!cpu_has_xsave) + return -ENODEV; + + ret = init_fpu(target); + if (ret) + return ret; + + /* + * Copy the 48bytes defined by the software first into the xstate + * memory layout in the thread struct, so that we can copy the entire + * xstateregs to the user using one user_regset_copyout(). + */ + memcpy(&target->thread.xstate->fxsave.sw_reserved, + xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); + + /* + * Copy the xstate memory layout. + */ + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.xstate->xsave, 0, -1); + return ret; +} + +int xstateregs_set(struct task_struct *target, const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int ret; + struct xsave_hdr_struct *xsave_hdr; + + if (!cpu_has_xsave) + return -ENODEV; + + ret = init_fpu(target); + if (ret) + return ret; + + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &target->thread.xstate->xsave, 0, -1); + + /* + * mxcsr reserved bits must be masked to zero for security reasons. + */ + target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; + + xsave_hdr = &target->thread.xstate->xsave.xsave_hdr; + + xsave_hdr->xstate_bv &= pcntxt_mask; + /* + * These bits must be zero. + */ + xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; + + return ret; +} + #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION /* @@ -383,6 +452,8 @@ int fpregs_get(struct task_struct *targe -1); } + sanitize_i387_state(target); + if (kbuf && pos == 0 && count == sizeof(env)) { convert_from_fxsr(kbuf, target); return 0; @@ -404,7 +475,7 @@ int fpregs_set(struct task_struct *targe if (ret) return ret; - set_stopped_child_used_math(target); + sanitize_i387_state(target); if (!HAVE_HWFP) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); @@ -472,6 +543,9 @@ static int save_i387_xsave(void __user * struct _fpstate_ia32 __user *fx = buf; int err = 0; + + sanitize_i387_state(tsk); + /* * For legacy compatible, we always set FP/SSE bits in the bit * vector while saving the state to the user context. diff -uprN linux-2.6.32/arch/x86/kernel/i8259.c linux-2.6.32.ovz/arch/x86/kernel/i8259.c --- linux-2.6.32/arch/x86/kernel/i8259.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/i8259.c 2015-03-10 13:24:12.000000000 -0700 @@ -34,6 +34,12 @@ static int i8259A_auto_eoi; DEFINE_SPINLOCK(i8259A_lock); static void mask_and_ack_8259A(unsigned int); +static void mask_8259A(void); +static void unmask_8259A(void); +static void disable_8259A_irq(unsigned int irq); +static void enable_8259A_irq(unsigned int irq); +static void init_8259A(int auto_eoi); +static int i8259A_irq_pending(unsigned int irq); struct irq_chip i8259A_chip = { .name = "XT-PIC", @@ -63,7 +69,7 @@ unsigned int cached_irq_mask = 0xffff; */ unsigned long io_apic_irqs; -void disable_8259A_irq(unsigned int irq) +static void disable_8259A_irq(unsigned int irq) { unsigned int mask = 1 << irq; unsigned long flags; @@ -77,7 +83,7 @@ void disable_8259A_irq(unsigned int irq) spin_unlock_irqrestore(&i8259A_lock, flags); } -void enable_8259A_irq(unsigned int irq) +static void enable_8259A_irq(unsigned int irq) { unsigned int mask = ~(1 << irq); unsigned long flags; @@ -91,7 +97,7 @@ void enable_8259A_irq(unsigned int irq) spin_unlock_irqrestore(&i8259A_lock, flags); } -int i8259A_irq_pending(unsigned int irq) +static int i8259A_irq_pending(unsigned int irq) { unsigned int mask = 1<affinity; - if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - printk("Breaking affinity for irq %i\n", irq); - affinity = cpu_all_mask; - } - if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, affinity); - else if (desc->action) - printk_once("Cannot set affinity for irq %i\n", irq); - } - -#if 0 - barrier(); - /* Ingo Molnar says: "after the IO-APIC masks have been redirected - [note the nop - the interrupt-enable boundary on x86 is two - instructions from sti] - to flush out pending hardirqs and - IPIs. After this point nothing is supposed to reach this CPU." */ - __asm__ __volatile__("sti; nop; cli"); - barrier(); -#else - /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); -#endif -} -#endif - diff -uprN linux-2.6.32/arch/x86/kernel/irq_64.c linux-2.6.32.ovz/arch/x86/kernel/irq_64.c --- linux-2.6.32/arch/x86/kernel/irq_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/irq_64.c 2015-03-10 13:21:11.000000000 -0700 @@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_ return true; } -#ifdef CONFIG_HOTPLUG_CPU -/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ -void fixup_irqs(void) -{ - unsigned int irq; - static int warned; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - int break_affinity = 0; - int set_affinity = 1; - const struct cpumask *affinity; - - if (!desc) - continue; - if (irq == 2) - continue; - - /* interrupt's are disabled at this point */ - spin_lock(&desc->lock); - - affinity = desc->affinity; - if (!irq_has_action(irq) || - cpumask_equal(affinity, cpu_online_mask)) { - spin_unlock(&desc->lock); - continue; - } - - if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { - break_affinity = 1; - affinity = cpu_all_mask; - } - - if (desc->chip->mask) - desc->chip->mask(irq); - - if (desc->chip->set_affinity) - desc->chip->set_affinity(irq, affinity); - else if (!(warned++)) - set_affinity = 0; - - if (desc->chip->unmask) - desc->chip->unmask(irq); - - spin_unlock(&desc->lock); - - if (break_affinity && set_affinity) - printk("Broke affinity for irq %i\n", irq); - else if (!set_affinity) - printk("Cannot set affinity for irq %i\n", irq); - } - - /* That doesn't seem sufficient. Give it 1ms. */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); -} -#endif extern void call_softirq(void); diff -uprN linux-2.6.32/arch/x86/kernel/irq.c linux-2.6.32.ovz/arch/x86/kernel/irq.c --- linux-2.6.32/arch/x86/kernel/irq.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/irq.c 2015-06-23 04:16:16.000000000 -0700 @@ -15,10 +15,13 @@ #include #include +#define CREATE_TRACE_POINTS +#include + atomic_t irq_err_count; /* Function pointer for generic interrupt vector handling */ -void (*generic_interrupt_extension)(void) = NULL; +void (*x86_platform_ipi_callback)(void) = NULL; /* * 'what should we do if we get a hw irq event on an illegal vector'. @@ -67,15 +70,15 @@ static int show_other_interrupts(struct for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance monitoring interrupts\n"); - seq_printf(p, "%*s: ", prec, "PND"); + seq_printf(p, "%*s: ", prec, "IWI"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); - seq_printf(p, " Performance pending work\n"); + seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); + seq_printf(p, " IRQ work interrupts\n"); #endif - if (generic_interrupt_extension) { + if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); + seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis); seq_printf(p, " Platform interrupts\n"); } #ifdef CONFIG_SMP @@ -185,10 +188,10 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; - sum += irq_stats(cpu)->apic_pending_irqs; + sum += irq_stats(cpu)->apic_irq_work_irqs; #endif - if (generic_interrupt_extension) - sum += irq_stats(cpu)->generic_irqs; + if (x86_platform_ipi_callback) + sum += irq_stats(cpu)->x86_platform_ipis; #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; sum += irq_stats(cpu)->irq_call_count; @@ -251,26 +254,232 @@ unsigned int __irq_entry do_IRQ(struct p } /* - * Handler for GENERIC_INTERRUPT_VECTOR. + * Handler for X86_PLATFORM_IPI_VECTOR. */ -void smp_generic_interrupt(struct pt_regs *regs) +void __smp_x86_platform_ipi(void) +{ + inc_irq_stat(x86_platform_ipis); + + if (x86_platform_ipi_callback) + x86_platform_ipi_callback(); +} + +void smp_x86_platform_ipi(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); - ack_APIC_irq(); + entering_ack_irq(); + __smp_x86_platform_ipi(); + exiting_irq(); + set_irq_regs(old_regs); +} - exit_idle(); +void smp_trace_x86_platform_ipi(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); - irq_enter(); + entering_ack_irq(); + trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); + __smp_x86_platform_ipi(); + trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); + exiting_irq(); + set_irq_regs(old_regs); +} - inc_irq_stat(generic_irqs); +/* + * Handler for MONITOR_IPI_VECTOR + */ +void smp_monitor_ipi(struct pt_regs *regs) +{ + ack_APIC_irq(); +} - if (generic_interrupt_extension) - generic_interrupt_extension(); +void smp_trace_monitor_ipi(struct pt_regs *regs) +{ + trace_monitor_ipi_entry(MONITOR_IPI_VECTOR); + ack_APIC_irq(); + trace_monitor_ipi_exit(MONITOR_IPI_VECTOR); +} - irq_exit(); +/* + * Handler for MONITOR_POSTED_INTERRUPT_VECTOR + */ +void smp_monitor_posted_interrupt(struct pt_regs *regs) +{ + ack_APIC_irq(); +} - set_irq_regs(old_regs); +void smp_trace_monitor_posted_interrupt(struct pt_regs *regs) +{ + trace_monitor_posted_interrupt_entry(MONITOR_POSTED_INTERRUPT_VECTOR); + ack_APIC_irq(); + trace_monitor_posted_interrupt_exit(MONITOR_POSTED_INTERRUPT_VECTOR); } EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); + +#ifdef CONFIG_HOTPLUG_CPU + +/* These two declarations are only used in check_irq_vectors_for_cpu_disable() + * below, which is protected by stop_machine(). Putting them on the stack + * results in a stack frame overflow. Dynamically allocating could result in a + * failure so declare these two cpumasks as global. + */ +static struct cpumask affinity_new, online_new; + +/* + * This cpu is going to be removed and its vectors migrated to the remaining + * online cpus. Check to see if there are enough vectors in the remaining cpus. + * This function is protected by stop_machine(). + */ +int check_irq_vectors_for_cpu_disable(void) +{ + int irq, cpu; + unsigned int this_cpu, vector, this_count, count; + struct irq_desc *desc; + + this_cpu = smp_processor_id(); + cpumask_copy(&online_new, cpu_online_mask); + cpu_clear(this_cpu, online_new); + + this_count = 0; + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + irq = __get_cpu_var(vector_irq)[vector]; + if (irq >= 0) { + desc = irq_to_desc(irq); + cpumask_copy(&affinity_new, desc->affinity); + cpu_clear(this_cpu, affinity_new); + + /* Do not count inactive or per-cpu irqs. */ + if (!irq_has_action(irq) || + vector_used_by_percpu_irq(vector)) + continue; + + /* + * A single irq may be mapped to multiple + * cpu's vector_irq[] (for example IOAPIC cluster + * mode). In this case we have two + * possibilities: + * + * 1) the resulting affinity mask is empty; that is + * this the down'd cpu is the last cpu in the irq's + * affinity mask, or + * + * 2) the resulting affinity mask is no longer + * a subset of the online cpus but the affinity + * mask is not zero; that is the down'd cpu is the + * last online cpu in a user set affinity mask. + */ + if (cpumask_empty(&affinity_new) || + !cpumask_subset(&affinity_new, &online_new)) + this_count++; + } + } + + count = 0; + for_each_online_cpu(cpu) { + if (cpu == this_cpu) + continue; + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; + vector++) { + if (__get_cpu_var(vector_irq)[vector] < 0) + count++; + } + } + + if (count < this_count) { + pr_warn("CPU %d disable failed: CPU has %u vectors assigned and there are only %u available.\n", + this_cpu, this_count, count); + return -ERANGE; + } + return 0; +} + +/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */ +void fixup_irqs(void) +{ + unsigned int irq, vector; + static int warned; + struct irq_desc *desc; + + for_each_irq_desc(irq, desc) { + int break_affinity = 0; + int set_affinity = 1; + const struct cpumask *affinity; + + if (!desc) + continue; + if (irq == 2) + continue; + + /* interrupt's are disabled at this point */ + spin_lock(&desc->lock); + + affinity = desc->affinity; + if (!irq_has_action(irq) || (desc->status & IRQ_PER_CPU) || + cpumask_equal(affinity, cpu_online_mask)) { + spin_unlock(&desc->lock); + continue; + } + + /* + * Complete the irq move. This cpu is going down and for + * non intr-remapping case, we can't wait till this interrupt + * arrives at this cpu before completing the irq move. + */ + irq_force_complete_move(irq); + + if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { + break_affinity = 1; + affinity = cpu_all_mask; + } + + if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask) + desc->chip->mask(irq); + + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, affinity); + else if (!(warned++)) + set_affinity = 0; + + if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask) + desc->chip->unmask(irq); + + spin_unlock(&desc->lock); + + if (break_affinity && set_affinity) + printk("Broke affinity for irq %i\n", irq); + else if (!set_affinity) + printk("Cannot set affinity for irq %i\n", irq); + } + + /* + * We can remove mdelay() and then send spuriuous interrupts to + * new cpu targets for all the irqs that were handled previously by + * this cpu. While it works, I have seen spurious interrupt messages + * (nothing wrong but still...). + * + * So for now, retain mdelay(1) and check the IRR and then send those + * interrupts to new targets as this cpu is already offlined... + */ + mdelay(1); + + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + unsigned int irr; + + if (__get_cpu_var(vector_irq)[vector] < 0) + continue; + + irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); + if (irr & (1 << (vector % 32))) { + irq = __get_cpu_var(vector_irq)[vector]; + + desc = irq_to_desc(irq); + spin_lock(&desc->lock); + if (desc->chip->retrigger) + desc->chip->retrigger(irq); + spin_unlock(&desc->lock); + } + } +} +#endif diff -uprN linux-2.6.32/arch/x86/kernel/irqinit.c linux-2.6.32.ovz/arch/x86/kernel/irqinit.c --- linux-2.6.32/arch/x86/kernel/irqinit.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/irqinit.c 2015-06-23 04:16:16.000000000 -0700 @@ -118,26 +118,17 @@ int vector_used_by_percpu_irq(unsigned i void __init init_ISA_irqs(void) { + struct irq_chip *chip = legacy_pic->chip; + const char *name = chip->name; int i; #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) init_bsp_APIC(); #endif - init_8259A(0); + legacy_pic->init(0); - /* - * 16 old-style INTA-cycle interrupts: - */ - for (i = 0; i < NR_IRQS_LEGACY; i++) { - struct irq_desc *desc = irq_to_desc(i); - - desc->status = IRQ_DISABLED; - desc->action = NULL; - desc->depth = 1; - - set_irq_chip_and_handler_name(i, &i8259A_chip, - handle_level_irq, "XT"); - } + for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) + set_irq_chip_and_handler_name(i, chip, handle_level_irq, name); } void __init init_IRQ(void) @@ -200,19 +191,21 @@ static void __init apic_intr_init(void) /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - /* generic IPI for platform specific use */ - alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); + /* IPI for X86 platform specific use */ + alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi); /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - /* Performance monitoring interrupts: */ -# ifdef CONFIG_PERF_EVENTS - alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); + /* IRQ work interrupts: */ +# ifdef CONFIG_IRQ_WORK + alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt); # endif #endif + alloc_intr_gate(MONITOR_IPI_VECTOR, monitor_ipi); + alloc_intr_gate(MONITOR_POSTED_INTERRUPT_VECTOR, monitor_posted_interrupt); } void __init native_init_IRQ(void) diff -uprN linux-2.6.32/arch/x86/kernel/irq_work.c linux-2.6.32.ovz/arch/x86/kernel/irq_work.c --- linux-2.6.32/arch/x86/kernel/irq_work.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/irq_work.c 2015-03-10 13:24:02.000000000 -0700 @@ -0,0 +1,50 @@ +/* + * x86 specific code for irq_work + * + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra + */ + +#include +#include +#include +#include +#include + +static inline void irq_work_entering_irq(void) +{ + irq_enter(); + ack_APIC_irq(); +} + +static inline void __smp_irq_work_interrupt(void) +{ + inc_irq_stat(apic_irq_work_irqs); + irq_work_run(); +} + +void smp_irq_work_interrupt(struct pt_regs *regs) +{ + irq_work_entering_irq(); + __smp_irq_work_interrupt(); + exiting_irq(); +} + +void smp_trace_irq_work_interrupt(struct pt_regs *regs) +{ + irq_work_entering_irq(); + trace_irq_work_entry(IRQ_WORK_VECTOR); + __smp_irq_work_interrupt(); + trace_irq_work_exit(IRQ_WORK_VECTOR); + exiting_irq(); +} + +void arch_irq_work_raise(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + if (!cpu_has_apic) + return; + + apic->send_IPI_self(IRQ_WORK_VECTOR); + apic_wait_icr_idle(); +#endif +} diff -uprN linux-2.6.32/arch/x86/kernel/k8.c linux-2.6.32.ovz/arch/x86/kernel/k8.c --- linux-2.6.32/arch/x86/kernel/k8.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/k8.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,123 +0,0 @@ -/* - * Shared support code for AMD K8 northbridges and derivates. - * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. - */ -#include -#include -#include -#include -#include -#include -#include - -int num_k8_northbridges; -EXPORT_SYMBOL(num_k8_northbridges); - -static u32 *flush_words; - -struct pci_device_id k8_nb_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, - {} -}; -EXPORT_SYMBOL(k8_nb_ids); - -struct pci_dev **k8_northbridges; -EXPORT_SYMBOL(k8_northbridges); - -static struct pci_dev *next_k8_northbridge(struct pci_dev *dev) -{ - do { - dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); - if (!dev) - break; - } while (!pci_match_id(&k8_nb_ids[0], dev)); - return dev; -} - -int cache_k8_northbridges(void) -{ - int i; - struct pci_dev *dev; - - if (num_k8_northbridges) - return 0; - - dev = NULL; - while ((dev = next_k8_northbridge(dev)) != NULL) - num_k8_northbridges++; - - k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *), - GFP_KERNEL); - if (!k8_northbridges) - return -ENOMEM; - - if (!num_k8_northbridges) { - k8_northbridges[0] = NULL; - return 0; - } - - flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL); - if (!flush_words) { - kfree(k8_northbridges); - return -ENOMEM; - } - - dev = NULL; - i = 0; - while ((dev = next_k8_northbridge(dev)) != NULL) { - k8_northbridges[i] = dev; - pci_read_config_dword(dev, 0x9c, &flush_words[i++]); - } - k8_northbridges[i] = NULL; - return 0; -} -EXPORT_SYMBOL_GPL(cache_k8_northbridges); - -/* Ignores subdevice/subvendor but as far as I can figure out - they're useless anyways */ -int __init early_is_k8_nb(u32 device) -{ - struct pci_device_id *id; - u32 vendor = device & 0xffff; - device >>= 16; - for (id = k8_nb_ids; id->vendor; id++) - if (vendor == id->vendor && device == id->device) - return 1; - return 0; -} - -void k8_flush_garts(void) -{ - int flushed, i; - unsigned long flags; - static DEFINE_SPINLOCK(gart_lock); - - /* Avoid races between AGP and IOMMU. In theory it's not needed - but I'm not sure if the hardware won't lose flush requests - when another is pending. This whole thing is so expensive anyways - that it doesn't matter to serialize more. -AK */ - spin_lock_irqsave(&gart_lock, flags); - flushed = 0; - for (i = 0; i < num_k8_northbridges; i++) { - pci_write_config_dword(k8_northbridges[i], 0x9c, - flush_words[i]|1); - flushed++; - } - for (i = 0; i < num_k8_northbridges; i++) { - u32 w; - /* Make sure the hardware actually executed the flush*/ - for (;;) { - pci_read_config_dword(k8_northbridges[i], - 0x9c, &w); - if (!(w & 1)) - break; - cpu_relax(); - } - } - spin_unlock_irqrestore(&gart_lock, flags); - if (!flushed) - printk("nothing to flush?\n"); -} -EXPORT_SYMBOL_GPL(k8_flush_garts); - diff -uprN linux-2.6.32/arch/x86/kernel/kgdb.c linux-2.6.32.ovz/arch/x86/kernel/kgdb.c --- linux-2.6.32/arch/x86/kernel/kgdb.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/kgdb.c 2015-03-10 13:22:31.000000000 -0700 @@ -47,6 +47,7 @@ #include #include +#include /* * Put the error code here just in case the user cares: @@ -519,7 +520,7 @@ static struct notifier_block kgdb_notifi /* * Lowest-prio notifier priority, we want to be notified last: */ - .priority = -INT_MAX, + .priority = NMI_LOCAL_LOW_PRIOR, }; /** diff -uprN linux-2.6.32/arch/x86/kernel/kprobes.c linux-2.6.32.ovz/arch/x86/kernel/kprobes.c --- linux-2.6.32/arch/x86/kernel/kprobes.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/kprobes.c 2015-03-10 13:23:16.000000000 -0700 @@ -48,31 +48,22 @@ #include #include #include +#include +#include #include #include #include #include #include +#include void jprobe_return_end(void); DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -#ifdef CONFIG_X86_64 -#define stack_addr(regs) ((unsigned long *)regs->sp) -#else -/* - * "®s->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs - * don't save the ss and esp registers if the CPU is already in kernel - * mode when it traps. So for kprobes, regs->sp and regs->ss are not - * the [nonexistent] saved stack pointer and ss register, but rather - * the top 8 bytes of the pre-int3 stack. So ®s->sp happens to - * point to the top of the pre-int3 stack. - */ -#define stack_addr(regs) ((unsigned long *)®s->sp) -#endif +#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs)) #define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ @@ -106,50 +97,6 @@ static const u32 twobyte_is_boostable[25 /* ----------------------------------------------- */ /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ }; -static const u32 onebyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */ - W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */ - W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */ - W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */ - W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */ - W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */ - W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */ - W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */ - W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */ - W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */ - W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */ - W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */ - W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */ - W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */ - W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */ - W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */ - /* ----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; -static const u32 twobyte_has_modrm[256 / 32] = { - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ - /* ----------------------------------------------- */ - W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */ - W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */ - W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */ - W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */ - W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */ - W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */ - W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */ - W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */ - W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */ - W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */ - W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */ - W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */ - W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */ - W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */ - W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */ - W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */ - /* ----------------------------------------------- */ - /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ -}; #undef W struct kretprobe_blackpoint kretprobe_blacklist[] = { @@ -159,29 +106,41 @@ struct kretprobe_blackpoint kretprobe_bl }; const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); -/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ -static void __kprobes set_jmp_op(void *from, void *to) +static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op) { - struct __arch_jmp_op { - char op; + struct __arch_relative_insn { + u8 op; s32 raddr; - } __attribute__((packed)) * jop; - jop = (struct __arch_jmp_op *)from; - jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); - jop->op = RELATIVEJUMP_INSTRUCTION; + } __attribute__((packed)) *insn; + + insn = (struct __arch_relative_insn *)from; + insn->raddr = (s32)((long)(to) - ((long)(from) + 5)); + insn->op = op; +} + +/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ +static void __kprobes synthesize_reljump(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE); } /* - * Check for the REX prefix which can only exist on X86_64 - * X86_32 always returns 0 + * Skip the prefixes of the instruction. */ -static int __kprobes is_REX_prefix(kprobe_opcode_t *insn) +static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn) { + insn_attr_t attr; + + attr = inat_get_opcode_attribute((insn_byte_t)*insn); + while (inat_is_legacy_prefix(attr)) { + insn++; + attr = inat_get_opcode_attribute((insn_byte_t)*insn); + } #ifdef CONFIG_X86_64 - if ((*insn & 0xf0) == 0x40) - return 1; + if (inat_is_rex_prefix(attr)) + insn++; #endif - return 0; + return insn; } /* @@ -244,11 +203,142 @@ retry: } } +static unsigned long __recover_probed_insn(kprobe_opcode_t *buf, + unsigned long addr) +{ + struct kprobe *kp; + + kp = get_kprobe((void *)addr); + /* There is no probe, return original address */ + if (!kp) + return addr; + + /* + * Basically, kp->ainsn.insn has an original instruction. + * However, RIP-relative instruction can not do single-stepping + * at different place, __copy_instruction() tweaks the displacement of + * that instruction. In that case, we can't recover the instruction + * from the kp->ainsn.insn. + * + * On the other hand, kp->opcode has a copy of the first byte of + * the probed instruction, which is overwritten by int3. And + * the instruction at kp->addr is not modified by kprobes except + * for the first byte, we can recover the original instruction + * from it and kp->opcode. + */ + memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + buf[0] = kp->opcode; + return (unsigned long)buf; +} + +#ifdef CONFIG_OPTPROBES +static unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, + unsigned long addr) +{ + struct optimized_kprobe *op; + struct kprobe *kp; + long offs; + int i; + + for (i = 0; i < RELATIVEJUMP_SIZE; i++) { + kp = get_kprobe((void *)addr - i); + /* This function only handles jump-optimized kprobe */ + if (kp && kprobe_optimized(kp)) { + op = container_of(kp, struct optimized_kprobe, kp); + /* If op->list is not empty, op is under optimizing */ + if (list_empty(&op->list)) + goto found; + } + } + + return addr; +found: + /* + * If the kprobe can be optimized, original bytes which can be + * overwritten by jump destination address. In this case, original + * bytes must be recovered from op->optinsn.copied_insn buffer. + */ + memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); + if (addr == (unsigned long)kp->addr) { + buf[0] = kp->opcode; + memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + } else { + offs = addr - (unsigned long)kp->addr - 1; + memcpy(buf, op->optinsn.copied_insn + offs, RELATIVE_ADDR_SIZE - offs); + } + + return (unsigned long)buf; +} +#else +static inline unsigned long __recover_optprobed_insn(kprobe_opcode_t *buf, + unsigned long addr) +{ + return addr; +} +#endif + +/* + * Recover the probed instruction at addr for further analysis. + * Caller must lock kprobes by kprobe_mutex, or disable preemption + * for preventing to release referencing kprobes. + */ +static unsigned long recover_probed_instruction(kprobe_opcode_t *buf, + unsigned long addr) +{ + unsigned long __addr; + + __addr = __recover_optprobed_insn(buf, addr); + if (__addr != addr) + return __addr; + + return __recover_probed_insn(buf, addr); +} + +/* Check if paddr is at an instruction boundary */ +static int __kprobes can_probe(unsigned long paddr) +{ + unsigned long addr, __addr, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + if (!kallsyms_lookup_size_offset(paddr, NULL, &offset)) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr) { + /* + * Check if the instruction has been modified by another + * kprobe, in which case we replace the breakpoint by the + * original instruction in our buffer. + * Also, jump optimization will change the breakpoint to + * relative-jump. Since the relative-jump itself is + * normally used, we just go through if there is no kprobe. + */ + __addr = recover_probed_instruction(buf, addr); + kernel_insn_init(&insn, (void *)__addr); + insn_get_length(&insn); + + /* + * Another debugging subsystem might insert this breakpoint. + * In that case, we can't recover it. + */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + return 0; + addr += insn.length; + } + + return (addr == paddr); +} + /* * Returns non-zero if opcode modifies the interrupt flag. */ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) { + /* Skip prefixes */ + insn = skip_prefixes(insn); + switch (*insn) { case 0xfa: /* cli */ case 0xfb: /* sti */ @@ -257,108 +347,80 @@ static int __kprobes is_IF_modifier(kpro return 1; } - /* - * on X86_64, 0x40-0x4f are REX prefixes so we need to look - * at the next byte instead.. but of course not recurse infinitely - */ - if (is_REX_prefix(insn)) - return is_IF_modifier(++insn); - return 0; } /* - * Adjust the displacement if the instruction uses the %rip-relative - * addressing mode. + * Copy an instruction and adjust the displacement if the instruction + * uses the %rip-relative addressing mode. * If it does, Return the address of the 32-bit displacement word. * If not, return null. * Only applicable to 64-bit x86. */ -static void __kprobes fix_riprel(struct kprobe *p) +static int __kprobes __copy_instruction(u8 *dest, u8 *src) { -#ifdef CONFIG_X86_64 - u8 *insn = p->ainsn.insn; - s64 disp; - int need_modrm; - - /* Skip legacy instruction prefixes. */ - while (1) { - switch (*insn) { - case 0x66: - case 0x67: - case 0x2e: - case 0x3e: - case 0x26: - case 0x64: - case 0x65: - case 0x36: - case 0xf0: - case 0xf3: - case 0xf2: - ++insn; - continue; - } - break; - } + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; - /* Skip REX instruction prefix. */ - if (is_REX_prefix(insn)) - ++insn; - - if (*insn == 0x0f) { - /* Two-byte opcode. */ - ++insn; - need_modrm = test_bit(*insn, - (unsigned long *)twobyte_has_modrm); - } else - /* One-byte opcode. */ - need_modrm = test_bit(*insn, - (unsigned long *)onebyte_has_modrm); - - if (need_modrm) { - u8 modrm = *++insn; - if ((modrm & 0xc7) == 0x05) { - /* %rip+disp32 addressing mode */ - /* Displacement follows ModRM byte. */ - ++insn; - /* - * The copied instruction uses the %rip-relative - * addressing mode. Adjust the displacement for the - * difference between the original location of this - * instruction and the location of the copy that will - * actually be run. The tricky bit here is making sure - * that the sign extension happens correctly in this - * calculation, since we need a signed 32-bit result to - * be sign-extended to 64 bits when it's added to the - * %rip value and yield the same 64-bit result that the - * sign-extension of the original signed 32-bit - * displacement would have given. - */ - disp = (u8 *) p->addr + *((s32 *) insn) - - (u8 *) p->ainsn.insn; - BUG_ON((s64) (s32) disp != disp); /* Sanity check. */ - *(s32 *)insn = (s32) disp; - } + kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, (unsigned long)src)); + insn_get_length(&insn); + /* Another subsystem puts a breakpoint, failed to recover */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + return 0; + memcpy(dest, insn.kaddr, insn.length); + +#ifdef CONFIG_X86_64 + if (insn_rip_relative(&insn)) { + s64 newdisp; + u8 *disp; + kernel_insn_init(&insn, dest); + insn_get_displacement(&insn); + /* + * The copied instruction uses the %rip-relative addressing + * mode. Adjust the displacement for the difference between + * the original location of this instruction and the location + * of the copy that will actually be run. The tricky bit here + * is making sure that the sign extension happens correctly in + * this calculation, since we need a signed 32-bit result to + * be sign-extended to 64 bits when it's added to the %rip + * value and yield the same 64-bit result that the sign- + * extension of the original signed 32-bit displacement would + * have given. + */ + newdisp = (u8 *) src + (s64) insn.displacement.value - (u8 *) dest; + BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */ + disp = (u8 *) dest + insn_offset_displacement(&insn); + *(s32 *) disp = (s32) newdisp; } #endif + return insn.length; } static void __kprobes arch_copy_kprobe(struct kprobe *p) { - memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); - - fix_riprel(p); + /* Copy an instruction with recovering if other optprobe modifies it.*/ + __copy_instruction(p->ainsn.insn, p->addr); - if (can_boost(p->addr)) + /* + * __copy_instruction can modify the displacement of the instruction, + * but it doesn't affect boostable check. + */ + if (can_boost(p->ainsn.insn)) p->ainsn.boostable = 0; else p->ainsn.boostable = -1; - p->opcode = *p->addr; + /* Also, displacement change doesn't affect the first byte */ + p->opcode = p->ainsn.insn[0]; } int __kprobes arch_prepare_kprobe(struct kprobe *p) { + if (alternatives_text_reserved(p->addr, p->addr)) + return -EINVAL; + + if (!can_probe((unsigned long)p->addr)) + return -EILSEQ; /* insn: must be on special executable page on x86. */ p->ainsn.insn = get_insn_slot(); if (!p->ainsn.insn) @@ -413,26 +475,22 @@ static void __kprobes set_current_kprobe static void __kprobes clear_btf(void) { - if (test_thread_flag(TIF_DEBUGCTLMSR)) - update_debugctlmsr(0); + if (test_thread_flag(TIF_BLOCKSTEP)) { + unsigned long debugctl = get_debugctlmsr(); + + debugctl &= ~DEBUGCTLMSR_BTF; + update_debugctlmsr(debugctl); + } } static void __kprobes restore_btf(void) { - if (test_thread_flag(TIF_DEBUGCTLMSR)) - update_debugctlmsr(current->thread.debugctlmsr); -} + if (test_thread_flag(TIF_BLOCKSTEP)) { + unsigned long debugctl = get_debugctlmsr(); -static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) -{ - clear_btf(); - regs->flags |= X86_EFLAGS_TF; - regs->flags &= ~X86_EFLAGS_IF; - /* single step inline if the instruction is an int3 */ - if (p->opcode == BREAKPOINT_INSTRUCTION) - regs->ip = (unsigned long)p->addr; - else - regs->ip = (unsigned long)p->ainsn.insn; + debugctl |= DEBUGCTLMSR_BTF; + update_debugctlmsr(debugctl); + } } void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, @@ -446,20 +504,50 @@ void __kprobes arch_prepare_kretprobe(st *sara = (unsigned long) &kretprobe_trampoline; } +#ifdef CONFIG_OPTPROBES +static int __kprobes setup_detour_execution(struct kprobe *p, + struct pt_regs *regs, + int reenter); +#else +#define setup_detour_execution(p, regs, reenter) (0) +#endif + static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, - struct kprobe_ctlblk *kcb) + struct kprobe_ctlblk *kcb, int reenter) { -#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER) + if (setup_detour_execution(p, regs, reenter)) + return; + +#if !defined(CONFIG_PREEMPT) if (p->ainsn.boostable == 1 && !p->post_handler) { /* Boost up -- we can execute copied instructions directly */ - reset_current_kprobe(); + if (!reenter) + reset_current_kprobe(); + /* + * Reentering boosted probe doesn't reset current_kprobe, + * nor set current_kprobe, because it doesn't use single + * stepping. + */ regs->ip = (unsigned long)p->ainsn.insn; preempt_enable_no_resched(); return; } #endif - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_HIT_SS; + if (reenter) { + save_previous_kprobe(kcb); + set_current_kprobe(p, regs, kcb); + kcb->kprobe_status = KPROBE_REENTER; + } else + kcb->kprobe_status = KPROBE_HIT_SS; + /* Prepare real single stepping */ + clear_btf(); + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + /* single step inline if the instruction is an int3 */ + if (p->opcode == BREAKPOINT_INSTRUCTION) + regs->ip = (unsigned long)p->addr; + else + regs->ip = (unsigned long)p->ainsn.insn; } /* @@ -472,37 +560,21 @@ static int __kprobes reenter_kprobe(stru { switch (kcb->kprobe_status) { case KPROBE_HIT_SSDONE: -#ifdef CONFIG_X86_64 - /* TODO: Provide re-entrancy from post_kprobes_handler() and - * avoid exception stack corruption while single-stepping on - * the instruction of the new probe. - */ - arch_disarm_kprobe(p); - regs->ip = (unsigned long)p->addr; - reset_current_kprobe(); - preempt_enable_no_resched(); - break; -#endif case KPROBE_HIT_ACTIVE: - save_previous_kprobe(kcb); - set_current_kprobe(p, regs, kcb); kprobes_inc_nmissed_count(p); - prepare_singlestep(p, regs); - kcb->kprobe_status = KPROBE_REENTER; + setup_singlestep(p, regs, kcb, 1); break; case KPROBE_HIT_SS: - if (p == kprobe_running()) { - regs->flags &= ~X86_EFLAGS_TF; - regs->flags |= kcb->kprobe_saved_flags; - return 0; - } else { - /* A probe has been hit in the codepath leading up - * to, or just after, single-stepping of a probed - * instruction. This entire codepath should strictly - * reside in .kprobes.text section. Raise a warning - * to highlight this peculiar case. - */ - } + /* A probe has been hit in the codepath leading up to, or just + * after, single-stepping of a probed instruction. This entire + * codepath should strictly reside in .kprobes.text section. + * Raise a BUG or we'll continue in an endless reentering loop + * and eventually a stack overflow. + */ + printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n", + p->addr); + dump_kprobe(p); + BUG(); default: /* impossible cases */ WARN_ON(1); @@ -523,20 +595,6 @@ static int __kprobes kprobe_handler(stru struct kprobe_ctlblk *kcb; addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); - if (*addr != BREAKPOINT_INSTRUCTION) { - /* - * The breakpoint instruction was removed right - * after we hit it. Another cpu has removed - * either a probepoint or a debugger breakpoint - * at this address. In either case, no further - * handling of this interrupt is appropriate. - * Back up over the (now missing) int3 and run - * the original instruction. - */ - regs->ip = (unsigned long)addr; - return 1; - } - /* * We don't want to be preempted for the entire * duration of kprobe processing. We conditionally @@ -565,13 +623,26 @@ static int __kprobes kprobe_handler(stru * more here. */ if (!p->pre_handler || !p->pre_handler(p, regs)) - setup_singlestep(p, regs, kcb); + setup_singlestep(p, regs, kcb, 0); return 1; } + } else if (*addr != BREAKPOINT_INSTRUCTION) { + /* + * The breakpoint instruction was removed right + * after we hit it. Another cpu has removed + * either a probepoint or a debugger breakpoint + * at this address. In either case, no further + * handling of this interrupt is appropriate. + * Back up over the (now missing) int3 and run + * the original instruction. + */ + regs->ip = (unsigned long)addr; + preempt_enable_no_resched(); + return 1; } else if (kprobe_running()) { p = __get_cpu_var(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { - setup_singlestep(p, regs, kcb); + setup_singlestep(p, regs, kcb, 0); return 1; } } /* else: not a kprobe fault; let the kernel handle it */ @@ -580,6 +651,69 @@ static int __kprobes kprobe_handler(stru return 0; } +#ifdef CONFIG_X86_64 +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax. */ \ + " subq $24, %rsp\n" \ + " pushq %rdi\n" \ + " pushq %rsi\n" \ + " pushq %rdx\n" \ + " pushq %rcx\n" \ + " pushq %rax\n" \ + " pushq %r8\n" \ + " pushq %r9\n" \ + " pushq %r10\n" \ + " pushq %r11\n" \ + " pushq %rbx\n" \ + " pushq %rbp\n" \ + " pushq %r12\n" \ + " pushq %r13\n" \ + " pushq %r14\n" \ + " pushq %r15\n" +#define RESTORE_REGS_STRING \ + " popq %r15\n" \ + " popq %r14\n" \ + " popq %r13\n" \ + " popq %r12\n" \ + " popq %rbp\n" \ + " popq %rbx\n" \ + " popq %r11\n" \ + " popq %r10\n" \ + " popq %r9\n" \ + " popq %r8\n" \ + " popq %rax\n" \ + " popq %rcx\n" \ + " popq %rdx\n" \ + " popq %rsi\n" \ + " popq %rdi\n" \ + /* Skip orig_ax, ip, cs */ \ + " addq $24, %rsp\n" +#else +#define SAVE_REGS_STRING \ + /* Skip cs, ip, orig_ax and gs. */ \ + " subl $16, %esp\n" \ + " pushl %fs\n" \ + " pushl %es\n" \ + " pushl %ds\n" \ + " pushl %eax\n" \ + " pushl %ebp\n" \ + " pushl %edi\n" \ + " pushl %esi\n" \ + " pushl %edx\n" \ + " pushl %ecx\n" \ + " pushl %ebx\n" +#define RESTORE_REGS_STRING \ + " popl %ebx\n" \ + " popl %ecx\n" \ + " popl %edx\n" \ + " popl %esi\n" \ + " popl %edi\n" \ + " popl %ebp\n" \ + " popl %eax\n" \ + /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\ + " addl $24, %esp\n" +#endif + /* * When a retprobed function returns, this code saves registers and * calls trampoline_handler() runs, which calls the kretprobe's handler. @@ -593,65 +727,16 @@ static void __used __kprobes kretprobe_t /* We don't bother saving the ss register */ " pushq %rsp\n" " pushfq\n" - /* - * Skip cs, ip, orig_ax. - * trampoline_handler() will plug in these values - */ - " subq $24, %rsp\n" - " pushq %rdi\n" - " pushq %rsi\n" - " pushq %rdx\n" - " pushq %rcx\n" - " pushq %rax\n" - " pushq %r8\n" - " pushq %r9\n" - " pushq %r10\n" - " pushq %r11\n" - " pushq %rbx\n" - " pushq %rbp\n" - " pushq %r12\n" - " pushq %r13\n" - " pushq %r14\n" - " pushq %r15\n" + SAVE_REGS_STRING " movq %rsp, %rdi\n" " call trampoline_handler\n" /* Replace saved sp with true return address. */ " movq %rax, 152(%rsp)\n" - " popq %r15\n" - " popq %r14\n" - " popq %r13\n" - " popq %r12\n" - " popq %rbp\n" - " popq %rbx\n" - " popq %r11\n" - " popq %r10\n" - " popq %r9\n" - " popq %r8\n" - " popq %rax\n" - " popq %rcx\n" - " popq %rdx\n" - " popq %rsi\n" - " popq %rdi\n" - /* Skip orig_ax, ip, cs */ - " addq $24, %rsp\n" + RESTORE_REGS_STRING " popfq\n" #else " pushf\n" - /* - * Skip cs, ip, orig_ax and gs. - * trampoline_handler() will plug in these values - */ - " subl $16, %esp\n" - " pushl %fs\n" - " pushl %es\n" - " pushl %ds\n" - " pushl %eax\n" - " pushl %ebp\n" - " pushl %edi\n" - " pushl %esi\n" - " pushl %edx\n" - " pushl %ecx\n" - " pushl %ebx\n" + SAVE_REGS_STRING " movl %esp, %eax\n" " call trampoline_handler\n" /* Move flags to cs */ @@ -659,15 +744,7 @@ static void __used __kprobes kretprobe_t " movl %edx, 52(%esp)\n" /* Replace saved flags with true return address. */ " movl %eax, 56(%esp)\n" - " popl %ebx\n" - " popl %ecx\n" - " popl %edx\n" - " popl %esi\n" - " popl %edi\n" - " popl %ebp\n" - " popl %eax\n" - /* Skip ds, es, fs, gs, orig_ax and ip */ - " addl $24, %esp\n" + RESTORE_REGS_STRING " popf\n" #endif " ret\n"); @@ -683,6 +760,7 @@ static __used __kprobes void *trampoline struct hlist_node *node, *tmp; unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; + kprobe_opcode_t *correct_ret_addr = NULL; INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); @@ -714,14 +792,34 @@ static __used __kprobes void *trampoline /* another task is sharing our hash bucket */ continue; + orig_ret_address = (unsigned long)ri->ret_addr; + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + kretprobe_assert(ri, orig_ret_address, trampoline_address); + + correct_ret_addr = ri->ret_addr; + hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + orig_ret_address = (unsigned long)ri->ret_addr; if (ri->rp && ri->rp->handler) { __get_cpu_var(current_kprobe) = &ri->rp->kp; get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; + ri->ret_addr = correct_ret_addr; ri->rp->handler(ri, regs); __get_cpu_var(current_kprobe) = NULL; } - orig_ret_address = (unsigned long)ri->ret_addr; recycle_rp_inst(ri, &empty_rp); if (orig_ret_address != trampoline_address) @@ -733,8 +831,6 @@ static __used __kprobes void *trampoline break; } - kretprobe_assert(ri, orig_ret_address, trampoline_address); - kretprobe_hash_unlock(current, &flags); hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { @@ -779,9 +875,8 @@ static void __kprobes resume_execution(s unsigned long orig_ip = (unsigned long)p->addr; kprobe_opcode_t *insn = p->ainsn.insn; - /*skip the REX prefix*/ - if (is_REX_prefix(insn)) - insn++; + /* Skip prefixes */ + insn = skip_prefixes(insn); regs->flags &= ~X86_EFLAGS_TF; switch (*insn) { @@ -835,8 +930,8 @@ static void __kprobes resume_execution(s * These instructions can be executed directly if it * jumps back to correct address. */ - set_jmp_op((void *)regs->ip, - (void *)orig_ip + (regs->ip - copy_ip)); + synthesize_reljump((void *)regs->ip, + (void *)orig_ip + (regs->ip - copy_ip)); p->ainsn.boostable = 1; } else { p->ainsn.boostable = -1; @@ -1057,6 +1152,353 @@ int __kprobes longjmp_break_handler(stru return 0; } + +#ifdef CONFIG_OPTPROBES + +/* Insert a call instruction at address 'from', which calls address 'to'.*/ +static void __kprobes synthesize_relcall(void *from, void *to) +{ + __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE); +} + +/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */ +static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr, + unsigned long val) +{ +#ifdef CONFIG_X86_64 + *addr++ = 0x48; + *addr++ = 0xbf; +#else + *addr++ = 0xb8; +#endif + *(unsigned long *)addr = val; +} + +static void __used __kprobes kprobes_optinsn_template_holder(void) +{ + asm volatile ( + ".global optprobe_template_entry\n" + "optprobe_template_entry: \n" +#ifdef CONFIG_X86_64 + /* We don't bother saving the ss register */ + " pushq %rsp\n" + " pushfq\n" + SAVE_REGS_STRING + " movq %rsp, %rsi\n" + ".global optprobe_template_val\n" + "optprobe_template_val: \n" + ASM_NOP5 + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call: \n" + ASM_NOP5 + /* Move flags to rsp */ + " movq 144(%rsp), %rdx\n" + " movq %rdx, 152(%rsp)\n" + RESTORE_REGS_STRING + /* Skip flags entry */ + " addq $8, %rsp\n" + " popfq\n" +#else /* CONFIG_X86_32 */ + " pushf\n" + SAVE_REGS_STRING + " movl %esp, %edx\n" + ".global optprobe_template_val\n" + "optprobe_template_val: \n" + ASM_NOP5 + ".global optprobe_template_call\n" + "optprobe_template_call: \n" + ASM_NOP5 + RESTORE_REGS_STRING + " addl $4, %esp\n" /* skip cs */ + " popf\n" +#endif + ".global optprobe_template_end\n" + "optprobe_template_end: \n"); +} + +#define TMPL_MOVE_IDX \ + ((long)&optprobe_template_val - (long)&optprobe_template_entry) +#define TMPL_CALL_IDX \ + ((long)&optprobe_template_call - (long)&optprobe_template_entry) +#define TMPL_END_IDX \ + ((long)&optprobe_template_end - (long)&optprobe_template_entry) + +#define INT3_SIZE sizeof(kprobe_opcode_t) + +/* Optimized kprobe call back function: called from optinsn */ +static void __kprobes optimized_callback(struct optimized_kprobe *op, + struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + unsigned long flags; + + local_irq_save(flags); + if (kprobe_running()) { + kprobes_inc_nmissed_count(&op->kp); + } else { + /* Save skipped registers */ +#ifdef CONFIG_X86_64 + regs->cs = __KERNEL_CS; +#else + regs->cs = __KERNEL_CS | get_kernel_rpl(); + regs->gs = 0; +#endif + regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; + regs->orig_ax = ~0UL; + + __get_cpu_var(current_kprobe) = &op->kp; + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + opt_pre_handler(&op->kp, regs); + __get_cpu_var(current_kprobe) = NULL; + } + local_irq_restore(flags); +} + +static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) +{ + int len = 0, ret; + + while (len < RELATIVEJUMP_SIZE) { + ret = __copy_instruction(dest + len, src + len); + if (!ret || !can_boost(dest + len)) + return -EINVAL; + len += ret; + } + /* Check whether the address range is reserved */ + if (ftrace_text_reserved(src, src + len - 1) || + alternatives_text_reserved(src, src + len - 1)) + return -EBUSY; + + return len; +} + +/* Check whether insn is indirect jump */ +static int __kprobes insn_is_indirect_jump(struct insn *insn) +{ + return ((insn->opcode.bytes[0] == 0xff && + (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ + insn->opcode.bytes[0] == 0xea); /* Segment based jump */ +} + +/* Check whether insn jumps into specified address range */ +static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) +{ + unsigned long target = 0; + + switch (insn->opcode.bytes[0]) { + case 0xe0: /* loopne */ + case 0xe1: /* loope */ + case 0xe2: /* loop */ + case 0xe3: /* jcxz */ + case 0xe9: /* near relative jump */ + case 0xeb: /* short relative jump */ + break; + case 0x0f: + if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */ + break; + return 0; + default: + if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */ + break; + return 0; + } + target = (unsigned long)insn->next_byte + insn->immediate.value; + + return (start <= target && target <= start + len); +} + +/* Decode whole function to ensure any instructions don't jump into target */ +static int __kprobes can_optimize(unsigned long paddr) +{ + unsigned long addr, size = 0, offset = 0; + struct insn insn; + kprobe_opcode_t buf[MAX_INSN_SIZE]; + + /* Lookup symbol including addr */ + if (!kallsyms_lookup_size_offset(paddr, &size, &offset)) + return 0; + + /* Check there is enough space for a relative jump. */ + if (size - offset < RELATIVEJUMP_SIZE) + return 0; + + /* Decode instructions */ + addr = paddr - offset; + while (addr < paddr - offset + size) { /* Decode until function end */ + if (search_exception_tables(addr)) + /* + * Since some fixup code will jumps into this function, + * we can't optimize kprobe in this function. + */ + return 0; + kernel_insn_init(&insn, (void *)recover_probed_instruction(buf, addr)); + insn_get_length(&insn); + /* Another subsystem puts a breakpoint */ + if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) + return 0; + /* Recover address */ + insn.kaddr = (void *)addr; + insn.next_byte = (void *)(addr + insn.length); + /* Check any instructions don't jump into target */ + if (insn_is_indirect_jump(&insn) || + insn_jump_into_range(&insn, paddr + INT3_SIZE, + RELATIVE_ADDR_SIZE)) + return 0; + addr += insn.length; + } + + return 1; +} + +/* Check optimized_kprobe can actually be optimized. */ +int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op) +{ + int i; + struct kprobe *p; + + for (i = 1; i < op->optinsn.size; i++) { + p = get_kprobe(op->kp.addr + i); + if (p && !kprobe_disabled(p)) + return -EEXIST; + } + + return 0; +} + +/* Check the addr is within the optimized instructions. */ +int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op, + unsigned long addr) +{ + return ((unsigned long)op->kp.addr <= addr && + (unsigned long)op->kp.addr + op->optinsn.size > addr); +} + +/* Free optimized instruction slot */ +static __kprobes +void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty) +{ + if (op->optinsn.insn) { + free_optinsn_slot(op->optinsn.insn, dirty); + op->optinsn.insn = NULL; + op->optinsn.size = 0; + } +} + +void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op) +{ + __arch_remove_optimized_kprobe(op, 1); +} + +/* + * Copy replacing target instructions + * Target instructions MUST be relocatable (checked inside) + * This is called when new aggr(opt)probe is allocated or reused. + */ +int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op) +{ + u8 *buf; + int ret; + long rel; + + if (!can_optimize((unsigned long)op->kp.addr)) + return -EILSEQ; + + op->optinsn.insn = get_optinsn_slot(); + if (!op->optinsn.insn) + return -ENOMEM; + + /* + * Verify if the address gap is in 2GB range, because this uses + * a relative jump. + */ + rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE; + if (abs(rel) > 0x7fffffff) + return -ERANGE; + + buf = (u8 *)op->optinsn.insn; + + /* Copy instructions into the out-of-line buffer */ + ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr); + if (ret < 0) { + __arch_remove_optimized_kprobe(op, 0); + return ret; + } + op->optinsn.size = ret; + + /* Copy arch-dep-instance from template */ + memcpy(buf, &optprobe_template_entry, TMPL_END_IDX); + + /* Set probe information */ + synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op); + + /* Set probe function call */ + synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback); + + /* Set returning jmp instruction at the tail of out-of-line buffer */ + synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size, + (u8 *)op->kp.addr + op->optinsn.size); + + flush_icache_range((unsigned long) buf, + (unsigned long) buf + TMPL_END_IDX + + op->optinsn.size + RELATIVEJUMP_SIZE); + return 0; +} + +/* Replace a breakpoint (int3) with a relative jump. */ +int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) +{ + unsigned char jmp_code[RELATIVEJUMP_SIZE]; + s32 rel = (s32)((long)op->optinsn.insn - + ((long)op->kp.addr + RELATIVEJUMP_SIZE)); + + /* Backup instructions which will be replaced by jump address */ + memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, + RELATIVE_ADDR_SIZE); + + jmp_code[0] = RELATIVEJUMP_OPCODE; + *(s32 *)(&jmp_code[1]) = rel; + + /* + * text_poke_smp doesn't support NMI/MCE code modifying. + * However, since kprobes itself also doesn't support NMI/MCE + * code probing, it's not a problem. + */ + text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); + return 0; +} + +/* Replace a relative jump with a breakpoint (int3). */ +void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op) +{ + u8 buf[RELATIVEJUMP_SIZE]; + + /* Set int3 to first byte for kprobes */ + buf[0] = BREAKPOINT_INSTRUCTION; + memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE); + text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE); +} + +static int __kprobes setup_detour_execution(struct kprobe *p, + struct pt_regs *regs, + int reenter) +{ + struct optimized_kprobe *op; + + if (p->flags & KPROBE_FLAG_OPTIMIZED) { + /* This kprobe is really able to run optimized path. */ + op = container_of(p, struct optimized_kprobe, kp); + /* Detour through copied instructions */ + regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX; + if (!reenter) + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + return 0; +} +#endif + int __init arch_init_kprobes(void) { return 0; diff -uprN linux-2.6.32/arch/x86/kernel/kvm.c linux-2.6.32.ovz/arch/x86/kernel/kvm.c --- linux-2.6.32/arch/x86/kernel/kvm.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/kvm.c 2015-03-10 13:23:54.000000000 -0700 @@ -27,7 +27,13 @@ #include #include #include +#include +#include #include +#include +#include +#include +#include #define MMU_QUEUE_SIZE 1024 @@ -43,6 +49,9 @@ static struct kvm_para_state *kvm_para_s return &per_cpu(para_state, raw_smp_processor_id()); } +static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64); +static int has_steal_clock = 0; + /* * No need for any "IO delay" on KVM */ @@ -195,6 +204,22 @@ static void kvm_leave_lazy_mmu(void) paravirt_leave_lazy_mmu(); } +static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; + +static void kvm_guest_apic_eoi_write(u32 reg, u32 val) +{ + /** + * This relies on __test_and_clear_bit to modify the memory + * in a way that is atomic with respect to the local CPU. + * The hypervisor only accesses this memory from the local CPU so + * there's no need for lock or memory barriers. + * An optimization barrier is implied in apic write. + */ + if (__test_and_clear_bit(KVM_PV_EOI_BIT, &__get_cpu_var(kvm_apic_eoi))) + return; + apic_write(APIC_EOI, APIC_EOI_ACK); +} + static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; @@ -231,10 +256,173 @@ static void __init paravirt_ops_setup(vo #endif } +static void kvm_register_steal_time(void) +{ + int cpu = smp_processor_id(); + struct kvm_steal_time *st = &per_cpu(steal_time, cpu); + + if (!has_steal_clock) + return; + + memset(st, 0, sizeof(*st)); + + wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED)); + printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n", + cpu, __pa(st)); +} + +void __cpuinit kvm_guest_cpu_init(void) +{ + if (!kvm_para_available()) + return; + + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { + unsigned long pa; + /* Size alignment is implied but just to make it explicit. */ + BUILD_BUG_ON(__alignof__(per_cpu_var(kvm_apic_eoi)) < 4); + __get_cpu_var(kvm_apic_eoi) = 0; + pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; + wrmsrl(MSR_KVM_PV_EOI_EN, pa); + } + if (has_steal_clock) + kvm_register_steal_time(); +} + +static void kvm_pv_guest_cpu_reboot(void *unused) +{ + /* + * We disable PV EOI before we load a new kernel by kexec, + * since MSR_KVM_PV_EOI_EN stores a pointer into old kernel's memory. + * New kernel can re-enable when it boots. + */ + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); +} + +static int kvm_pv_reboot_notify(struct notifier_block *nb, + unsigned long code, void *unused) +{ + if (code == SYS_RESTART) + on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1); + return NOTIFY_DONE; +} + +static struct notifier_block kvm_pv_reboot_nb = { + .notifier_call = kvm_pv_reboot_notify, +}; + +static u64 kvm_steal_clock(int cpu) +{ + u64 steal; + struct kvm_steal_time *src; + int version; + + src = &per_cpu(steal_time, cpu); + do { + version = src->version; + rmb(); + steal = src->steal; + rmb(); + } while ((version & 1) || (version != src->version)); + + return steal; +} + +void kvm_disable_steal_time(void) +{ + if (!has_steal_clock) + return; + + wrmsr(MSR_KVM_STEAL_TIME, 0, 0); +} + +#ifdef CONFIG_SMP +static void __init kvm_smp_prepare_boot_cpu(void) +{ +#ifdef CONFIG_KVM_CLOCK + WARN_ON(kvm_register_clock("primary cpu clock")); +#endif + kvm_guest_cpu_init(); + native_smp_prepare_boot_cpu(); +} + +static void __cpuinit kvm_guest_cpu_online(void *dummy) +{ + kvm_guest_cpu_init(); +} + +static void kvm_guest_cpu_offline(void *dummy) +{ + kvm_disable_steal_time(); + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + wrmsrl(MSR_KVM_PV_EOI_EN, 0); +} + +static int __cpuinit kvm_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (unsigned long)hcpu; + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + case CPU_ONLINE_FROZEN: + smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata kvm_cpu_notifier = { + .notifier_call = kvm_cpu_notify, +}; +#endif + +static bool __init kvm_detect(void) +{ + if (!kvm_para_available()) + return false; + return true; +} + +static bool __init kvm_x2apic_available(void) +{ + return kvm_para_available(); +} + +const struct hypervisor_x86 x86_hyper_kvm __refconst = { + .name = "KVM", + .detect = kvm_detect, + .x2apic_available = kvm_x2apic_available, +}; +EXPORT_SYMBOL_GPL(x86_hyper_kvm); + void __init kvm_guest_init(void) { if (!kvm_para_available()) return; paravirt_ops_setup(); + register_reboot_notifier(&kvm_pv_reboot_nb); + + if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + has_steal_clock = 1; + pv_time_ops.steal_clock = kvm_steal_clock; + paravirt_steal_enabled = true; + } + + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) + apic_set_eoi_write(kvm_guest_apic_eoi_write); + +#ifdef CONFIG_SMP + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; + register_cpu_notifier(&kvm_cpu_notifier); +#else + kvm_guest_cpu_init(); +#endif } diff -uprN linux-2.6.32/arch/x86/kernel/kvmclock.c linux-2.6.32.ovz/arch/x86/kernel/kvmclock.c --- linux-2.6.32/arch/x86/kernel/kvmclock.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/kvmclock.c 2015-03-10 13:23:40.000000000 -0700 @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -29,6 +30,8 @@ #define KVM_SCALE 22 static int kvmclock = 1; +static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; +static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; static int parse_no_kvmclock(char *arg) { @@ -54,7 +57,8 @@ static unsigned long kvm_get_wallclock(v low = (int)__pa_symbol(&wall_clock); high = ((u64)__pa_symbol(&wall_clock) >> 32); - native_write_msr(MSR_KVM_WALL_CLOCK, low, high); + + native_write_msr(msr_kvm_wall_clock, low, high); vcpu_time = &get_cpu_var(hv_clock); pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); @@ -112,6 +116,20 @@ static void kvm_get_preset_lpj(void) preset_lpj = lpj; } +bool kvm_check_and_clear_guest_paused(void) +{ + bool ret = false; + struct pvclock_vcpu_time_info *src; + + src = &__get_cpu_var(hv_clock); + if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { + percpu_and(hv_clock.flags, ~PVCLOCK_GUEST_STOPPED); + ret = true; + } + + return ret; +} + static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_get_cycles, @@ -122,7 +140,7 @@ static struct clocksource kvm_clock = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static int kvm_register_clock(char *txt) +int kvm_register_clock(char *txt) { int cpu = smp_processor_id(); int low, high; @@ -130,7 +148,17 @@ static int kvm_register_clock(char *txt) high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", cpu, high, low, txt); - return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); + + return native_write_msr_safe(msr_kvm_system_time, low, high); +} + +static void kvm_save_sched_clock_state(void) +{ +} + +static void kvm_restore_sched_clock_state(void) +{ + kvm_register_clock("primary cpu clock, resume"); } #ifdef CONFIG_X86_LOCAL_APIC @@ -141,16 +169,6 @@ static void __cpuinit kvm_setup_secondar * we shouldn't fail. */ WARN_ON(kvm_register_clock("secondary cpu clock")); - /* ok, done with our trickery, call native */ - setup_secondary_APIC_clock(); -} -#endif - -#ifdef CONFIG_SMP -static void __init kvm_smp_prepare_boot_cpu(void) -{ - WARN_ON(kvm_register_clock("primary cpu clock")); - native_smp_prepare_boot_cpu(); } #endif @@ -165,14 +183,16 @@ static void __init kvm_smp_prepare_boot_ #ifdef CONFIG_KEXEC static void kvm_crash_shutdown(struct pt_regs *regs) { - native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); + native_write_msr(msr_kvm_system_time, 0, 0); + kvm_disable_steal_time(); native_machine_crash_shutdown(regs); } #endif static void kvm_shutdown(void) { - native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); + native_write_msr(msr_kvm_system_time, 0, 0); + kvm_disable_steal_time(); native_machine_shutdown(); } @@ -181,27 +201,36 @@ void __init kvmclock_init(void) if (!kvm_para_available()) return; - if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { - if (kvm_register_clock("boot clock")) - return; - pv_time_ops.sched_clock = kvm_clock_read; - x86_platform.calibrate_tsc = kvm_get_tsc_khz; - x86_platform.get_wallclock = kvm_get_wallclock; - x86_platform.set_wallclock = kvm_set_wallclock; + if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; + } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) + return; + + printk(KERN_INFO "kvm-clock: Using msrs %x and %x", + msr_kvm_system_time, msr_kvm_wall_clock); + + if (kvm_register_clock("boot clock")) + return; + pv_time_ops.sched_clock = kvm_clock_read; + x86_platform.calibrate_tsc = kvm_get_tsc_khz; + x86_platform.get_wallclock = kvm_get_wallclock; + x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - x86_cpuinit.setup_percpu_clockev = - kvm_setup_secondary_clock; + x86_cpuinit.early_percpu_clock_init = + kvm_setup_secondary_clock; #endif -#ifdef CONFIG_SMP - smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; -#endif - machine_ops.shutdown = kvm_shutdown; + x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; + x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; + machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC - machine_ops.crash_shutdown = kvm_crash_shutdown; + machine_ops.crash_shutdown = kvm_crash_shutdown; #endif - kvm_get_preset_lpj(); - clocksource_register(&kvm_clock); - pv_info.paravirt_enabled = 1; - pv_info.name = "KVM"; - } + kvm_get_preset_lpj(); + clocksource_register(&kvm_clock); + pv_info.paravirt_enabled = 1; + pv_info.name = "KVM"; + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); } diff -uprN linux-2.6.32/arch/x86/kernel/ldt.c linux-2.6.32.ovz/arch/x86/kernel/ldt.c --- linux-2.6.32/arch/x86/kernel/ldt.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/ldt.c 2015-06-23 04:16:16.000000000 -0700 @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include @@ -39,9 +41,9 @@ static int alloc_ldt(mm_context_t *pc, i mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) & (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1)); if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE) - newldt = vmalloc(mincount * LDT_ENTRY_SIZE); + newldt = ub_vmalloc(mincount * LDT_ENTRY_SIZE); else - newldt = (void *)__get_free_page(GFP_KERNEL); + newldt = (void *)__get_free_page(GFP_KERNEL_UBC); if (!newldt) return -ENOMEM; @@ -117,6 +119,7 @@ int init_new_context(struct task_struct } return retval; } +EXPORT_SYMBOL(init_new_context); /* * No need to lock the MM as we are the last user diff -uprN linux-2.6.32/arch/x86/kernel/machine_kexec_32.c linux-2.6.32.ovz/arch/x86/kernel/machine_kexec_32.c --- linux-2.6.32/arch/x86/kernel/machine_kexec_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/machine_kexec_32.c 2015-03-10 13:23:18.000000000 -0700 @@ -212,7 +212,7 @@ void machine_kexec(struct kimage *image) * one form or other. kexec jump path also need * one. */ - disable_IO_APIC(); + disable_IO_APIC(0); #endif } diff -uprN linux-2.6.32/arch/x86/kernel/machine_kexec_64.c linux-2.6.32.ovz/arch/x86/kernel/machine_kexec_64.c --- linux-2.6.32/arch/x86/kernel/machine_kexec_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/machine_kexec_64.c 2015-03-10 13:23:18.000000000 -0700 @@ -292,7 +292,7 @@ void machine_kexec(struct kimage *image) * one form or other. kexec jump path also need * one. */ - disable_IO_APIC(); + disable_IO_APIC(0); #endif } diff -uprN linux-2.6.32/arch/x86/kernel/Makefile linux-2.6.32.ovz/arch/x86/kernel/Makefile --- linux-2.6.32/arch/x86/kernel/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/Makefile 2015-06-23 04:16:16.000000000 -0700 @@ -13,6 +13,8 @@ CFLAGS_REMOVE_rtc.o = -pg CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_ftrace.o = -pg CFLAGS_REMOVE_early_printk.o = -pg +CFLAGS_REMOVE_kvmclock.o = -pg +CFLAGS_REMOVE_pvclock.o = -pg endif # @@ -29,12 +31,15 @@ GCOV_PROFILE_hpet.o := n GCOV_PROFILE_tsc.o := n GCOV_PROFILE_paravirt.o := n +CFLAGS_irq.o := -I$(src)/../include/asm/trace + obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o obj-y += setup.o x86_init.o i8259.o irqinit.o +obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o -obj-$(CONFIG_X86_32) += probe_roms_32.o +obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o @@ -42,13 +47,13 @@ obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o obj-y += tsc.o io_delay.o rtc.o +obj-y += resource.o +obj-y += cpuid_fault.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o -obj-$(CONFIG_X86_DS) += ds.o -obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o @@ -75,12 +80,13 @@ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o +obj-$(CONFIG_X86_TSC) += trace_clock.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_MODULES) += module.o -obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o +obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o efi_early_printk.o obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o @@ -88,7 +94,7 @@ obj-$(CONFIG_EARLY_PRINTK) += early_prin obj-$(CONFIG_HPET_TIMER) += hpet.o -obj-$(CONFIG_K8_NB) += k8.o +obj-$(CONFIG_AMD_NB) += amd_nb.o obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o @@ -116,6 +122,9 @@ obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o +obj-$(CONFIG_TRACING) += tracepoint.o + +obj-$(CONFIG_PERF_EVENTS) += perf_regs.o ### # 64 bit specific files diff -uprN linux-2.6.32/arch/x86/kernel/microcode_amd.c linux-2.6.32.ovz/arch/x86/kernel/microcode_amd.c --- linux-2.6.32/arch/x86/kernel/microcode_amd.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/microcode_amd.c 2015-03-10 13:24:04.000000000 -0700 @@ -33,6 +33,8 @@ MODULE_LICENSE("GPL v2"); #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 #define UCODE_UCODE_TYPE 0x00000001 +const struct firmware *firmware; + struct equiv_cpu_entry { u32 installed_cpu; u32 fixed_errata_mask; @@ -63,7 +65,6 @@ struct microcode_amd { unsigned int mpb[0]; }; -#define UCODE_MAX_SIZE 2048 #define UCODE_CONTAINER_SECTION_HDR 8 #define UCODE_CONTAINER_HEADER_SIZE 12 @@ -103,18 +104,11 @@ static int get_matching_microcode(int cp i++; } - if (!equiv_cpu_id) { - printk(KERN_WARNING "microcode: CPU%d: cpu revision " - "not listed in equivalent cpu table\n", cpu); + if (!equiv_cpu_id) return 0; - } - if (mc_header->processor_rev_id != equiv_cpu_id) { - printk(KERN_ERR "microcode: CPU%d: patch mismatch " - "(processor_rev_id: %x, equiv_cpu_id: %x)\n", - cpu, mc_header->processor_rev_id, equiv_cpu_id); + if (mc_header->processor_rev_id != equiv_cpu_id) return 0; - } /* ucode might be chipset specific -- currently we don't support this */ if (mc_header->nb_dev_id || mc_header->sb_dev_id) { @@ -129,6 +123,37 @@ static int get_matching_microcode(int cp return 1; } +static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned int max_size, actual_size; + +#define F1XH_MPB_MAX_SIZE 2048 +#define F14H_MPB_MAX_SIZE 1824 +#define F15H_MPB_MAX_SIZE 4096 + + switch (c->x86) { + case 0x14: + max_size = F14H_MPB_MAX_SIZE; + break; + case 0x15: + max_size = F15H_MPB_MAX_SIZE; + break; + default: + max_size = F1XH_MPB_MAX_SIZE; + break; + } + + actual_size = buf[4] + (buf[5] << 8); + + if (actual_size > size || actual_size > max_size) { + pr_err("section size mismatch\n"); + return 0; + } + + return actual_size; +} + static int apply_microcode_amd(int cpu) { u32 rev, dummy; @@ -150,6 +175,7 @@ static int apply_microcode_amd(int cpu) if (rev != mc_amd->hdr.patch_id) { printk(KERN_ERR "microcode: CPU%d: update failed " "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); + uci->cpu_sig.rev = rev; return -1; } @@ -168,11 +194,11 @@ static int get_ucode_data(void *to, cons } static void * -get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) +get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size) { - unsigned int total_size; + unsigned int actual_size = 0; u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; - void *mc; + void *mc = NULL; if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) return NULL; @@ -183,26 +209,18 @@ get_next_ucode(const u8 *buf, unsigned i return NULL; } - total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); - - printk(KERN_DEBUG "microcode: size %u, total_size %u\n", - size, total_size); + actual_size = verify_ucode_size(cpu, buf, size); + if (!actual_size) + return NULL; - if (total_size > size || total_size > UCODE_MAX_SIZE) { - printk(KERN_ERR "microcode: error: size mismatch\n"); + mc = vmalloc(actual_size); + if (!mc) return NULL; - } - mc = vmalloc(UCODE_MAX_SIZE); - if (mc) { - memset(mc, 0, UCODE_MAX_SIZE); - if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, - total_size)) { - vfree(mc); - mc = NULL; - } else - *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; - } + memset(mc, 0, actual_size); + get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size); + *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR; + return mc; } @@ -271,7 +289,7 @@ generic_load_microcode(int cpu, const u8 unsigned int uninitialized_var(mc_size); struct microcode_header_amd *mc_header; - mc = get_next_ucode(ucode_ptr, leftover, &mc_size); + mc = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size); if (!mc) break; @@ -308,14 +326,10 @@ generic_load_microcode(int cpu, const u8 static enum ucode_state request_microcode_fw(int cpu, struct device *device) { - const char *fw_name = "amd-ucode/microcode_amd.bin"; - const struct firmware *firmware; enum ucode_state ret; - if (request_firmware(&firmware, fw_name, device)) { - printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); + if (firmware == NULL) return UCODE_NFOUND; - } if (*(u32 *)firmware->data != UCODE_MAGIC) { printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n", @@ -325,16 +339,12 @@ static enum ucode_state request_microcod ret = generic_load_microcode(cpu, firmware->data, firmware->size); - release_firmware(firmware); - return ret; } static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { - printk(KERN_INFO "microcode: AMD microcode update via " - "/dev/cpu/microcode not supported\n"); return UCODE_ERROR; } @@ -346,7 +356,43 @@ static void microcode_fini_cpu_amd(int c uci->mc = NULL; } +/* + * AMD microcode firmware naming convention, up to family 15h they are in + * the legacy file: + * + * amd-ucode/microcode_amd.bin + * + * This legacy file is always smaller than 2K in size. + * + * Starting at family 15h they are in family specific firmware files: + * + * amd-ucode/microcode_amd_fam15h.bin + * amd-ucode/microcode_amd_fam16h.bin + * ... + * + * These might be larger than 2K. + */ +void init_microcode_amd(struct device *device) +{ + char fw_name[36] = "amd-ucode/microcode_amd.bin"; + + if (boot_cpu_data.x86 >= 0x15) + snprintf(fw_name, sizeof(fw_name), + "amd-ucode/microcode_amd_fam%.2xh.bin", + boot_cpu_data.x86); + + if (request_firmware(&firmware, (const char *)fw_name, device)) + printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); +} + +void fini_microcode_amd(void) +{ + release_firmware(firmware); +} + static struct microcode_ops microcode_amd_ops = { + .init = init_microcode_amd, + .fini = fini_microcode_amd, .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info_amd, diff -uprN linux-2.6.32/arch/x86/kernel/microcode_core.c linux-2.6.32.ovz/arch/x86/kernel/microcode_core.c --- linux-2.6.32/arch/x86/kernel/microcode_core.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/microcode_core.c 2015-03-10 13:24:07.000000000 -0700 @@ -83,6 +83,7 @@ #include #include +#include MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); @@ -272,7 +273,6 @@ static int reload_for_cpu(int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; - mutex_lock(µcode_mutex); if (uci->valid) { enum ucode_state ustate; @@ -283,7 +283,6 @@ static int reload_for_cpu(int cpu) if (ustate == UCODE_ERROR) err = -EINVAL; } - mutex_unlock(µcode_mutex); return err; } @@ -303,8 +302,12 @@ static ssize_t reload_store(struct sys_d if (val == 1) { get_online_cpus(); + mutex_lock(µcode_mutex); if (cpu_online(cpu)) ret = reload_for_cpu(cpu); + if (!ret) + perf_check_microcode(); + mutex_unlock(µcode_mutex); put_online_cpus(); } @@ -438,18 +441,9 @@ static int mc_sysdev_resume(struct sys_d int cpu = dev->id; struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - if (!cpu_online(cpu)) + if (cpu != smp_processor_id()) return 0; - /* - * All non-bootup cpus are still disabled, - * so only CPU 0 will apply ucode here. - * - * Moreover, there can be no concurrent - * updates from any other places at this point. - */ - WARN_ON(cpu != 0); - if (uci->valid && uci->mc) microcode_ops->apply_microcode(cpu); @@ -520,11 +514,15 @@ static int __init microcode_init(void) return PTR_ERR(microcode_pdev); } + if (microcode_ops->init) + microcode_ops->init(µcode_pdev->dev); + get_online_cpus(); mutex_lock(µcode_mutex); error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); - + if (!error) + perf_check_microcode(); mutex_unlock(µcode_mutex); put_online_cpus(); @@ -563,6 +561,9 @@ static void __exit microcode_exit(void) platform_device_unregister(microcode_pdev); + if (microcode_ops->fini) + microcode_ops->fini(); + microcode_ops = NULL; pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); diff -uprN linux-2.6.32/arch/x86/kernel/microcode_intel.c linux-2.6.32.ovz/arch/x86/kernel/microcode_intel.c --- linux-2.6.32/arch/x86/kernel/microcode_intel.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/microcode_intel.c 2015-03-10 13:24:07.000000000 -0700 @@ -159,12 +159,7 @@ static int collect_cpu_info(int cpu_num, csig->pf = 1 << ((val[1] >> 18) & 7); } - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* see notes above for revision 1.07. Apparent chip bug */ - sync_core(); - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); - + csig->rev = c->microcode; printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", cpu_num, csig->sig, csig->pf, csig->rev); @@ -302,9 +297,9 @@ static int apply_microcode(int cpu) struct microcode_intel *mc_intel; struct ucode_cpu_info *uci; unsigned int val[2]; - int cpu_num; + int cpu_num = raw_smp_processor_id(); + struct cpuinfo_x86 *c = &cpu_data(cpu_num); - cpu_num = raw_smp_processor_id(); uci = ucode_cpu_info + cpu; mc_intel = uci->mc; @@ -320,7 +315,7 @@ static int apply_microcode(int cpu) (unsigned long) mc_intel->bits >> 16 >> 16); wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* see notes above for revision 1.07. Apparent chip bug */ + /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); /* get the current revision from MSR 0x8B */ @@ -340,6 +335,7 @@ static int apply_microcode(int cpu) (mc_intel->hdr.date >> 16) & 0xff); uci->cpu_sig.rev = val[1]; + c->microcode = val[1]; return 0; } @@ -348,10 +344,11 @@ static enum ucode_state generic_load_mic int (*get_ucode_data)(void *, const void *, size_t)) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u8 *ucode_ptr = data, *new_mc = NULL, *mc; + u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; int new_rev = uci->cpu_sig.rev; unsigned int leftover = size; enum ucode_state state = UCODE_OK; + unsigned int curr_mc_size = 0; while (leftover) { struct microcode_header_intel mc_header; @@ -367,9 +364,15 @@ static enum ucode_state generic_load_mic break; } - mc = vmalloc(mc_size); - if (!mc) - break; + /* For performance reasons, reuse mc area when possible */ + if (!mc || mc_size > curr_mc_size) { + if (mc) + vfree(mc); + mc = vmalloc(mc_size); + if (!mc) + break; + curr_mc_size = mc_size; + } if (get_ucode_data(mc, ucode_ptr, mc_size) || microcode_sanity_check(mc) < 0) { @@ -382,13 +385,16 @@ static enum ucode_state generic_load_mic vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; - } else - vfree(mc); + mc = NULL; /* trigger new vmalloc */ + } ucode_ptr += mc_size; leftover -= mc_size; } + if (mc) + vfree(mc); + if (leftover) { if (new_mc) vfree(new_mc); diff -uprN linux-2.6.32/arch/x86/kernel/mmconf-fam10h_64.c linux-2.6.32.ovz/arch/x86/kernel/mmconf-fam10h_64.c --- linux-2.6.32/arch/x86/kernel/mmconf-fam10h_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/mmconf-fam10h_64.c 2015-03-10 13:23:52.000000000 -0700 @@ -7,6 +7,8 @@ #include #include #include +#include + #include #include #include @@ -30,11 +32,6 @@ static struct pci_hostbridge_probe pci_p { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, }; -struct range { - u64 start; - u64 end; -}; - static int __cpuinit cmp_range(const void *x1, const void *x2) { const struct range *r1 = x1; diff -uprN linux-2.6.32/arch/x86/kernel/module.c linux-2.6.32.ovz/arch/x86/kernel/module.c --- linux-2.6.32/arch/x86/kernel/module.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/module.c 2015-03-10 13:22:50.000000000 -0700 @@ -58,12 +58,40 @@ void module_free(struct module *mod, voi vfree(module_region); } -/* We don't need anything special. */ int module_frob_arch_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod) { + Elf_Shdr *s, *alt = NULL; + unsigned long diff; + + /* RHEL: A bug exists where the ELF headers for kernel modules + * are incorrectly calculated. The result is that the + * .altinstructions section size is not a multiple of struct alt_instr. + * The fixes introduced in BZs 696457 and 737753 result in the usage of + * the missing memory and when it is used the result is memory + * corruption. This simple fix calculates how much missing memory + * there is and adjusts the .altinstructions section size by the + * appropriate amount. As a result module loading no longer + * corrupts memory. + */ + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) + if (!strcmp(".altinstructions", secstrings + s->sh_name)) + alt = s; + + if (!alt) + return 0; + + diff = ((unsigned long)alt->sh_size % (sizeof(struct alt_instr))); + if (!diff) + return 0; + + DEBUGP("%s: .alternatives size is off by 0x%lx ... adjusting\n", + __FUNCTION__, (sizeof(struct alt_instr)) - diff); + /* fix it up */ + alt->sh_size = alt->sh_size + (sizeof(struct alt_instr)) - diff; + return 0; } @@ -206,7 +234,7 @@ int module_finalize(const Elf_Ehdr *hdr, struct module *me) { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL; + *para = NULL, *rhel = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -218,12 +246,32 @@ int module_finalize(const Elf_Ehdr *hdr, locks = s; if (!strcmp(".parainstructions", secstrings + s->sh_name)) para = s; + if (!strcmp(".rheldata", secstrings + s->sh_name)) + rhel = s; } if (alt) { /* patch .altinstructions */ void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size); + int fixup = 0; + + /* + * RHEL6: We had to modify the alt_instr struct's cpuid + * field so that we could account for future X86 cpu features. + * + * In order to protect KABI we must modify the values in + * modules 6.1 or older that maybe loaded at this time. + */ + + if (!rhel) { + /* modules older than 6.2 do not have .rheldata */ + printk(KERN_DEBUG + "%s module is older than RHEL 6.2 ... applying fixups\n", + me->name); + fixup = 1; + } + + apply_alternatives(aseg, aseg + alt->sh_size, fixup); } if (locks && text) { void *lseg = (void *)locks->sh_addr; diff -uprN linux-2.6.32/arch/x86/kernel/mpparse.c linux-2.6.32.ovz/arch/x86/kernel/mpparse.c --- linux-2.6.32/arch/x86/kernel/mpparse.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/mpparse.c 2015-03-10 13:24:05.000000000 -0700 @@ -115,21 +115,6 @@ static void __init MP_bus_info(struct mp printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); } -static int bad_ioapic(unsigned long address) -{ - if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " - "(found %d)\n", MAX_IO_APICS, nr_ioapics); - panic("Recompile kernel with bigger MAX_IO_APICS!\n"); - } - if (!address) { - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" - " found in table, skipping!\n"); - return 1; - } - return 0; -} - static void __init MP_ioapic_info(struct mpc_ioapic *m) { if (!(m->flags & MPC_APIC_USABLE)) @@ -138,15 +123,7 @@ static void __init MP_ioapic_info(struct printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", m->apicid, m->apicver, m->apicaddr); - if (bad_ioapic(m->apicaddr)) - return; - - mp_ioapics[nr_ioapics].apicaddr = m->apicaddr; - mp_ioapics[nr_ioapics].apicid = m->apicid; - mp_ioapics[nr_ioapics].type = m->type; - mp_ioapics[nr_ioapics].apicver = m->apicver; - mp_ioapics[nr_ioapics].flags = m->flags; - nr_ioapics++; + mp_register_ioapic(m->apicid, m->apicaddr, gsi_top); } static void print_MP_intsrc_info(struct mpc_intsrc *m) @@ -297,6 +274,18 @@ static void __init smp_dump_mptable(stru void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } +static void __init smp_register_lapic_address(unsigned long address) +{ + mp_lapic_addr = address; + + set_fixmap_nocache(FIX_APIC_BASE, address); + if (boot_cpu_physical_apicid == -1U) { + boot_cpu_physical_apicid = read_apic_id(); + apic_version[boot_cpu_physical_apicid] = + GET_APIC_VERSION(apic_read(APIC_LVR)); + } +} + static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) { char str[16]; @@ -318,6 +307,10 @@ static int __init smp_read_mpc(struct mp if (early) return 1; + /* Initialize the lapic mapping */ + if (!acpi_lapic) + smp_register_lapic_address(mpc->lapic); + if (mpc->oemptr) x86_init.mpparse.smp_read_mpc_oem(mpc); @@ -359,13 +352,6 @@ static int __init smp_read_mpc(struct mp x86_init.mpparse.mpc_record(1); } -#ifdef CONFIG_X86_BIGSMP - generic_bigsmp_probe(); -#endif - - if (apic->setup_apic_routing) - apic->setup_apic_routing(); - if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; diff -uprN linux-2.6.32/arch/x86/kernel/msr.c linux-2.6.32.ovz/arch/x86/kernel/msr.c --- linux-2.6.32/arch/x86/kernel/msr.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/msr.c 2015-03-10 13:23:51.000000000 -0700 @@ -172,10 +172,13 @@ static long msr_ioctl(struct file *file, static int msr_open(struct inode *inode, struct file *file) { - unsigned int cpu = iminor(file->f_path.dentry->d_inode); - struct cpuinfo_x86 *c = &cpu_data(cpu); + unsigned int cpu; + struct cpuinfo_x86 *c; int ret = 0; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + lock_kernel(); cpu = iminor(file->f_path.dentry->d_inode); @@ -234,7 +237,7 @@ static int __cpuinit msr_class_cpu_callb msr_device_destroy(cpu); break; } - return err ? NOTIFY_BAD : NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block __refdata msr_class_cpu_notifier = { @@ -251,7 +254,7 @@ static int __init msr_init(void) int i, err = 0; i = 0; - if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { + if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) { printk(KERN_ERR "msr: unable to get major %d for msr\n", MSR_MAJOR); err = -EBUSY; @@ -279,7 +282,7 @@ out_class: msr_device_destroy(i); class_destroy(msr_class); out_chrdev: - unregister_chrdev(MSR_MAJOR, "cpu/msr"); + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); out: return err; } @@ -290,7 +293,7 @@ static void __exit msr_exit(void) for_each_online_cpu(cpu) msr_device_destroy(cpu); class_destroy(msr_class); - unregister_chrdev(MSR_MAJOR, "cpu/msr"); + __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); unregister_hotcpu_notifier(&msr_class_cpu_notifier); } diff -uprN linux-2.6.32/arch/x86/kernel/paravirt.c linux-2.6.32.ovz/arch/x86/kernel/paravirt.c --- linux-2.6.32/arch/x86/kernel/paravirt.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/paravirt.c 2015-03-10 13:23:53.000000000 -0700 @@ -202,6 +202,13 @@ static void native_flush_tlb_single(unsi __native_flush_tlb_single(addr); } +bool paravirt_steal_enabled; + +static u64 native_steal_clock(int cpu) +{ + return 0; +} + /* These are in entry.S */ extern void native_iret(void); extern void native_irq_enable_sysexit(void); @@ -231,6 +238,9 @@ static DEFINE_PER_CPU(enum paravirt_lazy static inline void enter_lazy(enum paravirt_lazy_mode mode) { + if (in_interrupt()) + return; + BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); percpu_write(paravirt_lazy_mode, mode); @@ -238,6 +248,9 @@ static inline void enter_lazy(enum parav static void leave_lazy(enum paravirt_lazy_mode mode) { + if (in_interrupt()) + return; + BUG_ON(percpu_read(paravirt_lazy_mode) != mode); percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); @@ -307,6 +320,7 @@ struct pv_init_ops pv_init_ops = { struct pv_time_ops pv_time_ops = { .sched_clock = native_sched_clock, + .steal_clock = native_steal_clock, }; struct pv_irq_ops pv_irq_ops = { @@ -345,6 +359,9 @@ struct pv_cpu_ops pv_cpu_ops = { .read_tscp = native_read_tscp, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, +#ifdef CONFIG_X86_32 + .load_user_cs_desc = native_load_user_cs_desc, +#endif /*CONFIG_X86_32*/ .load_gdt = native_load_gdt, .load_idt = native_load_idt, .store_gdt = native_store_gdt, @@ -422,16 +439,15 @@ struct pv_mmu_ops pv_mmu_ops = { .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, .set_pmd = native_set_pmd, + .set_pmd_at = native_set_pmd_at, .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, + .pmd_update = paravirt_nop, + .pmd_update_defer = paravirt_nop, .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, -#ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = kmap_atomic, -#endif - #if PAGETABLE_LEVELS >= 3 #ifdef CONFIG_X86_PAE .set_pte_atomic = native_set_pte_atomic, diff -uprN linux-2.6.32/arch/x86/kernel/pci-calgary_64.c linux-2.6.32.ovz/arch/x86/kernel/pci-calgary_64.c --- linux-2.6.32/arch/x86/kernel/pci-calgary_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/pci-calgary_64.c 2015-03-10 13:22:00.000000000 -0700 @@ -102,11 +102,16 @@ int use_calgary __read_mostly = 0; #define PMR_SOFTSTOPFAULT 0x40000000 #define PMR_HARDSTOP 0x20000000 -#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ -#define MAX_NUM_CHASSIS 8 /* max number of chassis */ -/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */ -#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) -#define PHBS_PER_CALGARY 4 +/* + * The maximum PHB bus number. + * x3950M2 (rare): 8 chassis, 48 PHBs per chassis = 384 + * x3950M2: 4 chassis, 48 PHBs per chassis = 192 + * x3950 (PCIE): 8 chassis, 32 PHBs per chassis = 256 + * x3950 (PCIX): 8 chassis, 16 PHBs per chassis = 128 + */ +#define MAX_PHB_BUS_NUM 256 + +#define PHBS_PER_CALGARY 4 /* register offsets in Calgary's internal register space */ static const unsigned long tar_offsets[] = { @@ -318,13 +323,15 @@ static inline struct iommu_table *find_i pdev = to_pci_dev(dev); + /* search up the device tree for an iommu */ pbus = pdev->bus; - - /* is the device behind a bridge? Look for the root bus */ - while (pbus->parent) + do { + tbl = pci_iommu(pbus); + if (tbl && tbl->it_busno == pbus->number) + break; + tbl = NULL; pbus = pbus->parent; - - tbl = pci_iommu(pbus); + } while (pbus); BUG_ON(tbl && (tbl->it_busno != pbus->number)); @@ -1051,8 +1058,6 @@ static int __init calgary_init_one(struc struct iommu_table *tbl; int ret; - BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM); - bbar = busno_to_bbar(dev->bus->number); ret = calgary_setup_tar(dev, bbar); if (ret) diff -uprN linux-2.6.32/arch/x86/kernel/pci-dma.c linux-2.6.32.ovz/arch/x86/kernel/pci-dma.c --- linux-2.6.32/arch/x86/kernel/pci-dma.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/pci-dma.c 2015-03-10 13:23:18.000000000 -0700 @@ -214,7 +214,7 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "allowdac", 8)) forbid_dac = 0; if (!strncmp(p, "nodac", 5)) - forbid_dac = -1; + forbid_dac = 1; if (!strncmp(p, "usedac", 6)) { forbid_dac = -1; return 1; @@ -316,10 +316,11 @@ rootfs_initcall(pci_iommu_init); static __devinit void via_no_dac(struct pci_dev *dev) { - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { + if (forbid_dac == 0) { dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); forbid_dac = 1; } } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, 8, via_no_dac); #endif diff -uprN linux-2.6.32/arch/x86/kernel/pci-gart_64.c linux-2.6.32.ovz/arch/x86/kernel/pci-gart_64.c --- linux-2.6.32/arch/x86/kernel/pci-gart_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/pci-gart_64.c 2015-03-10 13:22:22.000000000 -0700 @@ -38,7 +38,7 @@ #include #include #include -#include +#include static unsigned long iommu_bus_base; /* GART remapping area (physical) */ static unsigned long iommu_size; /* size of remapping area bytes */ @@ -76,6 +76,9 @@ static u32 gart_unmapped_entry; #define AGPEXTERN #endif +/* GART can only remap to physical addresses < 1TB */ +#define GART_MAX_PHYS_ADDR (1ULL << 40) + /* backdoor interface to AGP driver */ AGPEXTERN int agp_memory_reserved; AGPEXTERN __u32 *agp_gatt_table; @@ -138,7 +141,7 @@ static void flush_gart(void) spin_lock_irqsave(&iommu_bitmap_lock, flags); if (need_flush) { - k8_flush_garts(); + amd_flush_garts(); need_flush = false; } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); @@ -207,9 +210,13 @@ static dma_addr_t dma_map_area(struct de size_t size, int dir, unsigned long align_mask) { unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); - unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); + unsigned long iommu_page; int i; + if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR)) + return bad_dma_address; + + iommu_page = alloc_iommu(dev, npages, align_mask); if (iommu_page == -1) { if (!nonforced_iommu(dev, phys_mem, size)) return phys_mem; @@ -548,11 +555,17 @@ static void enable_gart_translations(voi { int i; - for (i = 0; i < num_k8_northbridges; i++) { - struct pci_dev *dev = k8_northbridges[i]; + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; enable_gart_translation(dev, __pa(agp_gatt_table)); } + + /* Flush the GART-TLB to remove stale entries */ + amd_flush_garts(); } /* @@ -574,20 +587,22 @@ static int gart_resume(struct sys_device { printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n"); + if (!amd_nb_has_feature(AMD_NB_GART)) + return 0; + if (fix_up_north_bridges) { int i; printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n"); - for (i = 0; i < num_k8_northbridges; i++) { - struct pci_dev *dev = k8_northbridges[i]; + for (i = 0; i < amd_nb_num(); i++) { + struct pci_dev *dev = node_to_amd_nb(i)->misc; /* * Don't enable translations just yet. That is the next * step. Restore the pre-suspend aperture settings. */ - pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, - aperture_order << 1); + gart_set_size_and_enable(dev, aperture_order); pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); } @@ -619,7 +634,7 @@ static struct sys_device device_gart = { * Private Northbridge GATT initialization in case we cannot use the * AGP driver for some reason. */ -static __init int init_k8_gatt(struct agp_kern_info *info) +static __init int init_amd_gatt(struct agp_kern_info *info) { unsigned aper_size, gatt_size, new_aper_size; unsigned aper_base, new_aper_base; @@ -630,8 +645,8 @@ static __init int init_k8_gatt(struct ag printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); aper_size = aper_base = info->aper_size = 0; dev = NULL; - for (i = 0; i < num_k8_northbridges; i++) { - dev = k8_northbridges[i]; + for (i = 0; i < amd_nb_num(); i++) { + dev = node_to_amd_nb(i)->misc; new_aper_base = read_aperture(dev, &new_aper_size); if (!new_aper_base) goto nommu; @@ -696,10 +711,13 @@ void gart_iommu_shutdown(void) if (no_agp && (dma_ops != &gart_dma_ops)) return; - for (i = 0; i < num_k8_northbridges; i++) { + if (!amd_nb_has_feature(AMD_NB_GART)) + return; + + for (i = 0; i < amd_nb_num(); i++) { u32 ctl; - dev = k8_northbridges[i]; + dev = node_to_amd_nb(i)->misc; pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); ctl &= ~GARTEN; @@ -717,14 +735,14 @@ void __init gart_iommu_init(void) unsigned long scratch; long i; - if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) + if (!amd_nb_has_feature(AMD_NB_GART)) return; #ifndef CONFIG_AGP_AMD64 no_agp = 1; #else /* Makefile puts PCI initialization via subsys_initcall first. */ - /* Add other K8 AGP bridge drivers here */ + /* Add other AMD AGP bridge drivers here */ no_agp = no_agp || (agp_amd64_init() < 0) || (agp_copy_info(agp_bridge, &info) < 0); @@ -740,7 +758,7 @@ void __init gart_iommu_init(void) if (no_iommu || (!force_iommu && max_pfn <= MAX_DMA32_PFN) || !gart_iommu_aperture || - (no_agp && init_k8_gatt(&info) < 0)) { + (no_agp && init_amd_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { printk(KERN_WARNING "More than 4GB of memory " "but GART IOMMU not available.\n"); @@ -856,7 +874,7 @@ void __init gart_parse_options(char *p) #endif if (isdigit(*p) && get_option(&p, &arg)) iommu_size = arg; - if (!strncmp(p, "fullflush", 8)) + if (!strncmp(p, "fullflush", 9)) iommu_fullflush = 1; if (!strncmp(p, "nofullflush", 11)) iommu_fullflush = 0; diff -uprN linux-2.6.32/arch/x86/kernel/perf_regs.c linux-2.6.32.ovz/arch/x86/kernel/perf_regs.c --- linux-2.6.32/arch/x86/kernel/perf_regs.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/perf_regs.c 2015-03-10 13:24:07.000000000 -0700 @@ -0,0 +1,105 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_32 +#define PERF_REG_X86_MAX PERF_REG_X86_32_MAX +#else +#define PERF_REG_X86_MAX PERF_REG_X86_64_MAX +#endif + +#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r) + +static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = { + PT_REGS_OFFSET(PERF_REG_X86_AX, ax), + PT_REGS_OFFSET(PERF_REG_X86_BX, bx), + PT_REGS_OFFSET(PERF_REG_X86_CX, cx), + PT_REGS_OFFSET(PERF_REG_X86_DX, dx), + PT_REGS_OFFSET(PERF_REG_X86_SI, si), + PT_REGS_OFFSET(PERF_REG_X86_DI, di), + PT_REGS_OFFSET(PERF_REG_X86_BP, bp), + PT_REGS_OFFSET(PERF_REG_X86_SP, sp), + PT_REGS_OFFSET(PERF_REG_X86_IP, ip), + PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags), + PT_REGS_OFFSET(PERF_REG_X86_CS, cs), + PT_REGS_OFFSET(PERF_REG_X86_SS, ss), +#ifdef CONFIG_X86_32 + PT_REGS_OFFSET(PERF_REG_X86_DS, ds), + PT_REGS_OFFSET(PERF_REG_X86_ES, es), + PT_REGS_OFFSET(PERF_REG_X86_FS, fs), + PT_REGS_OFFSET(PERF_REG_X86_GS, gs), +#else + /* + * The pt_regs struct does not store + * ds, es, fs, gs in 64 bit mode. + */ + (unsigned int) -1, + (unsigned int) -1, + (unsigned int) -1, + (unsigned int) -1, +#endif +#ifdef CONFIG_X86_64 + PT_REGS_OFFSET(PERF_REG_X86_R8, r8), + PT_REGS_OFFSET(PERF_REG_X86_R9, r9), + PT_REGS_OFFSET(PERF_REG_X86_R10, r10), + PT_REGS_OFFSET(PERF_REG_X86_R11, r11), + PT_REGS_OFFSET(PERF_REG_X86_R12, r12), + PT_REGS_OFFSET(PERF_REG_X86_R13, r13), + PT_REGS_OFFSET(PERF_REG_X86_R14, r14), + PT_REGS_OFFSET(PERF_REG_X86_R15, r15), +#endif +}; + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ + if (WARN_ON_ONCE(idx > ARRAY_SIZE(pt_regs_offset))) + return 0; + + return regs_get_register(regs, pt_regs_offset[idx]); +} + +#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL)) + +#ifdef CONFIG_X86_32 +int perf_reg_validate(u64 mask) +{ + if (!mask || mask & REG_RESERVED) + return -EINVAL; + + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ + return PERF_SAMPLE_REGS_ABI_32; +} +#else /* CONFIG_X86_64 */ +#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \ + (1ULL << PERF_REG_X86_ES) | \ + (1ULL << PERF_REG_X86_FS) | \ + (1ULL << PERF_REG_X86_GS)) + +int perf_reg_validate(u64 mask) +{ + if (!mask || mask & REG_RESERVED) + return -EINVAL; + + if (mask & REG_NOSUPPORT) + return -EINVAL; + + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ + if (test_tsk_thread_flag(task, TIF_IA32)) + return PERF_SAMPLE_REGS_ABI_32; + else + return PERF_SAMPLE_REGS_ABI_64; +} +#endif /* CONFIG_X86_32 */ diff -uprN linux-2.6.32/arch/x86/kernel/probe_roms_32.c linux-2.6.32.ovz/arch/x86/kernel/probe_roms_32.c --- linux-2.6.32/arch/x86/kernel/probe_roms_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/probe_roms_32.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,166 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#include -#include -#include -#include -#include -#include - -static struct resource system_rom_resource = { - .name = "System ROM", - .start = 0xf0000, - .end = 0xfffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource extension_rom_resource = { - .name = "Extension ROM", - .start = 0xe0000, - .end = 0xeffff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -static struct resource adapter_rom_resources[] = { { - .name = "Adapter ROM", - .start = 0xc8000, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}, { - .name = "Adapter ROM", - .start = 0, - .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -} }; - -static struct resource video_rom_resource = { - .name = "Video ROM", - .start = 0xc0000, - .end = 0xc7fff, - .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM -}; - -#define ROMSIGNATURE 0xaa55 - -static int __init romsignature(const unsigned char *rom) -{ - const unsigned short * const ptr = (const unsigned short *)rom; - unsigned short sig; - - return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; -} - -static int __init romchecksum(const unsigned char *rom, unsigned long length) -{ - unsigned char sum, c; - - for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) - sum += c; - return !length && !sum; -} - -void __init probe_roms(void) -{ - const unsigned char *rom; - unsigned long start, length, upper; - unsigned char c; - int i; - - /* video rom */ - upper = adapter_rom_resources[0].start; - for (start = video_rom_resource.start; start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - video_rom_resource.start = start; - - if (probe_kernel_address(rom + 2, c) != 0) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = c * 512; - - /* if checksum okay, trust length byte */ - if (length && romchecksum(rom, length)) - video_rom_resource.end = start + length - 1; - - request_resource(&iomem_resource, &video_rom_resource); - break; - } - - start = (video_rom_resource.end + 1 + 2047) & ~2047UL; - if (start < upper) - start = upper; - - /* system rom */ - request_resource(&iomem_resource, &system_rom_resource); - upper = system_rom_resource.start; - - /* check for extension rom (ignore length byte!) */ - rom = isa_bus_to_virt(extension_rom_resource.start); - if (romsignature(rom)) { - length = extension_rom_resource.end - extension_rom_resource.start + 1; - if (romchecksum(rom, length)) { - request_resource(&iomem_resource, &extension_rom_resource); - upper = extension_rom_resource.start; - } - } - - /* check for adapter roms on 2k boundaries */ - for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { - rom = isa_bus_to_virt(start); - if (!romsignature(rom)) - continue; - - if (probe_kernel_address(rom + 2, c) != 0) - continue; - - /* 0 < length <= 0x7f * 512, historically */ - length = c * 512; - - /* but accept any length that fits if checksum okay */ - if (!length || start + length > upper || !romchecksum(rom, length)) - continue; - - adapter_rom_resources[i].start = start; - adapter_rom_resources[i].end = start + length - 1; - request_resource(&iomem_resource, &adapter_rom_resources[i]); - - start = adapter_rom_resources[i++].end & ~2047UL; - } -} - diff -uprN linux-2.6.32/arch/x86/kernel/probe_roms.c linux-2.6.32.ovz/arch/x86/kernel/probe_roms.c --- linux-2.6.32/arch/x86/kernel/probe_roms.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/probe_roms.c 2015-03-10 13:22:14.000000000 -0700 @@ -0,0 +1,267 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include +#include +#include + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .end = 0xfffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .end = 0xeffff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +static struct resource adapter_rom_resources[] = { { + .name = "Adapter ROM", + .start = 0xc8000, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}, { + .name = "Adapter ROM", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +} }; + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .end = 0xc7fff, + .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM +}; + +/* does this oprom support the given pci device, or any of the devices + * that the driver supports? + */ +static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device) +{ + struct pci_driver *drv = pdev->driver; + const struct pci_device_id *id; + + if (pdev->vendor == vendor && pdev->device == device) + return true; + + for (id = drv ? drv->id_table : NULL; id && id->vendor; id++) + if (id->vendor == vendor && id->device == device) + break; + + return id && id->vendor; +} + +static bool probe_list(struct pci_dev *pdev, unsigned short vendor, + const unsigned char *rom_list) +{ + unsigned short device; + + do { + if (probe_kernel_address(rom_list, device) != 0) + device = 0; + + if (device && match_id(pdev, vendor, device)) + break; + + rom_list += 2; + } while (device); + + return !!device; +} + +static struct resource *find_oprom(struct pci_dev *pdev) +{ + struct resource *oprom = NULL; + int i; + + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) { + struct resource *res = &adapter_rom_resources[i]; + unsigned short offset, vendor, device, list, rev; + const unsigned char *rom; + + if (res->end == 0) + break; + + rom = isa_bus_to_virt(res->start); + if (probe_kernel_address(rom + 0x18, offset) != 0) + continue; + + if (probe_kernel_address(rom + offset + 0x4, vendor) != 0) + continue; + + if (probe_kernel_address(rom + offset + 0x6, device) != 0) + continue; + + if (match_id(pdev, vendor, device)) { + oprom = res; + break; + } + + if (probe_kernel_address(rom + offset + 0x8, list) == 0 && + probe_kernel_address(rom + offset + 0xc, rev) == 0 && + rev >= 3 && list && + probe_list(pdev, vendor, rom + offset + list)) { + oprom = res; + break; + } + } + + return oprom; +} + +void *pci_map_biosrom(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + if (!oprom) + return NULL; + + return ioremap(oprom->start, resource_size(oprom)); +} +EXPORT_SYMBOL(pci_map_biosrom); + +void pci_unmap_biosrom(void __iomem *image) +{ + iounmap(image); +} +EXPORT_SYMBOL(pci_unmap_biosrom); + +size_t pci_biosrom_size(struct pci_dev *pdev) +{ + struct resource *oprom = find_oprom(pdev); + + return oprom ? resource_size(oprom) : 0; +} +EXPORT_SYMBOL(pci_biosrom_size); + +#define ROMSIGNATURE 0xaa55 + +static int __init romsignature(const unsigned char *rom) +{ + const unsigned short * const ptr = (const unsigned short *)rom; + unsigned short sig; + + return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; +} + +static int __init romchecksum(const unsigned char *rom, unsigned long length) +{ + unsigned char sum, c; + + for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) + sum += c; + return !length && !sum; +} + +void __init probe_roms(void) +{ + const unsigned char *rom; + unsigned long start, length, upper; + unsigned char c; + int i; + + /* video rom */ + upper = adapter_rom_resources[0].start; + for (start = video_rom_resource.start; start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + if (probe_kernel_address(rom + 2, c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + + request_resource(&iomem_resource, &video_rom_resource); + break; + } + + start = (video_rom_resource.end + 1 + 2047) & ~2047UL; + if (start < upper) + start = upper; + + /* system rom */ + request_resource(&iomem_resource, &system_rom_resource); + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) */ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) { + request_resource(&iomem_resource, &extension_rom_resource); + upper = extension_rom_resource.start; + } + } + + /* check for adapter roms on 2k boundaries */ + for (i = 0; i < ARRAY_SIZE(adapter_rom_resources) && start < upper; start += 2048) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + if (probe_kernel_address(rom + 2, c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + adapter_rom_resources[i].start = start; + adapter_rom_resources[i].end = start + length - 1; + request_resource(&iomem_resource, &adapter_rom_resources[i]); + + start = adapter_rom_resources[i++].end & ~2047UL; + } +} + diff -uprN linux-2.6.32/arch/x86/kernel/process_32.c linux-2.6.32.ovz/arch/x86/kernel/process_32.c --- linux-2.6.32/arch/x86/kernel/process_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/process_32.c 2015-06-23 04:16:16.000000000 -0700 @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -35,11 +34,11 @@ #include #include #include -#include #include #include #include #include +#include #include #include @@ -57,9 +56,11 @@ #include #include #include -#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); +EXPORT_SYMBOL(ret_from_fork); +asmlinkage void i386_ret_from_resume(void) __asm__("i386_ret_from_resume"); +EXPORT_SYMBOL_GPL(i386_ret_from_resume); /* * Return saved PC of a blocked thread. @@ -127,7 +128,6 @@ void __show_regs(struct pt_regs *regs, i unsigned long d0, d1, d2, d3, d6, d7; unsigned long sp; unsigned short ss, gs; - const char *board; if (user_mode_vm(regs)) { sp = regs->sp; @@ -139,27 +139,19 @@ void __show_regs(struct pt_regs *regs, i savesegment(gs, gs); } - printk("\n"); + show_regs_common(); - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (!board) - board = ""; - printk("Pid: %d, comm: %s %s (%s %.*s) %s\n", - task_pid_nr(current), current->comm, - print_tainted(), init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); - - printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", + printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", (u16)regs->cs, regs->ip, regs->flags, smp_processor_id()); - print_symbol("EIP is at %s\n", regs->ip); + if (decode_call_traces) + print_symbol("EIP is at %s\n", regs->ip); - printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); - printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", + printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", regs->si, regs->di, regs->bp, sp); - printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", + printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); if (!all) @@ -169,26 +161,28 @@ void __show_regs(struct pt_regs *regs, i cr2 = read_cr2(); cr3 = read_cr3(); cr4 = read_cr4_safe(); - printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", + printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); get_debugreg(d3, 3); - printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", + printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", d0, d1, d2, d3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk("DR6: %08lx DR7: %08lx\n", + printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n", d6, d7); } void show_regs(struct pt_regs *regs) { - __show_regs(regs, 1); - show_trace(NULL, regs, ®s->sp, regs->bp); + show_registers(regs); + show_trace(NULL, regs, ®s->sp); + if (!decode_call_traces) + printk(" EIP: [<%08lx>]\n", regs->ip); } /* @@ -197,6 +191,7 @@ void show_regs(struct pt_regs *regs) * the "args". */ extern void kernel_thread_helper(void); +EXPORT_SYMBOL_GPL(kernel_thread_helper); /* * Create a kernel thread @@ -205,6 +200,13 @@ int kernel_thread(int (*fn)(void *), voi { struct pt_regs regs; + /* Don't allow kernel_thread() inside VE */ + if (!ve_allow_kthreads && !ve_is_super(get_exec_env())) { + printk("kernel_thread call inside container\n"); + dump_stack(); + return -EPERM; + } + memset(®s, 0, sizeof(regs)); regs.bx = (unsigned long) fn; @@ -283,20 +285,16 @@ int copy_thread(unsigned long clone_flag kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } - - clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); - p->thread.ds_ctx = NULL; - - clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); - p->thread.debugctlmsr = 0; - return err; } void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) { + int cpu; + set_user_gs(regs, 0); + regs->fs = 0; set_fs(USER_DS); regs->ds = __USER_DS; @@ -305,6 +303,11 @@ start_thread(struct pt_regs *regs, unsig regs->cs = __USER_CS; regs->ip = new_ip; regs->sp = new_sp; + + cpu = get_cpu(); + load_user_cs_desc(cpu, current->mm); + put_cpu(); + /* * Free the old FP and other extended state */ @@ -364,6 +367,9 @@ __switch_to(struct task_struct *prev_p, if (preload_fpu) prefetch(next->xstate); + if (next_p->mm) + load_user_cs_desc(cpu, next_p->mm); + /* * Reload esp0. */ @@ -451,13 +457,13 @@ int sys_clone(struct pt_regs *regs) int sys_execve(struct pt_regs *regs) { int error; - char *filename; + struct filename *filename; filename = getname((char __user *) regs->bx); error = PTR_ERR(filename); if (IS_ERR(filename)) goto out; - error = do_execve(filename, + error = do_execve(filename->name, (char __user * __user *) regs->cx, (char __user * __user *) regs->dx, regs); @@ -497,3 +503,40 @@ unsigned long get_wchan(struct task_stru return 0; } +static void modify_cs(struct mm_struct *mm, unsigned long limit) +{ + mm->context.exec_limit = limit; + set_user_cs(&mm->context.user_cs, limit); + if (mm == current->mm) { + int cpu; + + cpu = get_cpu(); + load_user_cs_desc(cpu, mm); + put_cpu(); + } +} + +void arch_add_exec_range(struct mm_struct *mm, unsigned long limit) +{ + if (limit > mm->context.exec_limit) + modify_cs(mm, limit); +} + +void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end) +{ + struct vm_area_struct *vma; + unsigned long limit = PAGE_SIZE; + + if (old_end == mm->context.exec_limit) { + for (vma = mm->mmap; vma; vma = vma->vm_next) + if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) + limit = vma->vm_end; + modify_cs(mm, limit); + } +} + +void arch_flush_exec_range(struct mm_struct *mm) +{ + mm->context.exec_limit = 0; + set_user_cs(&mm->context.user_cs, 0); +} diff -uprN linux-2.6.32/arch/x86/kernel/process_64.c linux-2.6.32.ovz/arch/x86/kernel/process_64.c --- linux-2.6.32/arch/x86/kernel/process_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/process_64.c 2015-06-23 04:16:16.000000000 -0700 @@ -25,8 +25,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -38,7 +38,6 @@ #include #include #include -#include #include #include @@ -51,9 +50,11 @@ #include #include #include -#include -asmlinkage extern void ret_from_fork(void); + +asmlinkage void kernel_execve(const char *filename, char *const argv[], + char *const envp[]) __asm__ ("kernel_execve"); +EXPORT_SYMBOL(kernel_execve); DEFINE_PER_CPU(unsigned long, old_rsp); static DEFINE_PER_CPU(unsigned char, is_idle); @@ -162,31 +163,22 @@ void __show_regs(struct pt_regs *regs, i unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex, gsindex; unsigned int ds, cs, es; - const char *board; - printk("\n"); - print_modules(); - board = dmi_get_system_info(DMI_PRODUCT_NAME); - if (!board) - board = ""; - printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version, board); - printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); - printk_address(regs->ip, 1); - printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, + show_regs_common(); + printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + if (decode_call_traces) + printk_address(regs->ip, 1); + printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, regs->flags); - printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", + printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->ax, regs->bx, regs->cx); - printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", + printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", regs->dx, regs->si, regs->di); - printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", + printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n", regs->bp, regs->r8, regs->r9); - printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", + printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n", regs->r10, regs->r11, regs->r12); - printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", + printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); asm("movl %%ds,%0" : "=r" (ds)); @@ -207,28 +199,29 @@ void __show_regs(struct pt_regs *regs, i cr3 = read_cr3(); cr4 = read_cr4(); - printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs, fsindex, gs, gsindex, shadowgs); - printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); - printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, + printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); - printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); get_debugreg(d3, 3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); + printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); } void show_regs(struct pt_regs *regs) { - printk(KERN_INFO "CPU %d:", smp_processor_id()); - __show_regs(regs, 1); - show_trace(NULL, regs, (void *)(regs + 1), regs->bp); + show_registers(regs); + show_trace(NULL, regs, ®s->sp); + if (!decode_call_traces) + printk(" EIP: [<%08lx>]\n", regs->ip); } void release_thread(struct task_struct *dead_task) @@ -295,11 +288,10 @@ int copy_thread(unsigned long clone_flag set_tsk_thread_flag(p, TIF_FORK); - p->thread.fs = me->thread.fs; - p->thread.gs = me->thread.gs; - savesegment(gs, p->thread.gsindex); + p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs; savesegment(fs, p->thread.fsindex); + p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs; savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); @@ -328,13 +320,6 @@ int copy_thread(unsigned long clone_flag if (err) goto out; } - - clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); - p->thread.ds_ctx = NULL; - - clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); - p->thread.debugctlmsr = 0; - err = 0; out: if (err && p->thread.io_bitmap_ptr) { @@ -396,24 +381,9 @@ __switch_to(struct task_struct *prev_p, if (preload_fpu) prefetch(next->xstate); - /* - * Reload esp0, LDT and the page table pointer: - */ + /* Reload esp0 and ss1. */ load_sp0(tss, next); - /* - * Switch DS and ES. - * This won't pick up thread selector changes, but I guess that is ok. - */ - savesegment(es, prev->es); - if (unlikely(next->es | prev->es)) - loadsegment(es, next->es); - - savesegment(ds, prev->ds); - if (unlikely(next->ds | prev->ds)) - loadsegment(ds, next->ds); - - /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). * @@ -422,6 +392,10 @@ __switch_to(struct task_struct *prev_p, savesegment(fs, fsindex); savesegment(gs, gsindex); + /* + * Load TLS before restoring any segments so that segment loads + * reference the correct GDT entries. + */ load_TLS(next, cpu); /* Must be after DS reload */ @@ -432,27 +406,82 @@ __switch_to(struct task_struct *prev_p, clts(); /* - * Leave lazy mode, flushing any hypercalls made here. - * This must be done before restoring TLS segments so - * the GDT and LDT are properly updated, and must be - * done before math_state_restore, so the TS bit is up - * to date. + * Leave lazy mode, flushing any hypercalls made here. This + * must be done after loading TLS entries in the GDT but before + * loading segments that might reference them, and and it must + * be done before math_state_restore, so the TS bit is up to + * date. */ arch_end_context_switch(next_p); + /* Switch DS and ES. + * + * Reading them only returns the selectors, but writing them (if + * nonzero) loads the full descriptor from the GDT or LDT. The + * LDT for next is loaded in switch_mm, and the GDT is loaded + * above. + * + * We therefore need to write new values to the segment + * registers on every context switch unless both the new and old + * values are zero. + * + * Note that we don't need to do anything for CS and SS, as + * those are saved and restored as part of pt_regs. + */ + savesegment(es, prev->es); + if (unlikely(next->es | prev->es)) + loadsegment(es, next->es); + + savesegment(ds, prev->ds); + if (unlikely(next->ds | prev->ds)) + loadsegment(ds, next->ds); + /* * Switch FS and GS. * - * Segment register != 0 always requires a reload. Also - * reload when it has changed. When prev process used 64bit - * base always reload to avoid an information leak. + * These are even more complicated than FS and GS: they have + * 64-bit bases are that controlled by arch_prctl. Those bases + * only differ from the values in the GDT or LDT if the selector + * is 0. + * + * Loading the segment register resets the hidden base part of + * the register to 0 or the value from the GDT / LDT. If the + * next base address zero, writing 0 to the segment register is + * much faster than using wrmsr to explicitly zero the base. + * + * The thread_struct.fs and thread_struct.gs values are 0 + * if the fs and gs bases respectively are not overridden + * from the values implied by fsindex and gsindex. They + * are nonzero, and store the nonzero base addresses, if + * the bases are overridden. + * + * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should + * be impossible. + * + * Therefore we need to reload the segment registers if either + * the old or new selector is nonzero, and we need to override + * the base address if next thread expects it to be overridden. + * + * This code is unnecessarily slow in the case where the old and + * new indexes are zero and the new base is nonzero -- it will + * unnecessarily write 0 to the selector before writing the new + * base address. + * + * Note: This all depends on arch_prctl being the only way that + * user code can override the segment base. Once wrfsbase and + * wrgsbase are enabled, most of this code will need to change. */ if (unlikely(fsindex | next->fsindex | prev->fs)) { loadsegment(fs, next->fsindex); + /* - * Check if the user used a selector != 0; if yes - * clear 64bit base, since overloaded base is always - * mapped to the Null selector + * If user code wrote a nonzero value to FS, then it also + * cleared the overridden base address. + * + * XXX: if user code wrote 0 to FS and cleared the base + * address itself, we won't notice and we'll incorrectly + * restore the prior base address next time we reschdule + * the process. */ if (fsindex) prev->fs = 0; @@ -464,6 +493,8 @@ __switch_to(struct task_struct *prev_p, if (unlikely(gsindex | next->gsindex | prev->gs)) { load_gs_index(next->gsindex); + + /* This works (and fails) the same way as fsindex above. */ if (gsindex) prev->gs = 0; } @@ -506,13 +537,13 @@ long sys_execve(char __user *name, char char __user * __user *envp, struct pt_regs *regs) { long error; - char *filename; + struct filename *filename; filename = getname(name); error = PTR_ERR(filename); if (IS_ERR(filename)) return error; - error = do_execve(filename, argv, envp, regs); + error = do_execve(filename->name, argv, envp, regs); putname(filename); return error; } @@ -524,6 +555,9 @@ void set_personality_64bit(void) /* Make sure to be in 64bit mode */ clear_thread_flag(TIF_IA32); + /* Ensure the corresponding mm is not marked. */ + clear_bit(MMF_COMPAT, ¤t->mm->flags); + /* TBD: overwrites user setup. Should have two bits. But 64bit processes have always behaved this way, so it's not too bad. The main problem is just that @@ -540,6 +574,21 @@ sys_clone(unsigned long clone_flags, uns return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); } +void set_personality_ia32(void) +{ + /* inherit personality from parent */ + + /* Make sure to be in 32bit mode */ + set_thread_flag(TIF_IA32); + current->personality |= force_personality32; + + /* Mark the associated mm as containing 32-bit tasks. */ + set_bit(MMF_COMPAT, ¤t->mm->flags); + + /* Prepare the first "return" to user space */ + current_thread_info()->status |= TS_COMPAT; +} + unsigned long get_wchan(struct task_struct *p) { unsigned long stack; @@ -669,3 +718,20 @@ unsigned long KSTK_ESP(struct task_struc return (test_tsk_thread_flag(task, TIF_IA32)) ? (task_pt_regs(task)->sp) : ((task)->thread.usersp); } + +long do_fork_kthread(unsigned long clone_flags, + unsigned long stack_start, + struct pt_regs *regs, + unsigned long stack_size, + int __user *parent_tidptr, + int __user *child_tidptr) +{ + if (ve_allow_kthreads || ve_is_super(get_exec_env())) + return do_fork(clone_flags, stack_start, regs, stack_size, + parent_tidptr, child_tidptr); + + /* Don't allow kernel_thread() inside VE */ + printk("kernel_thread call inside container\n"); + dump_stack(); + return -EPERM; +} diff -uprN linux-2.6.32/arch/x86/kernel/process.c linux-2.6.32.ovz/arch/x86/kernel/process.c --- linux-2.6.32/arch/x86/kernel/process.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/process.c 2015-06-23 04:16:16.000000000 -0700 @@ -9,14 +9,18 @@ #include #include #include +#include +#include +#include +#include #include +#include #include #include #include #include #include #include -#include unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -24,29 +28,26 @@ unsigned long idle_nomwait; EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; +EXPORT_SYMBOL_GPL(task_xstate_cachep); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { + int ret; + *dst = *src; - if (src->thread.xstate) { - dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep, - GFP_KERNEL); - if (!dst->thread.xstate) - return -ENOMEM; - WARN_ON((unsigned long)dst->thread.xstate & 15); - memcpy(dst->thread.xstate, src->thread.xstate, xstate_size); + if (fpu_allocated((struct fpu *)&src->thread.xstate)) { + memset(&dst->thread.xstate, 0, sizeof(dst->thread.xstate)); + ret = fpu_alloc((struct fpu *)&dst->thread.xstate); + if (ret) + return ret; + fpu_copy((struct fpu *)&dst->thread.xstate, (struct fpu *)&src->thread.xstate); } return 0; } void free_thread_xstate(struct task_struct *tsk) { - if (tsk->thread.xstate) { - kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); - tsk->thread.xstate = NULL; - } - - WARN(tsk->thread.ds_ctx, "leaking DS context\n"); + fpu_free((struct fpu *)&tsk->thread.xstate); } void free_thread_info(struct thread_info *ti) @@ -87,22 +88,36 @@ void exit_thread(void) } } +void show_regs_common(void) +{ + const char *vendor, *product, *board; + + vendor = dmi_get_system_info(DMI_SYS_VENDOR); + if (!vendor) + vendor = ""; + product = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!product) + product = ""; + + /* Board Name is optional */ + board = dmi_get_system_info(DMI_BOARD_NAME); + + printk(KERN_CONT "\n"); + printk(KERN_DEFAULT "Pid: %d, comm: %.20s veid: %d %s %s %.*s %s", + current->pid, current->comm, task_veid(current), print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version, VZVERSION); + printk(KERN_CONT " %s %s", vendor, product); + if (board) + printk(KERN_CONT "/%s", board); + printk(KERN_CONT "\n"); +} + void flush_thread(void) { struct task_struct *tsk = current; -#ifdef CONFIG_X86_64 - if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { - clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); - if (test_tsk_thread_flag(tsk, TIF_IA32)) { - clear_tsk_thread_flag(tsk, TIF_IA32); - } else { - set_tsk_thread_flag(tsk, TIF_IA32); - current_thread_info()->status |= TS_COMPAT; - } - } -#endif - clear_tsk_thread_flag(tsk, TIF_DEBUG); tsk->thread.debugreg0 = 0; @@ -186,12 +201,6 @@ void __switch_to_xtra(struct task_struct prev = &prev_p->thread; next = &next_p->thread; - if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || - test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) - ds_switch_to(prev_p, next_p); - else if (next->debugctlmsr != prev->debugctlmsr) - update_debugctlmsr(next->debugctlmsr); - if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { set_debugreg(next->debugreg0, 0); set_debugreg(next->debugreg1, 1); @@ -202,6 +211,17 @@ void __switch_to_xtra(struct task_struct set_debugreg(next->debugreg7, 7); } + if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^ + test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) { + unsigned long debugctl = get_debugctlmsr(); + + debugctl &= ~DEBUGCTLMSR_BTF; + if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) + debugctl |= DEBUGCTLMSR_BTF; + + update_debugctlmsr(debugctl); + } + if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ test_tsk_thread_flag(next_p, TIF_NOTSC)) { /* prev and next are different */ @@ -224,6 +244,7 @@ void __switch_to_xtra(struct task_struct */ memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } + propagate_user_return_notify(prev_p, next_p); } int sys_fork(struct pt_regs *regs) @@ -296,7 +317,7 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -366,7 +387,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - trace_power_start(POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); if (!need_resched()) { if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -382,7 +403,7 @@ void mwait_idle_with_hints(unsigned long static void mwait_idle(void) { if (!need_resched()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -403,7 +424,7 @@ static void mwait_idle(void) */ static void poll_idle(void) { - trace_power_start(POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); @@ -422,13 +443,13 @@ static void poll_idle(void) * * idle=mwait overrides this decision and forces the usage of mwait. */ -static int __cpuinitdata force_mwait; +static int force_mwait; #define MWAIT_INFO 0x05 #define MWAIT_ECX_EXTENDED_INFO 0x01 #define MWAIT_EDX_C1 0xf0 -static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) +int mwait_usable(const struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -451,21 +472,39 @@ static int __cpuinit mwait_usable(const } /* - * Check for AMD CPUs, which have potentially C1E support + * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. + * For more information see + * - Erratum #400 for NPT family 0xf and family 0x10 CPUs + * - Erratum #365 for family 0x11 (not affected because C1e not in use) */ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) { + u64 val; if (c->x86_vendor != X86_VENDOR_AMD) - return 0; - - if (c->x86 < 0x0F) - return 0; + goto no_c1e_idle; /* Family 0x0f models < rev F do not have C1E */ - if (c->x86 == 0x0f && c->x86_model < 0x40) - return 0; + if (c->x86 == 0x0F && c->x86_model >= 0x40) + return 1; - return 1; + if (c->x86 == 0x10) { + /* + * check OSVW bit for CPUs that are not affected + * by erratum #400 + */ + if (cpu_has(c, X86_FEATURE_OSVW)) { + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); + if (val >= 2) { + rdmsrl(MSR_AMD64_OSVW_STATUS, val); + if (!(val & BIT(1))) + goto no_c1e_idle; + } + } + return 1; + } + +no_c1e_idle: + return 0; } static cpumask_var_t c1e_mask; diff -uprN linux-2.6.32/arch/x86/kernel/ptrace.c linux-2.6.32.ovz/arch/x86/kernel/ptrace.c --- linux-2.6.32/arch/x86/kernel/ptrace.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/ptrace.c 2015-06-23 04:16:16.000000000 -0700 @@ -2,9 +2,6 @@ /* * Pentium III FXSR, SSE support * Gareth Hughes , May 2000 - * - * BTS tracing - * Markus Metzger , Dec 2007 */ #include @@ -21,7 +18,7 @@ #include #include #include -#include +#include #include #include @@ -33,7 +30,7 @@ #include #include #include -#include +#include #include "tls.h" @@ -45,10 +42,123 @@ enum x86_regset { REGSET_FP, REGSET_XFP, REGSET_IOPERM64 = REGSET_XFP, + REGSET_XSTATE, REGSET_TLS, REGSET_IOPERM32, }; +struct pt_regs_offset { + const char *name; + int offset; +}; + +#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)} +#define REG_OFFSET_END {.name = NULL, .offset = 0} + +static const struct pt_regs_offset regoffset_table[] = { +#ifdef CONFIG_X86_64 + REG_OFFSET_NAME(r15), + REG_OFFSET_NAME(r14), + REG_OFFSET_NAME(r13), + REG_OFFSET_NAME(r12), + REG_OFFSET_NAME(r11), + REG_OFFSET_NAME(r10), + REG_OFFSET_NAME(r9), + REG_OFFSET_NAME(r8), +#endif + REG_OFFSET_NAME(bx), + REG_OFFSET_NAME(cx), + REG_OFFSET_NAME(dx), + REG_OFFSET_NAME(si), + REG_OFFSET_NAME(di), + REG_OFFSET_NAME(bp), + REG_OFFSET_NAME(ax), +#ifdef CONFIG_X86_32 + REG_OFFSET_NAME(ds), + REG_OFFSET_NAME(es), + REG_OFFSET_NAME(fs), + REG_OFFSET_NAME(gs), +#endif + REG_OFFSET_NAME(orig_ax), + REG_OFFSET_NAME(ip), + REG_OFFSET_NAME(cs), + REG_OFFSET_NAME(flags), + REG_OFFSET_NAME(sp), + REG_OFFSET_NAME(ss), + REG_OFFSET_END, +}; + +/** + * regs_query_register_offset() - query register offset from its name + * @name: the name of a register + * + * regs_query_register_offset() returns the offset of a register in struct + * pt_regs from its name. If the name is invalid, this returns -EINVAL; + */ +int regs_query_register_offset(const char *name) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + return -EINVAL; +} + +/** + * regs_query_register_name() - query register name from its offset + * @offset: the offset of a register in struct pt_regs. + * + * regs_query_register_name() returns the name of a register from its + * offset in struct pt_regs. If the @offset is invalid, this returns NULL; + */ +const char *regs_query_register_name(unsigned int offset) +{ + const struct pt_regs_offset *roff; + for (roff = regoffset_table; roff->name != NULL; roff++) + if (roff->offset == offset) + return roff->name; + return NULL; +} + +static const int arg_offs_table[] = { +#ifdef CONFIG_X86_32 + [0] = offsetof(struct pt_regs, ax), + [1] = offsetof(struct pt_regs, dx), + [2] = offsetof(struct pt_regs, cx) +#else /* CONFIG_X86_64 */ + [0] = offsetof(struct pt_regs, di), + [1] = offsetof(struct pt_regs, si), + [2] = offsetof(struct pt_regs, dx), + [3] = offsetof(struct pt_regs, cx), + [4] = offsetof(struct pt_regs, r8), + [5] = offsetof(struct pt_regs, r9) +#endif +}; + +/** + * regs_get_argument_nth() - get Nth argument at function call + * @regs: pt_regs which contains registers at function entry. + * @n: argument number. + * + * regs_get_argument_nth() returns @n th argument of a function call. + * Since usually the kernel stack will be changed right after function entry, + * you must use this at function entry. If the @n th entry is NOT in the + * kernel stack or pt_regs, this returns 0. + */ +unsigned long regs_get_argument_nth(struct pt_regs *regs, unsigned int n) +{ + if (n < ARRAY_SIZE(arg_offs_table)) + return *(unsigned long *)((char *)regs + arg_offs_table[n]); + else { + /* + * The typical case: arg n is on the stack. + * (Note: stack[0] = return address, so skip it) + */ + n -= ARRAY_SIZE(arg_offs_table); + return regs_get_kernel_stack_nth(regs, 1 + n); + } +} + /* * does not yet catch signals sent when the child dies. * in exit.c or in signal.c. @@ -396,6 +506,25 @@ static unsigned long getreg(struct task_ return get_desc_base(&task->thread.tls_array[GS_TLS]); } #endif +#ifdef CONFIG_VE + case offsetof(struct user_regs_struct, ax): { + struct pt_regs *regs; + unsigned long ret; + + regs = task_pt_regs(task); + ret = *pt_regs_access(regs, offset); + + if (ve_is_super(get_exec_env()) && + !ve_is_super(task->ve_task_info.owner_env) && + ((regs->orig_ax == __NR_vfork) || + (regs->orig_ax == __NR_clone) || + (regs->orig_ax == __NR_fork)) && + (long)ret > 0) + ret = vpid_to_pid_ve((pid_t)ret, task->ve_task_info.owner_env); + + return ret; + } +#endif } return *pt_regs_access(task_pt_regs(task), offset); @@ -408,14 +537,14 @@ static int genregs_get(struct task_struc { if (kbuf) { unsigned long *k = kbuf; - while (count > 0) { + while (count >= sizeof(*k)) { *k++ = getreg(target, pos); count -= sizeof(*k); pos += sizeof(*k); } } else { unsigned long __user *u = ubuf; - while (count > 0) { + while (count >= sizeof(*u)) { if (__put_user(getreg(target, pos), u++)) return -EFAULT; count -= sizeof(*u); @@ -434,14 +563,14 @@ static int genregs_set(struct task_struc int ret = 0; if (kbuf) { const unsigned long *k = kbuf; - while (count > 0 && !ret) { + while (count >= sizeof(*k) && !ret) { ret = putreg(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const unsigned long __user *u = ubuf; - while (count > 0 && !ret) { + while (count >= sizeof(*u) && !ret) { unsigned long word; ret = __get_user(word, u++); if (ret) @@ -569,342 +698,6 @@ static int ioperm_get(struct task_struct 0, IO_BITMAP_BYTES); } -#ifdef CONFIG_X86_PTRACE_BTS -/* - * A branch trace store context. - * - * Contexts may only be installed by ptrace_bts_config() and only for - * ptraced tasks. - * - * Contexts are destroyed when the tracee is detached from the tracer. - * The actual destruction work requires interrupts enabled, so the - * work is deferred and will be scheduled during __ptrace_unlink(). - * - * Contexts hold an additional task_struct reference on the traced - * task, as well as a reference on the tracer's mm. - * - * Ptrace already holds a task_struct for the duration of ptrace operations, - * but since destruction is deferred, it may be executed after both - * tracer and tracee exited. - */ -struct bts_context { - /* The branch trace handle. */ - struct bts_tracer *tracer; - - /* The buffer used to store the branch trace and its size. */ - void *buffer; - unsigned int size; - - /* The mm that paid for the above buffer. */ - struct mm_struct *mm; - - /* The task this context belongs to. */ - struct task_struct *task; - - /* The signal to send on a bts buffer overflow. */ - unsigned int bts_ovfl_signal; - - /* The work struct to destroy a context. */ - struct work_struct work; -}; - -static int alloc_bts_buffer(struct bts_context *context, unsigned int size) -{ - void *buffer = NULL; - int err = -ENOMEM; - - err = account_locked_memory(current->mm, current->signal->rlim, size); - if (err < 0) - return err; - - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto out_refund; - - context->buffer = buffer; - context->size = size; - context->mm = get_task_mm(current); - - return 0; - - out_refund: - refund_locked_memory(current->mm, size); - return err; -} - -static inline void free_bts_buffer(struct bts_context *context) -{ - if (!context->buffer) - return; - - kfree(context->buffer); - context->buffer = NULL; - - refund_locked_memory(context->mm, context->size); - context->size = 0; - - mmput(context->mm); - context->mm = NULL; -} - -static void free_bts_context_work(struct work_struct *w) -{ - struct bts_context *context; - - context = container_of(w, struct bts_context, work); - - ds_release_bts(context->tracer); - put_task_struct(context->task); - free_bts_buffer(context); - kfree(context); -} - -static inline void free_bts_context(struct bts_context *context) -{ - INIT_WORK(&context->work, free_bts_context_work); - schedule_work(&context->work); -} - -static inline struct bts_context *alloc_bts_context(struct task_struct *task) -{ - struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL); - if (context) { - context->task = task; - task->bts = context; - - get_task_struct(task); - } - - return context; -} - -static int ptrace_bts_read_record(struct task_struct *child, size_t index, - struct bts_struct __user *out) -{ - struct bts_context *context; - const struct bts_trace *trace; - struct bts_struct bts; - const unsigned char *at; - int error; - - context = child->bts; - if (!context) - return -ESRCH; - - trace = ds_read_bts(context->tracer); - if (!trace) - return -ESRCH; - - at = trace->ds.top - ((index + 1) * trace->ds.size); - if ((void *)at < trace->ds.begin) - at += (trace->ds.n * trace->ds.size); - - if (!trace->read) - return -EOPNOTSUPP; - - error = trace->read(context->tracer, at, &bts); - if (error < 0) - return error; - - if (copy_to_user(out, &bts, sizeof(bts))) - return -EFAULT; - - return sizeof(bts); -} - -static int ptrace_bts_drain(struct task_struct *child, - long size, - struct bts_struct __user *out) -{ - struct bts_context *context; - const struct bts_trace *trace; - const unsigned char *at; - int error, drained = 0; - - context = child->bts; - if (!context) - return -ESRCH; - - trace = ds_read_bts(context->tracer); - if (!trace) - return -ESRCH; - - if (!trace->read) - return -EOPNOTSUPP; - - if (size < (trace->ds.top - trace->ds.begin)) - return -EIO; - - for (at = trace->ds.begin; (void *)at < trace->ds.top; - out++, drained++, at += trace->ds.size) { - struct bts_struct bts; - - error = trace->read(context->tracer, at, &bts); - if (error < 0) - return error; - - if (copy_to_user(out, &bts, sizeof(bts))) - return -EFAULT; - } - - memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - - error = ds_reset_bts(context->tracer); - if (error < 0) - return error; - - return drained; -} - -static int ptrace_bts_config(struct task_struct *child, - long cfg_size, - const struct ptrace_bts_config __user *ucfg) -{ - struct bts_context *context; - struct ptrace_bts_config cfg; - unsigned int flags = 0; - - if (cfg_size < sizeof(cfg)) - return -EIO; - - if (copy_from_user(&cfg, ucfg, sizeof(cfg))) - return -EFAULT; - - context = child->bts; - if (!context) - context = alloc_bts_context(child); - if (!context) - return -ENOMEM; - - if (cfg.flags & PTRACE_BTS_O_SIGNAL) { - if (!cfg.signal) - return -EINVAL; - - return -EOPNOTSUPP; - context->bts_ovfl_signal = cfg.signal; - } - - ds_release_bts(context->tracer); - context->tracer = NULL; - - if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) { - int err; - - free_bts_buffer(context); - if (!cfg.size) - return 0; - - err = alloc_bts_buffer(context, cfg.size); - if (err < 0) - return err; - } - - if (cfg.flags & PTRACE_BTS_O_TRACE) - flags |= BTS_USER; - - if (cfg.flags & PTRACE_BTS_O_SCHED) - flags |= BTS_TIMESTAMPS; - - context->tracer = - ds_request_bts_task(child, context->buffer, context->size, - NULL, (size_t)-1, flags); - if (unlikely(IS_ERR(context->tracer))) { - int error = PTR_ERR(context->tracer); - - free_bts_buffer(context); - context->tracer = NULL; - return error; - } - - return sizeof(cfg); -} - -static int ptrace_bts_status(struct task_struct *child, - long cfg_size, - struct ptrace_bts_config __user *ucfg) -{ - struct bts_context *context; - const struct bts_trace *trace; - struct ptrace_bts_config cfg; - - context = child->bts; - if (!context) - return -ESRCH; - - if (cfg_size < sizeof(cfg)) - return -EIO; - - trace = ds_read_bts(context->tracer); - if (!trace) - return -ESRCH; - - memset(&cfg, 0, sizeof(cfg)); - cfg.size = trace->ds.end - trace->ds.begin; - cfg.signal = context->bts_ovfl_signal; - cfg.bts_size = sizeof(struct bts_struct); - - if (cfg.signal) - cfg.flags |= PTRACE_BTS_O_SIGNAL; - - if (trace->ds.flags & BTS_USER) - cfg.flags |= PTRACE_BTS_O_TRACE; - - if (trace->ds.flags & BTS_TIMESTAMPS) - cfg.flags |= PTRACE_BTS_O_SCHED; - - if (copy_to_user(ucfg, &cfg, sizeof(cfg))) - return -EFAULT; - - return sizeof(cfg); -} - -static int ptrace_bts_clear(struct task_struct *child) -{ - struct bts_context *context; - const struct bts_trace *trace; - - context = child->bts; - if (!context) - return -ESRCH; - - trace = ds_read_bts(context->tracer); - if (!trace) - return -ESRCH; - - memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - - return ds_reset_bts(context->tracer); -} - -static int ptrace_bts_size(struct task_struct *child) -{ - struct bts_context *context; - const struct bts_trace *trace; - - context = child->bts; - if (!context) - return -ESRCH; - - trace = ds_read_bts(context->tracer); - if (!trace) - return -ESRCH; - - return (trace->ds.top - trace->ds.begin) / trace->ds.size; -} - -/* - * Called from __ptrace_unlink() after the child has been moved back - * to its original parent. - */ -void ptrace_bts_untrace(struct task_struct *child) -{ - if (unlikely(child->bts)) { - free_bts_context(child->bts); - child->bts = NULL; - } -} -#endif /* CONFIG_X86_PTRACE_BTS */ - /* * Called by kernel/ptrace.c when detaching.. * @@ -1032,39 +825,6 @@ long arch_ptrace(struct task_struct *chi break; #endif - /* - * These bits need more cooking - not enabled yet: - */ -#ifdef CONFIG_X86_PTRACE_BTS - case PTRACE_BTS_CONFIG: - ret = ptrace_bts_config - (child, data, (struct ptrace_bts_config __user *)addr); - break; - - case PTRACE_BTS_STATUS: - ret = ptrace_bts_status - (child, data, (struct ptrace_bts_config __user *)addr); - break; - - case PTRACE_BTS_SIZE: - ret = ptrace_bts_size(child); - break; - - case PTRACE_BTS_GET: - ret = ptrace_bts_read_record - (child, data, (struct bts_struct __user *) addr); - break; - - case PTRACE_BTS_CLEAR: - ret = ptrace_bts_clear(child); - break; - - case PTRACE_BTS_DRAIN: - ret = ptrace_bts_drain - (child, data, (struct bts_struct __user *) addr); - break; -#endif /* CONFIG_X86_PTRACE_BTS */ - default: ret = ptrace_request(child, request, addr, data); break; @@ -1219,14 +979,14 @@ static int genregs32_get(struct task_str { if (kbuf) { compat_ulong_t *k = kbuf; - while (count > 0) { + while (count >= sizeof(*k)) { getreg32(target, pos, k++); count -= sizeof(*k); pos += sizeof(*k); } } else { compat_ulong_t __user *u = ubuf; - while (count > 0) { + while (count >= sizeof(*u)) { compat_ulong_t word; getreg32(target, pos, &word); if (__put_user(word, u++)) @@ -1247,14 +1007,14 @@ static int genregs32_set(struct task_str int ret = 0; if (kbuf) { const compat_ulong_t *k = kbuf; - while (count > 0 && !ret) { + while (count >= sizeof(*k) && !ret) { ret = putreg32(target, pos, *k++); count -= sizeof(*k); pos += sizeof(*k); } } else { const compat_ulong_t __user *u = ubuf; - while (count > 0 && !ret) { + while (count >= sizeof(*u) && !ret) { compat_ulong_t word; ret = __get_user(word, u++); if (ret) @@ -1324,14 +1084,6 @@ long compat_arch_ptrace(struct task_stru case PTRACE_GET_THREAD_AREA: case PTRACE_SET_THREAD_AREA: -#ifdef CONFIG_X86_PTRACE_BTS - case PTRACE_BTS_CONFIG: - case PTRACE_BTS_STATUS: - case PTRACE_BTS_SIZE: - case PTRACE_BTS_GET: - case PTRACE_BTS_CLEAR: - case PTRACE_BTS_DRAIN: -#endif /* CONFIG_X86_PTRACE_BTS */ return arch_ptrace(child, request, addr, data); default: @@ -1345,7 +1097,7 @@ long compat_arch_ptrace(struct task_stru #ifdef CONFIG_X86_64 -static const struct user_regset x86_64_regsets[] = { +static struct user_regset x86_64_regsets[] __read_mostly = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), @@ -1358,6 +1110,12 @@ static const struct user_regset x86_64_r .size = sizeof(long), .align = sizeof(long), .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set }, + [REGSET_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), .align = sizeof(u64), + .active = xstateregs_active, .get = xstateregs_get, + .set = xstateregs_set + }, [REGSET_IOPERM64] = { .core_note_type = NT_386_IOPERM, .n = IO_BITMAP_LONGS, @@ -1383,7 +1141,7 @@ static const struct user_regset_view use #endif /* CONFIG_X86_64 */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION -static const struct user_regset x86_32_regsets[] = { +static struct user_regset x86_32_regsets[] __read_mostly = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), @@ -1402,6 +1160,12 @@ static const struct user_regset x86_32_r .size = sizeof(u32), .align = sizeof(u32), .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set }, + [REGSET_XSTATE] = { + .core_note_type = NT_X86_XSTATE, + .size = sizeof(u64), .align = sizeof(u64), + .active = xstateregs_active, .get = xstateregs_get, + .set = xstateregs_set + }, [REGSET_TLS] = { .core_note_type = NT_386_TLS, .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, @@ -1424,6 +1188,23 @@ static const struct user_regset_view use }; #endif +/* + * This represents bytes 464..511 in the memory layout exported through + * the REGSET_XSTATE interface. + */ +u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; + +void update_regset_xstate_info(unsigned int size, u64 xstate_mask) +{ +#ifdef CONFIG_X86_64 + x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64); +#endif +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64); +#endif + xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask; +} + const struct user_regset_view *task_user_regset_view(struct task_struct *task) { #ifdef CONFIG_IA32_EMULATION @@ -1437,21 +1218,33 @@ const struct user_regset_view *task_user #endif } -void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, - int error_code, int si_code) +static void fill_sigtrap_info(struct task_struct *tsk, + struct pt_regs *regs, + int error_code, int si_code, + struct siginfo *info) { - struct siginfo info; - tsk->thread.trap_no = 1; tsk->thread.error_code = error_code; - memset(&info, 0, sizeof(info)); - info.si_signo = SIGTRAP; - info.si_code = si_code; + memset(info, 0, sizeof(*info)); + info->si_signo = SIGTRAP; + info->si_code = si_code; + info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL; +} + +void user_single_step_siginfo(struct task_struct *tsk, + struct pt_regs *regs, + struct siginfo *info) +{ + fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info); +} - /* User-mode ip? */ - info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; +void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, + int error_code, int si_code) +{ + struct siginfo info; + fill_sigtrap_info(tsk, regs, error_code, si_code, &info); /* Send us the fake SIGTRAP */ force_sig_info(SIGTRAP, &info, tsk); } @@ -1469,7 +1262,7 @@ void send_sigtrap(struct task_struct *ts * We must return the syscall number to actually look up in the table. * This can be -1L to skip running any syscall at all. */ -asmregparm long syscall_trace_enter(struct pt_regs *regs) +long syscall_trace_enter(struct pt_regs *regs) { long ret = 0; @@ -1514,31 +1307,23 @@ asmregparm long syscall_trace_enter(stru return ret ?: regs->orig_ax; } -asmregparm void syscall_trace_leave(struct pt_regs *regs) +void syscall_trace_leave(struct pt_regs *regs) { - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); + bool step; + + audit_syscall_exit(regs); if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) trace_sys_exit(regs, regs->ax); - if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, 0); - /* * If TIF_SYSCALL_EMU is set, we only get here because of * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). * We already reported this syscall instruction in - * syscall_trace_enter(), so don't do any more now. - */ - if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) - return; - - /* - * If we are single-stepping, synthesize a trap to follow the - * system call instruction. + * syscall_trace_enter(). */ - if (test_thread_flag(TIF_SINGLESTEP) && - tracehook_consider_fatal_signal(current, SIGTRAP)) - send_sigtrap(current, regs, 0, TRAP_BRKPT); + step = unlikely(test_thread_flag(TIF_SINGLESTEP)) && + !test_thread_flag(TIF_SYSCALL_EMU); + if (step || test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_exit(regs, step); } diff -uprN linux-2.6.32/arch/x86/kernel/pvclock.c linux-2.6.32.ovz/arch/x86/kernel/pvclock.c --- linux-2.6.32/arch/x86/kernel/pvclock.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/pvclock.c 2015-03-10 13:22:11.000000000 -0700 @@ -31,50 +31,21 @@ struct pvclock_shadow_time { u32 tsc_to_nsec_mul; int tsc_shift; u32 version; + u8 flags; }; -/* - * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, - * yielding a 64-bit result. - */ -static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) -{ - u64 product; -#ifdef __i386__ - u32 tmp1, tmp2; -#endif +static u8 valid_flags __read_mostly = 0; - if (shift < 0) - delta >>= -shift; - else - delta <<= shift; - -#ifdef __i386__ - __asm__ ( - "mul %5 ; " - "mov %4,%%eax ; " - "mov %%edx,%4 ; " - "mul %5 ; " - "xor %5,%5 ; " - "add %4,%%eax ; " - "adc %5,%%edx ; " - : "=A" (product), "=r" (tmp1), "=r" (tmp2) - : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); -#elif defined(__x86_64__) - __asm__ ( - "mul %%rdx ; shrd $32,%%rdx,%%rax" - : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); -#else -#error implement me! -#endif - - return product; +void pvclock_set_flags(u8 flags) +{ + valid_flags = flags; } static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) { u64 delta = native_read_tsc() - shadow->tsc_timestamp; - return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); + return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul, + shadow->tsc_shift); } /* @@ -91,6 +62,7 @@ static unsigned pvclock_get_time_values( dst->system_timestamp = src->system_time; dst->tsc_to_nsec_mul = src->tsc_to_system_mul; dst->tsc_shift = src->tsc_shift; + dst->flags = src->flags; rmb(); /* test version after fetching data */ } while ((src->version & 1) || (dst->version != src->version)); @@ -109,11 +81,19 @@ unsigned long pvclock_tsc_khz(struct pvc return pv_tsc_khz; } +static atomic64_t last_value = ATOMIC64_INIT(0); + +void pvclock_resume(void) +{ + atomic64_set(&last_value, 0); +} + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { struct pvclock_shadow_time shadow; unsigned version; cycle_t ret, offset; + u64 last; do { version = pvclock_get_time_values(&shadow, src); @@ -123,6 +103,31 @@ cycle_t pvclock_clocksource_read(struct barrier(); } while (version != src->version); + if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && + (shadow.flags & PVCLOCK_TSC_STABLE_BIT)) + return ret; + + /* + * Assumption here is that last_value, a global accumulator, always goes + * forward. If we are less than that, we should not be much smaller. + * We assume there is an error marging we're inside, and then the correction + * does not sacrifice accuracy. + * + * For reads: global may have changed between test and return, + * but this means someone else updated poked the clock at a later time. + * We just need to make sure we are not seeing a backwards event. + * + * For updates: last_value = ret is not enough, since two vcpus could be + * updating at the same time, and one of them could be slightly behind, + * making the assumption that last_value always go forward fail to hold. + */ + last = atomic64_read(&last_value); + do { + if (ret < last) + return last; + last = atomic64_cmpxchg(&last_value, last, ret); + } while (unlikely(last != ret)); + return ret; } diff -uprN linux-2.6.32/arch/x86/kernel/quirks.c linux-2.6.32.ovz/arch/x86/kernel/quirks.c --- linux-2.6.32/arch/x86/kernel/quirks.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/quirks.c 2015-03-10 13:24:05.000000000 -0700 @@ -3,9 +3,11 @@ */ #include #include - +#include #include +struct pci_dev *mcp55_rewrite = NULL; + #if defined(CONFIG_X86_IO_APIC) && defined(CONFIG_SMP) && defined(CONFIG_PCI) static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) @@ -45,12 +47,72 @@ static void __devinit quirk_intel_irqbal if (!(config & 0x2)) pci_write_config_byte(dev, 0xf4, config); } + +static void __devinit check_mcp55_legacy_irq_routing(struct pci_dev *dev) +{ + u32 cfg; + printk(KERN_WARNING "FOUND MCP55 CHIP\n"); + /* + *Some MCP55 chips have a legacy irq routing config register, and most + *BIOS engineers have set it so that legacy interrupts are only routed + *to the BSP. While this makes sense in most cases, it doesn't work + *for kexec, since we might wind up booting on a processor other than + *the BSP. The right fix for this is to move to symmetric io mode, + *and enable the ioapics very early in the boot process. + *That seems like far to invasive a fix in RHEL5, so here, we're just + *going to check for the appropriate configuration, and tell kexec to + *rewrite the config register if we find that we need to broadcast + *legacy interrupts. + */ + pci_read_config_dword(dev, 0x74, &cfg); + /* + * We expect legacy interrupts to be routed to INTIN0 on the lapics of + * all processors (not just the BSP). To ensure this, bit 2 must be + * clear, and bit 15 must be clear. if either of these conditions is + * not met, we have fixups we need to preform a fixup on crash + */ + if (cfg & ((1 << 2) | (1 << 15))) { + /* + * Either bit 2 or 15 wasn't clear, so we need to + * rewrite this cfg register when starting kexec + */ + printk(KERN_WARNING + "DETECTED RESTRICTED ROUTING ON MCP55! FLAGGING\n"); + mcp55_rewrite = dev; + } +} + + DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0360, + check_mcp55_legacy_irq_routing); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_NVIDIA, 0x0364, + check_mcp55_legacy_irq_routing); +#endif + +#ifdef CONFIG_INTR_REMAP +static void intel_remapping_check(struct pci_dev *dev) +{ + u8 revision; + + pci_read_config_byte(dev, PCI_REVISION_ID, &revision); + + if ((revision <= 0x13) && intr_remapping_enabled) { + pr_warn(HW_ERR "This system BIOS has enabled interrupt remapping\n" + "on a chipset that contains an errata making that\n" + "feature unstable. Please reboot with intremap=off\n" + "added to the kernel command line and contact\n" + "your BIOS vendor for an update"); + } +} + +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5520_IOHUB, intel_remapping_check); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_5500_IOHUB, intel_remapping_check); #endif #if defined(CONFIG_HPET_TIMER) @@ -491,6 +553,19 @@ void force_hpet_resume(void) break; } } + +/* + * HPET MSI on some boards (ATI SB700/SB800) has side effect on + * floppy DMA. Disable HPET MSI on such platforms. + */ +static void force_disable_hpet_msi(struct pci_dev *unused) +{ + hpet_msi_disable = 1; +} + +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + force_disable_hpet_msi); + #endif #if defined(CONFIG_PCI) && defined(CONFIG_NUMA) @@ -499,6 +574,7 @@ static void __init quirk_amd_nb_node(str { struct pci_dev *nb_ht; unsigned int devfn; + u32 node; u32 val; devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); @@ -507,7 +583,13 @@ static void __init quirk_amd_nb_node(str return; pci_read_config_dword(nb_ht, 0x60, &val); - set_dev_node(&dev->dev, val & 7); + node = val & 7; + /* + * Some hardware may return an invalid node ID, + * so check it first: + */ + if (node_online(node)) + set_dev_node(&dev->dev, node); pci_dev_put(nb_ht); } @@ -529,4 +611,17 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AM quirk_amd_nb_node); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F0, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F1, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F2, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5, + quirk_amd_nb_node); + #endif diff -uprN linux-2.6.32/arch/x86/kernel/reboot.c linux-2.6.32.ovz/arch/x86/kernel/reboot.c --- linux-2.6.32/arch/x86/kernel/reboot.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/reboot.c 2015-03-10 13:24:04.000000000 -0700 @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -34,7 +35,7 @@ EXPORT_SYMBOL(pm_power_off); static const struct desc_ptr no_idt = {}; static int reboot_mode; -enum reboot_type reboot_type = BOOT_KBD; +enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) @@ -203,6 +204,15 @@ static struct dmi_system_id __initdata r DMI_MATCH(DMI_BOARD_NAME, "0T656F"), }, }, + { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G*/ + .callback = set_bios_reboot, + .ident = "Dell OptiPlex 760", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"), + DMI_MATCH(DMI_BOARD_NAME, "0G919G"), + }, + }, { /* Handle problems with rebooting on Dell 2400's */ .callback = set_bios_reboot, .ident = "Dell PowerEdge 2400", @@ -259,6 +269,14 @@ static struct dmi_system_id __initdata r DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), }, }, + { /* Handle problems with rebooting on ASUS P4S800 */ + .callback = set_bios_reboot, + .ident = "ASUS P4S800", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."), + DMI_MATCH(DMI_BOARD_NAME, "P4S800"), + }, + }, { } }; @@ -444,6 +462,14 @@ static struct dmi_system_id __initdata p DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), }, }, + { /* Handle problems with rebooting on the iMac9,1. */ + .callback = set_pci_reboot, + .ident = "Apple iMac9,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), + }, + }, { } }; @@ -511,9 +537,23 @@ void __attribute__((weak)) mach_reboot_f { } +/* + * Windows does the following on reboot: + * 1) If the FADT has the ACPI reboot register flag set, try it + * 2) If still alive, write to the keyboard controller + * 3) If still alive, write to the ACPI reboot register again + * 4) If still alive, write to the keyboard controller again + * + * If the machine is still alive at this stage, it gives up. We default to + * following the same pattern, except that if we're still alive after (4) we'll + * try to force a triple fault and then cycle between hitting the keyboard + * controller and doing that + */ static void native_machine_emergency_restart(void) { int i; + int attempt = 0; + int orig_reboot_type = reboot_type; if (reboot_emergency) emergency_vmx_disable_all(); @@ -535,6 +575,13 @@ static void native_machine_emergency_res outb(0xfe, 0x64); /* pulse reset low */ udelay(50); } + if (attempt == 0 && orig_reboot_type == BOOT_ACPI) { + attempt = 1; + reboot_type = BOOT_ACPI; + } else { + reboot_type = BOOT_TRIPLE; + } + break; case BOOT_TRIPLE: load_idt(&no_idt); @@ -605,16 +652,19 @@ void native_machine_shutdown(void) /* Make certain I only run on the appropriate processor */ set_cpus_allowed_ptr(current, cpumask_of(reboot_cpu_id)); - /* O.K Now that I'm on the appropriate processor, - * stop all of the others. + /* + * O.K Now that I'm on the appropriate processor, stop all of the + * others. Also disable the local irq to not receive the per-cpu + * timer interrupt which may trigger scheduler's load balance. */ - smp_send_stop(); + local_irq_disable(); + stop_other_cpus(); #endif lapic_shutdown(); #ifdef CONFIG_X86_IO_APIC - disable_IO_APIC(); + disable_IO_APIC(0); #endif #ifdef CONFIG_HPET_TIMER @@ -751,6 +801,8 @@ static void smp_send_nmi_allbutself(void static struct notifier_block crash_nmi_nb = { .notifier_call = crash_nmi_callback, + /* we want to be the first one called */ + .priority = NMI_LOCAL_HIGH_PRIOR+1, }; /* Halt all other CPUs, calling the specified function on each of them diff -uprN linux-2.6.32/arch/x86/kernel/resource.c linux-2.6.32.ovz/arch/x86/kernel/resource.c --- linux-2.6.32/arch/x86/kernel/resource.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/resource.c 2015-03-10 13:22:50.000000000 -0700 @@ -0,0 +1,48 @@ +#include +#include + +static void resource_clip(struct resource *res, resource_size_t start, + resource_size_t end) +{ + resource_size_t low = 0, high = 0; + + if (res->end < start || res->start > end) + return; /* no conflict */ + + if (res->start < start) + low = start - res->start; + + if (res->end > end) + high = res->end - end; + + /* Keep the area above or below the conflict, whichever is larger */ + if (low > high) + res->end = start - 1; + else + res->start = end + 1; +} + +static void remove_e820_regions(struct resource *avail) +{ + int i; + struct e820entry *entry; + + for (i = 0; i < e820.nr_map; i++) { + entry = &e820.map[i]; + + resource_clip(avail, entry->addr, + entry->addr + entry->size - 1); + } +} + +void arch_remove_reservations(struct resource *avail) +{ + /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */ + if (avail->flags & IORESOURCE_MEM) { + if (avail->start < BIOS_END) + avail->start = BIOS_END; + resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END); + + remove_e820_regions(avail); + } +} diff -uprN linux-2.6.32/arch/x86/kernel/setup.c linux-2.6.32.ovz/arch/x86/kernel/setup.c --- linux-2.6.32/arch/x86/kernel/setup.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/setup.c 2015-06-23 04:16:16.000000000 -0700 @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -49,6 +50,7 @@ #include #include #include +#include #include #include @@ -73,6 +75,7 @@ #include #include +#include #include #include #include @@ -109,6 +112,7 @@ #ifdef CONFIG_X86_64 #include #endif +#include /* * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. @@ -171,9 +175,14 @@ static struct resource bss_resource = { #ifdef CONFIG_X86_32 /* cpu data as detected by the assembly code in head.S */ struct cpuinfo_x86 new_cpu_data __cpuinitdata = {0, 0, 0, 0, -1, 1, 0, 0, -1}; +struct cpuinfo_x86_rh new_cpu_data_rh __cpuinitdata; /* common cpu data for all cpus */ struct cpuinfo_x86 boot_cpu_data __read_mostly = {0, 0, 0, 0, -1, 1, 0, 0, -1}; +/* This symbol should not be on kabi whitelists */ +struct cpuinfo_x86_rh boot_cpu_data_rh; EXPORT_SYMBOL(boot_cpu_data); +EXPORT_SYMBOL(boot_cpu_data_rh); + static void set_mca_bus(int x) { #ifdef CONFIG_MCA @@ -203,7 +212,10 @@ struct ist_info ist_info; struct cpuinfo_x86 boot_cpu_data __read_mostly = { .x86_phys_bits = MAX_PHYSMEM_BITS, }; +/* This symbol should not be on kabi whitelists */ +struct cpuinfo_x86_rh boot_cpu_data_rh __read_mostly; EXPORT_SYMBOL(boot_cpu_data); +EXPORT_SYMBOL(boot_cpu_data_rh); #endif @@ -304,6 +316,20 @@ static void __init reserve_brk(void) _brk_start = 0; } +static unsigned long __init reserve_log_buf(unsigned long len) +{ + unsigned long end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; + unsigned long addr = end_of_lowmem - len; + unsigned long mem; + + mem = find_e820_area(addr, end_of_lowmem, len, PAGE_SIZE); + if (mem == -1ULL) + return 0ULL; + + reserve_early(mem, mem + len, "LOG BUF"); + return mem; +} + #ifdef CONFIG_BLK_DEV_INITRD #define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) @@ -493,10 +519,10 @@ static void __init reserve_early_setup_d * Returns the base address on success, and -1ULL on failure. */ static -unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) +unsigned long long __init find_and_reserve_crashkernel(unsigned long long start, + unsigned long long size) { const unsigned long long alignment = 16<<20; /* 16M */ - unsigned long long start = 0LL; while (1) { int ret; @@ -530,18 +556,19 @@ static void __init reserve_crashkernel(v { unsigned long long total_mem; unsigned long long crash_size, crash_base; + int strict; int ret; total_mem = get_total_mem(); ret = parse_crashkernel(boot_command_line, total_mem, - &crash_size, &crash_base); + &crash_size, &crash_base, &strict); if (ret != 0 || crash_size <= 0) return; - /* 0 means: find the address automatically */ - if (crash_base <= 0) { - crash_base = find_and_reserve_crashkernel(crash_size); + if (!strict) { + crash_base = find_and_reserve_crashkernel(crash_base, + crash_size); if (crash_base == -1ULL) { pr_info("crashkernel reservation failed. " "No suitable area found.\n"); @@ -628,6 +655,25 @@ static int __init setup_elfcorehdr(char early_param("elfcorehdr", setup_elfcorehdr); #endif +static bool __read_mostly map_individual_e820; +static int __init setup_map_individual_e820(char *str) +{ + map_individual_e820 = 1; + + return 0; +} +early_param("map_individual_e820", setup_map_individual_e820); + +static __init void reserve_ibft_region(void) +{ + unsigned long addr, size = 0; + + addr = find_ibft_region(&size); + + if (size) + reserve_early_overlap_ok(addr, addr + size, "ibft"); +} + #ifdef CONFIG_X86_RESERVE_LOW_64K static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { @@ -666,23 +712,94 @@ static struct dmi_system_id __initdata b DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), }, }, - { /* - * AMI BIOS with low memory corruption was found on Intel DG45ID board. - * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will + * AMI BIOS with low memory corruption was found on Intel DG45ID and + * DG45FC boards. + * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will * match only DMI_BOARD_NAME and see if there is more bad products * with this vendor. */ + { .callback = dmi_low_memory_corruption, .ident = "AMI BIOS", .matches = { DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), }, }, + { + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), + }, + }, #endif {} }; +static void __init trim_bios_range(void) +{ + /* + * A special case is the first 4Kb of memory; + * This is a BIOS owned area, not kernel ram, but generally + * not listed as such in the E820 table. + */ + e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); + /* + * special case: Some BIOSen report the PC BIOS + * area (640->1Mb) as ram even though it is not. + * take them out. + */ + e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); +} + +static void rh_check_supported(void) +{ + /* The RHEL kernel does not support this hardware. The kernel will + * attempt to boot, but no support is given for this hardware */ + + + /* + * Check if we are running on VMware's hypervisor and bail out + * if we are. They run their own hypervisor and have different + * support requirements than we do. + */ + if (x86_hyper == &x86_hyper_vmware) + return; + + /* RHEL only supports Intel and AMD processors */ + if ((boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) && + (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)) { + printk(KERN_CRIT "Detected processor %s %s\n", + boot_cpu_data.x86_vendor_id, + boot_cpu_data.x86_model_id); + mark_hardware_unsupported("Processor"); + } + + /* Intel CPU family 6, model greater than 60 */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + ((boot_cpu_data.x86 == 6))) { + switch (boot_cpu_data.x86_model) { + case 77: /* Atom Avoton */ + case 70: /* Crystal Well */ + case 63: /* Grantley/Haswell EP */ + case 62: /* Ivy Town */ + case 61: /* Broadwell */ + break; + default: + if (boot_cpu_data.x86_model > 60) { + printk(KERN_CRIT + "Detected CPU family %d model %d\n", + boot_cpu_data.x86, + boot_cpu_data.x86_model); + mark_hardware_unsupported("Intel CPU model"); + } + break; + } + } +} + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -753,6 +870,7 @@ void __init setup_arch(char **cmdline_p) x86_init.oem.arch_setup(); + iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1; setup_memory_map(); parse_setup_data(); /* update the e820_saved too */ @@ -830,6 +948,7 @@ void __init setup_arch(char **cmdline_p) efi_init(); dmi_scan_machine(); + dmi_memdev_walk(); dmi_check_system(bad_bios_dmi_table); @@ -846,7 +965,7 @@ void __init setup_arch(char **cmdline_p) insert_resource(&iomem_resource, &data_resource); insert_resource(&iomem_resource, &bss_resource); - + trim_bios_range(); #ifdef CONFIG_X86_32 if (ppro_with_ram_bug()) { e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, @@ -908,10 +1027,51 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_64 if (max_pfn > max_low_pfn) { - max_pfn_mapped = init_memory_mapping(1UL<<32, - max_pfn<addr + ei->size <= 1UL << 32) + continue; + + if (ei->type == E820_RESERVED) { + if (map_begin != 0) { + max_pfn_mapped = init_memory_mapping(map_begin, map_end); + map_begin = 0; + map_end = 0; + } + continue; + } + + if (map_begin == 0) + map_begin = (ei->addr < (1UL << 32)) ? (1UL << 32) : ei->addr; + map_end = ei->addr + ei->size; + } + + if (map_begin != 0) + max_pfn_mapped = init_memory_mapping(map_begin, map_end); + + /* can we preseve max_low_pfn ?*/ + max_low_pfn = max_pfn; + } } #endif @@ -923,6 +1083,8 @@ void __init setup_arch(char **cmdline_p) if (init_ohci1394_dma_early) init_ohci1394_dma_on_all_controllers(); #endif + /* Allocate bigger log buffer as early as possible */ + setup_log_buf(reserve_log_buf); reserve_initrd(); @@ -946,6 +1108,8 @@ void __init setup_arch(char **cmdline_p) initmem_init(0, max_pfn); + pram_reserve(); + #ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. @@ -957,6 +1121,15 @@ void __init setup_arch(char **cmdline_p) */ find_smp_config(); + reserve_ibft_region(); + + /* + * The EFI specification says that boot service code won't be called + * after ExitBootServices(). This is, in fact, a lie. + */ + if (efi_enabled) + efi_reserve_boot_services(); + reserve_crashkernel(); #ifdef CONFIG_X86_64 @@ -968,8 +1141,6 @@ void __init setup_arch(char **cmdline_p) dma32_reserve_bootmem(); #endif - reserve_ibft_region(); - #ifdef CONFIG_KVM_CLOCK kvmclock_init(); #endif @@ -978,6 +1149,8 @@ void __init setup_arch(char **cmdline_p) paging_init(); x86_init.paging.pagetable_setup_done(swapper_pg_dir); + setup_trampoline_page_table(); + tboot_probe(); #ifdef CONFIG_X86_64 @@ -1014,6 +1187,7 @@ void __init setup_arch(char **cmdline_p) probe_nr_irqs_gsi(); kvm_guest_init(); + xen_hvm_guest_init(); e820_reserve_resources(); e820_mark_nosave_regions(max_low_pfn); @@ -1031,6 +1205,10 @@ void __init setup_arch(char **cmdline_p) #endif #endif x86_init.oem.banner(); + + mcheck_init(); + + rh_check_supported(); } #ifdef CONFIG_X86_32 diff -uprN linux-2.6.32/arch/x86/kernel/setup_percpu.c linux-2.6.32.ovz/arch/x86/kernel/setup_percpu.c --- linux-2.6.32/arch/x86/kernel/setup_percpu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/setup_percpu.c 2015-03-10 13:23:03.000000000 -0700 @@ -55,7 +55,7 @@ EXPORT_SYMBOL(__per_cpu_offset); #define PERCPU_FIRST_CHUNK_RESERVE 0 #endif -#ifdef CONFIG_X86_32 +#if defined(CONFIG_X86_32) && !defined(CONFIG_DEBUG_VM) /** * pcpu_need_numa - determine percpu allocation needs to consider NUMA * @@ -181,10 +181,14 @@ void __init setup_per_cpu_areas(void) * Allocate percpu area. Embedding allocator is our favorite; * however, on NUMA configurations, it can result in very * sparse unit mapping and vmalloc area isn't spacious enough - * on 32bit. Use page in that case. + * on 32bit. Use page in that case and for the debug kernel. */ #ifdef CONFIG_X86_32 +#ifdef CONFIG_DEBUG_VM + if (pcpu_chosen_fc == PCPU_FC_AUTO) +#else if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa()) +#endif pcpu_chosen_fc = PCPU_FC_PAGE; #endif rc = -EINVAL; diff -uprN linux-2.6.32/arch/x86/kernel/sfi.c linux-2.6.32.ovz/arch/x86/kernel/sfi.c --- linux-2.6.32/arch/x86/kernel/sfi.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/sfi.c 2015-03-10 13:24:03.000000000 -0700 @@ -81,7 +81,6 @@ static int __init sfi_parse_cpus(struct #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC -static u32 gsi_base; static int __init sfi_parse_ioapic(struct sfi_table_header *table) { @@ -94,8 +93,7 @@ static int __init sfi_parse_ioapic(struc pentry = (struct sfi_apic_table_entry *)sb->pentry; for (i = 0; i < num; i++) { - mp_register_ioapic(i, pentry->phys_addr, gsi_base); - gsi_base += io_apic_get_redir_entries(i); + mp_register_ioapic(i, pentry->phys_addr, gsi_top); pentry++; } diff -uprN linux-2.6.32/arch/x86/kernel/signal.c linux-2.6.32.ovz/arch/x86/kernel/signal.c --- linux-2.6.32/arch/x86/kernel/signal.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/signal.c 2015-03-10 13:21:01.000000000 -0700 @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -872,6 +873,8 @@ do_notify_resume(struct pt_regs *regs, v if (current->replacement_session_keyring) key_replace_session_keyring(); } + if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) + fire_user_return_notifiers(); #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); diff -uprN linux-2.6.32/arch/x86/kernel/smpboot.c linux-2.6.32.ovz/arch/x86/kernel/smpboot.c --- linux-2.6.32/arch/x86/kernel/smpboot.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/smpboot.c 2015-06-23 04:16:16.000000000 -0700 @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -67,10 +68,10 @@ #include #include +#include #ifdef CONFIG_X86_32 u8 apicid_2_node[MAX_APICID]; -static int low_mappings; #endif /* State of each CPU */ @@ -113,8 +114,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_core_map); DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); EXPORT_PER_CPU_SYMBOL(cpu_info); +/* Per CPU RH extended parameters */ +DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86_rh, cpu_info_rh); +EXPORT_PER_CPU_SYMBOL(cpu_info_rh); + atomic_t init_deasserted; +static void remove_siblinginfo(int cpu); + #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) /* which node each logical CPU is on */ int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 }; @@ -240,27 +247,62 @@ static void __cpuinit smp_callin(void) end_local_APIC_setup(); map_cpu_to_logical_apicid(); - notify_cpu_starting(cpuid); + + /* + * Save our processor parameters. Note: this information + * is needed for clock calibration. + */ + smp_store_cpu_info(cpuid); + /* * Get our bogomips. + * Update loops_per_jiffy in cpu_data. Previous call to + * smp_store_cpu_info() stored a value that is close but not as + * accurate as the value just calculated. * * Need to enable IRQs because it can take longer and then * the NMI watchdog might kill us. */ local_irq_enable(); calibrate_delay(); + cpu_data(cpuid).loops_per_jiffy = loops_per_jiffy; local_irq_disable(); pr_debug("Stack at about %p\n", &cpuid); /* - * Save our processor parameters + * Allow the master to continue. */ - smp_store_cpu_info(cpuid); + cpumask_set_cpu(cpuid, cpu_callin_mask); /* - * Allow the master to continue. + * Wait for signal from master CPU to continue or abort. */ - cpumask_set_cpu(cpuid, cpu_callin_mask); + while (!cpumask_test_cpu(cpuid, cpu_may_complete_boot_mask) && + cpumask_test_cpu(cpuid, cpu_callout_mask)) { + cpu_relax(); + } + + /* die if master cancelled cpu_up */ + if (!cpumask_test_cpu(cpuid, cpu_may_complete_boot_mask)) + goto die; + + notify_cpu_starting(cpuid); + return; + +die: +#ifdef CONFIG_HOTPLUG_CPU + /* was set by smp_store_cpu_info->...->numa_add_cpu */ + numa_remove_cpu(cpuid); + remove_siblinginfo(cpuid); + /* was started by end_local_APIC_setup() */ + stop_apic_nmi_watchdog(NULL); + clear_local_APIC(); + /* was set by cpu_init() */ + cpumask_clear_cpu(cpuid, cpu_initialized_mask); + cpumask_clear_cpu(cpuid, cpu_callin_mask); + play_dead(); +#endif + panic("%s: Failed to online CPU%d!\n", __func__, cpuid); } /* @@ -273,8 +315,21 @@ notrace static void __cpuinit start_seco * fragile that we want to limit the things done here to the * most necessary things. */ + +#ifdef CONFIG_X86_32 + /* + * Switch away from the trampoline page-table + * + * Do this before cpu_init() because it needs to access per-cpu + * data which may not be mapped in the trampoline page-table. + */ + load_cr3(swapper_pg_dir); + __flush_tlb_all(); +#endif + vmi_bringup(); cpu_init(); + x86_cpuinit.early_percpu_clock_init(); preempt_disable(); smp_callin(); @@ -286,17 +341,11 @@ notrace static void __cpuinit start_seco check_tsc_sync_target(); if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); + legacy_pic->chip->mask(0); enable_NMI_through_LVT0(); - enable_8259A_irq(0); + legacy_pic->chip->unmask(0); } -#ifdef CONFIG_X86_32 - while (low_mappings) - cpu_relax(); - __flush_tlb_all(); -#endif - /* This must be done before setting cpu_online_mask */ set_cpu_sibling_map(raw_smp_processor_id()); wmb(); @@ -320,6 +369,7 @@ notrace static void __cpuinit start_seco unlock_vector_lock(); ipi_call_unlock(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + x86_platform.nmi_init(); /* enable local interrupts */ local_irq_enable(); @@ -347,6 +397,12 @@ static inline void copy_cpuinfo_x86(stru } #endif /* CONFIG_CPUMASK_OFFSTACK */ +static inline void copy_cpuinfo_x86_rh(struct cpuinfo_x86_rh *dst, + const struct cpuinfo_x86_rh *src) +{ + *dst = *src; +} + /* * The bootstrap kernel entry code has set these up. Save them for * a given CPU @@ -355,13 +411,28 @@ static inline void copy_cpuinfo_x86(stru void __cpuinit smp_store_cpu_info(int id) { struct cpuinfo_x86 *c = &cpu_data(id); + struct cpuinfo_x86_rh *rh = &cpu_data_rh(id); copy_cpuinfo_x86(c, &boot_cpu_data); + copy_cpuinfo_x86_rh(rh, &boot_cpu_data_rh); c->cpu_index = id; if (id != 0) identify_secondary_cpu(c); } +static void __cpuinit link_thread_siblings(int cpu1, int cpu2) +{ + struct cpuinfo_x86 *c1 = &cpu_data(cpu1); + struct cpuinfo_x86 *c2 = &cpu_data(cpu2); + + cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2)); + cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1)); + cpumask_set_cpu(cpu1, cpu_core_mask(cpu2)); + cpumask_set_cpu(cpu2, cpu_core_mask(cpu1)); + cpumask_set_cpu(cpu1, c2->llc_shared_map); + cpumask_set_cpu(cpu2, c1->llc_shared_map); +} + void __cpuinit set_cpu_sibling_map(int cpu) { @@ -374,14 +445,14 @@ void __cpuinit set_cpu_sibling_map(int c for_each_cpu(i, cpu_sibling_setup_mask) { struct cpuinfo_x86 *o = &cpu_data(i); - if (c->phys_proc_id == o->phys_proc_id && - c->cpu_core_id == o->cpu_core_id) { - cpumask_set_cpu(i, cpu_sibling_mask(cpu)); - cpumask_set_cpu(cpu, cpu_sibling_mask(i)); - cpumask_set_cpu(i, cpu_core_mask(cpu)); - cpumask_set_cpu(cpu, cpu_core_mask(i)); - cpumask_set_cpu(i, c->llc_shared_map); - cpumask_set_cpu(cpu, o->llc_shared_map); + if (cpu_has_topoext) { + if (c->phys_proc_id == o->phys_proc_id && + per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) && + c->compute_unit_id == o->compute_unit_id) + link_thread_siblings(cpu, i); + } else if (c->phys_proc_id == o->phys_proc_id && + c->cpu_core_id == o->cpu_core_id) { + link_thread_siblings(cpu, i); } } } else { @@ -545,6 +616,7 @@ wakeup_secondary_cpu_via_init(int phys_a { unsigned long send_status, accept_status = 0; int maxlvt, num_starts, j; + unsigned long flags; maxlvt = lapic_get_maxlvt(); @@ -565,6 +637,7 @@ wakeup_secondary_cpu_via_init(int phys_a /* * Send IPI */ + local_irq_save(flags); apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, phys_apicid); @@ -581,6 +654,7 @@ wakeup_secondary_cpu_via_init(int phys_a pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); + local_irq_restore(flags); mb(); atomic_set(&init_deasserted, 1); @@ -622,6 +696,7 @@ wakeup_secondary_cpu_via_init(int phys_a /* Target chip */ /* Boot on the stack */ /* Kick the second */ + local_irq_save(flags); apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), phys_apicid); @@ -634,6 +709,7 @@ wakeup_secondary_cpu_via_init(int phys_a pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); + local_irq_restore(flags); /* * Give the other CPU some time to accept the IPI. @@ -671,6 +747,26 @@ static void __cpuinit do_fork_idle(struc complete(&c_idle->done); } +/* reduce the number of lines printed when booting a large cpu count system */ +static void __cpuinit announce_cpu(int cpu, int apicid) +{ + static int current_node = -1; + int node = cpu_to_node(cpu); + + if (system_state == SYSTEM_BOOTING) { + if (node != current_node) { + if (current_node > (-1)) + pr_cont(" Ok.\n"); + current_node = node; + pr_info("Booting Node %3d, Processors ", node); + } + pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : ""); + return; + } else + pr_info("Booting Node %d Processor %d APIC 0x%x\n", + node, cpu, apicid); +} + /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * (ie clustered apic addressing mode), this is a LOGICAL apic ID. @@ -722,6 +818,7 @@ do_rest: #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); + initial_page_table = __pa(&trampoline_pg_dir); #else clear_tsk_thread_flag(c_idle.idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); @@ -733,12 +830,17 @@ do_rest: initial_code = (unsigned long)start_secondary; stack_start.sp = (void *) c_idle.idle->thread.sp; +#ifdef CONFIG_VE + /* Cosmetic: sleep_time won't be changed afterwards for the idle + * thread; keep it 0 rather than -cycles. */ + VE_TASK_INFO(c_idle.idle)->sleep_time = 0; +#endif + /* start_ip had better be page-aligned! */ start_ip = setup_trampoline(); - /* So we see what's up */ - printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n", - cpu, apicid, start_ip); + /* So we see what's up */ + announce_cpu(cpu, apicid); /* * This grunge runs the startup process for @@ -765,10 +867,12 @@ do_rest: * Kick the secondary CPU. Use the method in the APIC driver * if it's defined - or use an INIT boot APIC message otherwise: */ + preempt_disable(); if (apic->wakeup_secondary_cpu) boot_error = apic->wakeup_secondary_cpu(apicid, start_ip); else boot_error = wakeup_secondary_cpu_via_init(apicid, start_ip); + preempt_enable(); if (!boot_error) { /* @@ -785,23 +889,28 @@ do_rest: if (cpumask_test_cpu(cpu, cpu_callin_mask)) break; /* It has booted */ udelay(100); + /* + * Allow other tasks to run while we wait for the + * AP to come online. This also gives a chance + * for the MTRR work(triggered by the AP coming online) + * to be completed in the stop machine context. + */ + schedule(); } if (cpumask_test_cpu(cpu, cpu_callin_mask)) { - /* number CPUs logically, starting from 1 (BSP is 0) */ - pr_debug("OK.\n"); - printk(KERN_INFO "CPU%d: ", cpu); - print_cpu_info(&cpu_data(cpu)); - pr_debug("CPU has booted.\n"); + /* Signal AP that it may continue to boot */ + cpumask_set_cpu(cpu, cpu_may_complete_boot_mask); + pr_debug("CPU%d: has booted.\n", cpu); } else { boot_error = 1; if (*((volatile unsigned char *)trampoline_base) == 0xA5) /* trampoline started but...? */ - printk(KERN_ERR "Stuck ??\n"); + pr_err("CPU%d: Stuck ??\n", cpu); else /* trampoline code not run */ - printk(KERN_ERR "Not responding.\n"); + pr_err("CPU%d: Not responding.\n", cpu); if (apic->inquire_remote_apic) apic->inquire_remote_apic(apicid); } @@ -866,20 +975,8 @@ int __cpuinit native_cpu_up(unsigned int per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; -#ifdef CONFIG_X86_32 - /* init low mem mapping */ - clone_pgd_range(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); - flush_tlb_all(); - low_mappings = 1; - err = do_boot_cpu(apicid, cpu); - zap_low_mappings(false); - low_mappings = 0; -#else - err = do_boot_cpu(apicid, cpu); -#endif if (err) { pr_debug("do_boot_cpu failed %d\n", err); return -EIO; @@ -1049,6 +1146,7 @@ void __init native_smp_prepare_cpus(unsi smp_cpu_index_default(); current_cpu_data = boot_cpu_data; cpumask_copy(cpu_callin_mask, cpumask_of(0)); + cpumask_copy(cpu_may_complete_boot_mask, cpumask_of(0)); mb(); /* * Setup boot CPU information @@ -1065,10 +1163,6 @@ void __init native_smp_prepare_cpus(unsi } set_cpu_sibling_map(0); - enable_IR_x2apic(); -#ifdef CONFIG_X86_64 - default_setup_apic_routing(); -#endif if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); @@ -1076,6 +1170,8 @@ void __init native_smp_prepare_cpus(unsi goto out; } + default_setup_apic_routing(); + preempt_disable(); if (read_apic_id() != boot_cpu_physical_apicid) { panic("Boot APIC ID in local APIC unexpected (%d vs %d)", @@ -1196,11 +1292,12 @@ __init void prefill_possible_map(void) total_cpus = max_t(int, possible, num_processors + disabled_cpus); - if (possible > CONFIG_NR_CPUS) { + /* nr_cpu_ids could be reduced via nr_cpus= */ + if (possible > nr_cpu_ids) { printk(KERN_WARNING "%d Processors exceeds NR_CPUS limit of %d\n", - possible, CONFIG_NR_CPUS); - possible = CONFIG_NR_CPUS; + possible, nr_cpu_ids); + possible = nr_cpu_ids; } printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", @@ -1244,22 +1341,15 @@ static void __ref remove_cpu_from_maps(i cpumask_clear_cpu(cpu, cpu_callin_mask); /* was set by cpu_init() */ cpumask_clear_cpu(cpu, cpu_initialized_mask); + /* set by do_boot_cpu() */ + cpumask_clear_cpu(cpu, cpu_may_complete_boot_mask); numa_remove_cpu(cpu); } void cpu_disable_common(void) { int cpu = smp_processor_id(); - /* - * HACK: - * Allow any queued timer interrupts to get serviced - * This is only a temporary solution until we cleanup - * fixup_irqs as we do for IA64. - */ - local_irq_enable(); - mdelay(1); - local_irq_disable(); remove_siblinginfo(cpu); /* It's now safe to remove this processor from the online map */ @@ -1272,6 +1362,11 @@ void cpu_disable_common(void) int native_cpu_disable(void) { int cpu = smp_processor_id(); + int ret; + + ret = check_irq_vectors_for_cpu_disable(); + if (ret) + return ret; /* * Perhaps use cpufreq to drop frequency, but that could go @@ -1300,14 +1395,16 @@ void native_cpu_die(unsigned int cpu) for (i = 0; i < 10; i++) { /* They ack this in play_dead by setting CPU_DEAD */ if (per_cpu(cpu_state, cpu) == CPU_DEAD) { - printk(KERN_INFO "CPU %d is now offline\n", cpu); + if (system_state == SYSTEM_RUNNING) + pr_info("CPU %u is now offline\n", cpu); + if (1 == num_online_cpus()) alternatives_smp_switch(0); return; } msleep(100); } - printk(KERN_ERR "CPU %u didn't die...\n", cpu); + pr_err("CPU %u didn't die...\n", cpu); } void play_dead_common(void) @@ -1327,11 +1424,88 @@ void play_dead_common(void) local_irq_disable(); } +/* + * We need to flush the caches before going to sleep, lest we have + * dirty data in our caches when we come back up. + */ +static inline void mwait_play_dead(void) +{ + unsigned int eax, ebx, ecx, edx; + unsigned int highest_cstate = 0; + unsigned int highest_subcstate = 0; + int i; + void *mwait_ptr; + + if (!(cpu_has(¤t_cpu_data, X86_FEATURE_MWAIT) && mwait_usable(¤t_cpu_data))) + return; + if (!cpu_has(¤t_cpu_data, X86_FEATURE_CLFLSH)) + return; + if (current_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return; + + eax = CPUID_MWAIT_LEAF; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + + /* + * eax will be 0 if EDX enumeration is not valid. + * Initialized below to cstate, sub_cstate value when EDX is valid. + */ + if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) { + eax = 0; + } else { + edx >>= MWAIT_SUBSTATE_SIZE; + for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { + if (edx & MWAIT_SUBSTATE_MASK) { + highest_cstate = i; + highest_subcstate = edx & MWAIT_SUBSTATE_MASK; + } + } + eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) | + (highest_subcstate - 1); + } + + /* + * This should be a memory location in a cache line which is + * unlikely to be touched by other processors. The actual + * content is immaterial as it is not actually modified in any way. + */ + mwait_ptr = ¤t_thread_info()->flags; + + wbinvd(); + + while (1) { + /* + * The CLFLUSH is a workaround for erratum AAI65 for + * the Xeon 7400 series. It's not clear it is actually + * needed, but it should be harmless in either case. + * The WBINVD is insufficient due to the spurious-wakeup + * case where we return around the loop. + */ + clflush(mwait_ptr); + __monitor(mwait_ptr, 0, 0); + mb(); + __mwait(eax, 0); + } +} + +static inline void hlt_play_dead(void) +{ + if (current_cpu_data.x86 >= 4) + wbinvd(); + + while (1) { + native_halt(); + } +} + void native_play_dead(void) { play_dead_common(); tboot_shutdown(TB_SHUTDOWN_WFS); - wbinvd_halt(); + + mwait_play_dead(); /* Only returns on failure */ + hlt_play_dead(); } #else /* ... !CONFIG_HOTPLUG_CPU */ diff -uprN linux-2.6.32/arch/x86/kernel/smp.c linux-2.6.32.ovz/arch/x86/kernel/smp.c --- linux-2.6.32/arch/x86/kernel/smp.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/smp.c 2015-06-23 04:16:16.000000000 -0700 @@ -27,6 +27,7 @@ #include #include #include +#include /* * Some notes on x86 processor bugs affecting SMP operation: * @@ -158,10 +159,10 @@ asmlinkage void smp_reboot_interrupt(voi irq_exit(); } -static void native_smp_send_stop(void) +static void native_stop_other_cpus(int wait) { unsigned long flags; - unsigned long wait; + unsigned long timeout; if (reboot_force) return; @@ -178,9 +179,12 @@ static void native_smp_send_stop(void) if (num_online_cpus() > 1) { apic->send_IPI_allbutself(REBOOT_VECTOR); - /* Don't wait longer than a second */ - wait = USEC_PER_SEC; - while (num_online_cpus() > 1 && wait--) + /* + * Don't wait longer than a second if the caller + * didn't ask us to wait. + */ + timeout = USEC_PER_SEC; + while (num_online_cpus() > 1 && (wait || timeout--)) udelay(1); } @@ -194,31 +198,92 @@ static void native_smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. */ +static inline void __smp_reschedule_interrupt(void) +{ + inc_irq_stat(irq_resched_count); + scheduler_ipi(); +} + void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); - inc_irq_stat(irq_resched_count); + __smp_reschedule_interrupt(); /* * KVM uses this interrupt to force a cpu out of guest mode */ } -void smp_call_function_interrupt(struct pt_regs *regs) +static inline void smp_entering_irq(void) { ack_APIC_irq(); irq_enter(); +} + +void smp_trace_reschedule_interrupt(struct pt_regs *regs) +{ + /* + * Need to call irq_enter() before calling the trace point. + * __smp_reschedule_interrupt() calls irq_enter/exit() too (in + * scheduler_ipi(). This is OK, since those functions are allowed + * to nest. + */ + smp_entering_irq(); + trace_reschedule_entry(RESCHEDULE_VECTOR); + __smp_reschedule_interrupt(); + trace_reschedule_exit(RESCHEDULE_VECTOR); + exiting_irq(); + /* + * KVM uses this interrupt to force a cpu out of guest mode + */ +} + +static inline void __smp_call_function_interrupt(void) +{ generic_smp_call_function_interrupt(); inc_irq_stat(irq_call_count); - irq_exit(); } -void smp_call_function_single_interrupt(struct pt_regs *regs) +void smp_call_function_interrupt(struct pt_regs *regs) +{ + smp_entering_irq(); + __smp_call_function_interrupt(); + exiting_irq(); +} + +void smp_trace_call_function_interrupt(struct pt_regs *regs) +{ + smp_entering_irq(); + trace_call_function_entry(CALL_FUNCTION_VECTOR); + __smp_call_function_interrupt(); + trace_call_function_exit(CALL_FUNCTION_VECTOR); + exiting_irq(); +} + +static inline void __smp_call_function_single_interrupt(void) { - ack_APIC_irq(); - irq_enter(); generic_smp_call_function_single_interrupt(); inc_irq_stat(irq_call_count); - irq_exit(); +} + +void smp_call_function_single_interrupt(struct pt_regs *regs) +{ + smp_entering_irq(); + __smp_call_function_single_interrupt(); + exiting_irq(); +} + +void smp_trace_call_function_single_interrupt(struct pt_regs *regs) +{ + smp_entering_irq(); + trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR); + __smp_call_function_single_interrupt(); + trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR); + exiting_irq(); +} + +void send_nmi_ipi_allbutself(void) +{ + apic->send_IPI_allbutself(NMI_VECTOR); } struct smp_ops smp_ops = { @@ -226,7 +291,7 @@ struct smp_ops smp_ops = { .smp_prepare_cpus = native_smp_prepare_cpus, .smp_cpus_done = native_smp_cpus_done, - .smp_send_stop = native_smp_send_stop, + .stop_other_cpus = native_stop_other_cpus, .smp_send_reschedule = native_smp_send_reschedule, .cpu_up = native_cpu_up, diff -uprN linux-2.6.32/arch/x86/kernel/stacktrace.c linux-2.6.32.ovz/arch/x86/kernel/stacktrace.c --- linux-2.6.32/arch/x86/kernel/stacktrace.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/stacktrace.c 2015-03-10 13:23:22.000000000 -0700 @@ -53,17 +53,19 @@ save_stack_address_nosched(void *data, u } static const struct stacktrace_ops save_stack_ops = { - .warning = save_stack_warning, - .warning_symbol = save_stack_warning_symbol, - .stack = save_stack_stack, - .address = save_stack_address, + .warning = save_stack_warning, + .warning_symbol = save_stack_warning_symbol, + .stack = save_stack_stack, + .address = save_stack_address, + .walk_stack = print_context_stack, }; static const struct stacktrace_ops save_stack_ops_nosched = { - .warning = save_stack_warning, - .warning_symbol = save_stack_warning_symbol, - .stack = save_stack_stack, - .address = save_stack_address_nosched, + .warning = save_stack_warning, + .warning_symbol = save_stack_warning_symbol, + .stack = save_stack_stack, + .address = save_stack_address_nosched, + .walk_stack = print_context_stack, }; /* @@ -71,22 +73,22 @@ static const struct stacktrace_ops save_ */ void save_stack_trace(struct stack_trace *trace) { - dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); + dump_trace(current, NULL, NULL, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace); -void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) +void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) { - dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); + dump_trace(current, regs, NULL, &save_stack_ops, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); + dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } @@ -94,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ -struct stack_frame { +struct stack_frame_user { const void __user *next_fp; unsigned long ret_addr; }; -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +static int +copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) { int ret; @@ -124,7 +127,7 @@ static inline void __save_stack_trace_us trace->entries[trace->nr_entries++] = regs->ip; while (trace->nr_entries < trace->max_entries) { - struct stack_frame frame; + struct stack_frame_user frame; frame.next_fp = NULL; frame.ret_addr = 0; diff -uprN linux-2.6.32/arch/x86/kernel/step.c linux-2.6.32.ovz/arch/x86/kernel/step.c --- linux-2.6.32/arch/x86/kernel/step.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/step.c 2015-03-10 13:21:31.000000000 -0700 @@ -158,22 +158,6 @@ static int enable_single_step(struct tas } /* - * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running. - */ -static void write_debugctlmsr(struct task_struct *child, unsigned long val) -{ - if (child->thread.debugctlmsr == val) - return; - - child->thread.debugctlmsr = val; - - if (child != current) - return; - - update_debugctlmsr(val); -} - -/* * Enable single or block step. */ static void enable_step(struct task_struct *child, bool block) @@ -186,15 +170,17 @@ static void enable_step(struct task_stru * that uses user-mode single stepping itself. */ if (enable_single_step(child) && block) { - set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - write_debugctlmsr(child, - child->thread.debugctlmsr | DEBUGCTLMSR_BTF); - } else { - write_debugctlmsr(child, - child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); + unsigned long debugctl = get_debugctlmsr(); - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + debugctl |= DEBUGCTLMSR_BTF; + update_debugctlmsr(debugctl); + set_tsk_thread_flag(child, TIF_BLOCKSTEP); + } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) { + unsigned long debugctl = get_debugctlmsr(); + + debugctl &= ~DEBUGCTLMSR_BTF; + update_debugctlmsr(debugctl); + clear_tsk_thread_flag(child, TIF_BLOCKSTEP); } } @@ -213,11 +199,13 @@ void user_disable_single_step(struct tas /* * Make sure block stepping (BTF) is disabled. */ - write_debugctlmsr(child, - child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); + if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) { + unsigned long debugctl = get_debugctlmsr(); - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + debugctl &= ~DEBUGCTLMSR_BTF; + update_debugctlmsr(debugctl); + clear_tsk_thread_flag(child, TIF_BLOCKSTEP); + } /* Always clear TIF_SINGLESTEP... */ clear_tsk_thread_flag(child, TIF_SINGLESTEP); diff -uprN linux-2.6.32/arch/x86/kernel/syscall_table_32.S linux-2.6.32.ovz/arch/x86/kernel/syscall_table_32.S --- linux-2.6.32/arch/x86/kernel/syscall_table_32.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/syscall_table_32.S 2015-06-23 04:16:16.000000000 -0700 @@ -191,7 +191,7 @@ ENTRY(sys_call_table) .long sys_ni_syscall /* reserved for streams2 */ .long ptregs_vfork /* 190 */ .long sys_getrlimit - .long sys_mmap2 + .long sys_mmap_pgoff .long sys_truncate64 .long sys_ftruncate64 .long sys_stat64 /* 195 */ @@ -336,3 +336,36 @@ ENTRY(sys_call_table) .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open + .long sys_recvmmsg + .long sys_ni_syscall /* sys_fanotify_init */ + .long sys_ni_syscall /* sys_fanotify_mark */ + .long sys_ni_syscall /* sys_prlimit64 340 */ + .long sys_name_to_handle_at + .long sys_open_by_handle_at + .long sys_clock_adjtime + .long sys_syncfs + .long sys_sendmmsg /* 345 */ + .long sys_setns + .long sys_process_vm_readv + .long sys_process_vm_writev + .rept 500-(.-sys_call_table)/4 + .long sys_ni_syscall + .endr + .long sys_fairsched_mknod /* 500 */ + .long sys_fairsched_rmnod + .long sys_fairsched_chwt + .long sys_fairsched_mvpr + .long sys_fairsched_rate + .long sys_fairsched_vcpus /* 505 */ + .long sys_fairsched_cpumask + .long sys_fairsched_nodemask + .long sys_ni_syscall + .long sys_ni_syscall + .long sys_getluid /* 510 */ + .long sys_setluid + .long sys_setublimit + .long sys_ubstat + .long sys_ni_syscall + .long sys_ni_syscall /* 515 */ + .long sys_lchmod + .long sys_lutime diff -uprN linux-2.6.32/arch/x86/kernel/sys_i386_32.c linux-2.6.32.ovz/arch/x86/kernel/sys_i386_32.c --- linux-2.6.32/arch/x86/kernel/sys_i386_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/sys_i386_32.c 2015-06-23 04:16:16.000000000 -0700 @@ -18,37 +18,12 @@ #include #include #include - +#include #include #include #include -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file *file = NULL; - struct mm_struct *mm = current->mm; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(&mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(&mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. Linux/i386 didn't use to be able to handle more than @@ -77,7 +52,7 @@ asmlinkage int old_mmap(struct mmap_arg_ if (a.offset & ~PAGE_MASK) goto out; - err = sys_mmap2(a.addr, a.len, a.prot, a.flags, + err = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); out: return err; @@ -246,3 +221,4 @@ int kernel_execve(const char *filename, : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory"); return __res; } +EXPORT_SYMBOL(kernel_execve); diff -uprN linux-2.6.32/arch/x86/kernel/sys_x86_64.c linux-2.6.32.ovz/arch/x86/kernel/sys_x86_64.c --- linux-2.6.32/arch/x86/kernel/sys_x86_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/sys_x86_64.c 2015-03-10 13:23:56.000000000 -0700 @@ -14,35 +14,84 @@ #include #include #include +#include +#include #include #include +/* + * Align a virtual address to avoid aliasing in the I$ on AMD F15h. + * + * @flags denotes the allocation direction - bottomup or topdown - + * or vDSO; see call sites below. + */ +unsigned long align_addr(unsigned long addr, struct file *filp, + enum align_flags flags) +{ + unsigned long tmp_addr; + + /* handle 32- and 64-bit case with a single conditional */ + if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) + return addr; + + if (!(current->flags & PF_RANDOMIZE)) + return addr; + + if (!((flags & ALIGN_VDSO) || filp)) + return addr; + + tmp_addr = addr; + + /* + * We need an address which is <= than the original + * one only when in topdown direction. + */ + if (!(flags & ALIGN_TOPDOWN)) + tmp_addr += va_align.mask; + + tmp_addr &= ~va_align.mask; + + return tmp_addr; +} + +static int __init control_va_addr_alignment(char *str) +{ + /* guard against enabling this on other CPU families */ + if (va_align.flags < 0) + return 1; + + if (*str == 0) + return 1; + + if (*str == '=') + str++; + + if (!strcmp(str, "32")) + va_align.flags = ALIGN_VA_32; + else if (!strcmp(str, "64")) + va_align.flags = ALIGN_VA_64; + else if (!strcmp(str, "off")) + va_align.flags = 0; + else if (!strcmp(str, "on")) + va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; + else + return 0; + + return 1; +} +__setup("align_va_addr", control_va_addr_alignment); + SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, off) { long error; - struct file *file; - error = -EINVAL; if (off & ~PAGE_MASK) goto out; - error = -EBADF; - file = NULL; - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); + error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT); out: return error; } @@ -80,6 +129,7 @@ arch_get_unmapped_area(struct file *filp struct vm_area_struct *vma; unsigned long start_addr; unsigned long begin, end; + unsigned int unmap_factor = sysctl_unmap_area_factor; if (flags & MAP_FIXED) return addr; @@ -93,12 +143,13 @@ arch_get_unmapped_area(struct file *filp addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); if (end - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vma->vm_start) && + (addr >= mmap_min_addr)) return addr; } - if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) - && len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; + if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))) { + if (!unmap_factor && len <= mm->cached_hole_size) + mm->cached_hole_size = 0; mm->free_area_cache = begin; } addr = mm->free_area_cache; @@ -107,6 +158,9 @@ arch_get_unmapped_area(struct file *filp start_addr = addr; full_search: + + addr = align_addr(addr, filp, 0); + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). */ if (end - len < addr) { @@ -116,7 +170,8 @@ full_search: */ if (start_addr != begin) { start_addr = addr = begin; - mm->cached_hole_size = 0; + if (likely(!unmap_factor)) + mm->cached_hole_size = 0; goto full_search; } return -ENOMEM; @@ -128,10 +183,12 @@ full_search: mm->free_area_cache = addr + len; return addr; } - if (addr + mm->cached_hole_size < vma->vm_start) + if (!unmap_factor && + addr + mm->cached_hole_size < vma->vm_start) mm->cached_hole_size = vma->vm_start - addr; addr = vma->vm_end; + addr = align_addr(addr, filp, 0); } } @@ -144,6 +201,8 @@ arch_get_unmapped_area_topdown(struct fi struct vm_area_struct *vma; struct mm_struct *mm = current->mm; unsigned long addr = addr0; + unsigned int unmap_factor = sysctl_unmap_area_factor; + int firsttime = 1; /* requested length too big for entire address space */ if (len > TASK_SIZE) @@ -161,51 +220,77 @@ arch_get_unmapped_area_topdown(struct fi addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vma->vm_start) && + (addr >= mmap_min_addr)) return addr; } /* check if free_area_cache is useful for us */ - if (len <= mm->cached_hole_size) { + if (len <= mm->cached_hole_size && !unmap_factor) { mm->cached_hole_size = 0; mm->free_area_cache = mm->mmap_base; } + again: /* either no address requested or can't fit in requested address hole */ addr = mm->free_area_cache; /* make sure it can fit in the remaining address space */ if (addr > len) { - vma = find_vma(mm, addr-len); - if (!vma || addr <= vma->vm_start) + unsigned long tmp_addr = align_addr(addr - len, filp, + ALIGN_TOPDOWN); + + vma = find_vma(mm, tmp_addr); + if ((!vma || tmp_addr + len <= vma->vm_start) && + (tmp_addr >= mmap_min_addr)) /* remember the address as a hint for next time */ - return mm->free_area_cache = addr-len; + return mm->free_area_cache = tmp_addr; } if (mm->mmap_base < len) goto bottomup; - addr = mm->mmap_base-len; + if (likely(!unmap_factor)) + addr = mm->mmap_base-len; do { + addr = align_addr(addr, filp, ALIGN_TOPDOWN); + /* * Lookup failure means no vma is above this address, * else if new region fits below vma->vm_start, * return with success: */ vma = find_vma(mm, addr); - if (!vma || addr+len <= vma->vm_start) + if (!vma || addr+len <= vma->vm_start) { + /* we hit the bottom, stop this search */ + if (addr < mmap_min_addr) + break; /* remember the address as a hint for next time */ return mm->free_area_cache = addr; + } /* remember the largest hole we saw so far */ - if (addr + mm->cached_hole_size < vma->vm_start) + if (!unmap_factor && + addr + mm->cached_hole_size < vma->vm_start) mm->cached_hole_size = vma->vm_start - addr; /* try just below the current vma->vm_start */ addr = vma->vm_start-len; } while (len < vma->vm_start); + /* + * Using the next-fit algorithm, it is possible we started + * searching below usable address space holes. Go back to the + * top and start over. + */ + if (unmap_factor && firsttime) { + mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = 0; + firsttime = 0; + goto again; + } + bottomup: /* * A failed mmap() very likely causes application failure, @@ -213,14 +298,16 @@ bottomup: * can happen with large stack limits and large mmap() * allocations. */ - mm->cached_hole_size = ~0UL; + if (likely(!unmap_factor)) + mm->cached_hole_size = ~0UL; mm->free_area_cache = TASK_UNMAPPED_BASE; addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); /* * Restore the topdown base: */ mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = ~0UL; + if (likely(!unmap_factor)) + mm->cached_hole_size = ~0UL; return addr; } diff -uprN linux-2.6.32/arch/x86/kernel/tboot.c linux-2.6.32.ovz/arch/x86/kernel/tboot.c --- linux-2.6.32/arch/x86/kernel/tboot.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/tboot.c 2015-03-10 13:22:02.000000000 -0700 @@ -46,6 +46,7 @@ /* Global pointer to shared data; NULL means no measured launch. */ struct tboot *tboot __read_mostly; +EXPORT_SYMBOL(tboot); /* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ #define AP_WAIT_TIMEOUT 1 @@ -132,7 +133,7 @@ static int map_tboot_page(unsigned long pmd = pmd_alloc(&tboot_mm, pud, vaddr); if (!pmd) return -1; - pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr); if (!pte) return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); @@ -175,6 +176,9 @@ static void add_mac_region(phys_addr_t s struct tboot_mac_region *mr; phys_addr_t end = start + size; + if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS) + panic("tboot: Too many MAC regions\n"); + if (start && size) { mr = &tboot->mac_regions[tboot->num_mac_regions++]; mr->start = round_down(start, PAGE_SIZE); @@ -184,18 +188,17 @@ static void add_mac_region(phys_addr_t s static int tboot_setup_sleep(void) { + int i; + tboot->num_mac_regions = 0; - /* S3 resume code */ - add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); + for (i = 0; i < e820.nr_map; i++) { + if ((e820.map[i].type != E820_RAM) + && (e820.map[i].type != E820_RESERVED_KERN)) + continue; -#ifdef CONFIG_X86_TRAMPOLINE - /* AP trampoline code */ - add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE); -#endif - - /* kernel code + data + bss */ - add_mac_region(virt_to_phys(_text), _end - _text); + add_mac_region(e820.map[i].addr, e820.map[i].size); + } tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; diff -uprN linux-2.6.32/arch/x86/kernel/time.c linux-2.6.32.ovz/arch/x86/kernel/time.c --- linux-2.6.32/arch/x86/kernel/time.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/time.c 2015-06-23 04:16:16.000000000 -0700 @@ -77,7 +77,23 @@ static irqreturn_t timer_interrupt(int i spin_unlock(&i8259A_lock); } - global_clock_event->event_handler(global_clock_event); + /* + * If hpet is enabled by hpet_late_init(), event_handler can be left + * uninitialized by clockevents_register_device() because of + * hpet_clockevent low rating (by the time hpet_late_init() is called, + * high prio apic timers have already been setup). The event_handler is + * then initialized a bit later by the clocksource_done_booting() + * procedure. + * + * Normally, timer interrupts should not be delivered between these two + * calls, but if e.g. the kernel is booted using kexec, there might be + * some pending interrupts from the previous kernel's context, which + * can lead to a NULL pointer dereference. + * + * So, take precautions against spurious timer interrupts. + */ + if (global_clock_event->event_handler) + global_clock_event->event_handler(global_clock_event); /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ if (MCA_bus) diff -uprN linux-2.6.32/arch/x86/kernel/tlb_uv.c linux-2.6.32.ovz/arch/x86/kernel/tlb_uv.c --- linux-2.6.32/arch/x86/kernel/tlb_uv.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/tlb_uv.c 2015-03-10 13:23:11.000000000 -0700 @@ -1,14 +1,17 @@ /* * SGI UltraViolet TLB flush routines. * - * (c) 2008 Cliff Wickman , SGI. + * (c) 2008-2011 Cliff Wickman , SGI. * * This code is released under the GNU General Public License version 2 or * later. */ #include #include +#include #include +#include +#include #include #include @@ -19,42 +22,129 @@ #include #include #include +#include -static struct bau_control **uv_bau_table_bases __read_mostly; -static int uv_bau_retry_limit __read_mostly; +/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ +static int timeout_base_ns[] = { + 20, + 160, + 1280, + 10240, + 81920, + 655360, + 5242880, + 167772160 +}; -/* base pnode in this partition */ -static int uv_partition_base_pnode __read_mostly; +static int timeout_us; +static int nobau; +static int baudisabled; +static spinlock_t disable_lock; +static cycles_t congested_cycles; + +/* tunables: */ +static int max_concurr = MAX_BAU_CONCURRENT; +static int max_concurr_const = MAX_BAU_CONCURRENT; +static int plugged_delay = PLUGGED_DELAY; +static int plugsb4reset = PLUGSB4RESET; +static int timeoutsb4reset = TIMEOUTSB4RESET; +static int ipi_reset_limit = IPI_RESET_LIMIT; +static int complete_threshold = COMPLETE_THRESHOLD; +static int congested_respns_us = CONGESTED_RESPONSE_US; +static int congested_reps = CONGESTED_REPS; +static int congested_period = CONGESTED_PERIOD; + +static struct tunables tunables[] = { + {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */ + {&plugged_delay, PLUGGED_DELAY}, + {&plugsb4reset, PLUGSB4RESET}, + {&timeoutsb4reset, TIMEOUTSB4RESET}, + {&ipi_reset_limit, IPI_RESET_LIMIT}, + {&complete_threshold, COMPLETE_THRESHOLD}, + {&congested_respns_us, CONGESTED_RESPONSE_US}, + {&congested_reps, CONGESTED_REPS}, + {&congested_period, CONGESTED_PERIOD} +}; + +static struct dentry *tunables_dir; +static struct dentry *tunables_file; + +/* these correspond to the statistics printed by ptc_seq_show() */ +static char *stat_description[] = { + "sent: number of shootdown messages sent", + "stime: time spent sending messages", + "numuvhubs: number of hubs targeted with shootdown", + "numuvhubs16: number times 16 or more hubs targeted", + "numuvhubs8: number times 8 or more hubs targeted", + "numuvhubs4: number times 4 or more hubs targeted", + "numuvhubs2: number times 2 or more hubs targeted", + "numuvhubs1: number times 1 hub targeted", + "numcpus: number of cpus targeted with shootdown", + "dto: number of destination timeouts", + "retries: destination timeout retries sent", + "rok: : destination timeouts successfully retried", + "resetp: ipi-style resource resets for plugs", + "resett: ipi-style resource resets for timeouts", + "giveup: fall-backs to ipi-style shootdowns", + "sto: number of source timeouts", + "bz: number of stay-busy's", + "throt: number times spun in throttle", + "swack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE", + "recv: shootdown messages received", + "rtime: time spent processing messages", + "all: shootdown all-tlb messages", + "one: shootdown one-tlb messages", + "mult: interrupts that found multiple messages", + "none: interrupts that found no messages", + "retry: number of retry messages processed", + "canc: number messages canceled by retries", + "nocan: number retries that found nothing to cancel", + "reset: number of ipi-style reset requests processed", + "rcan: number messages canceled by reset requests", + "disable: number times use of the BAU was disabled", + "enable: number times use of the BAU was re-enabled" +}; + +static int __init +setup_nobau(char *arg) +{ + nobau = 1; + return 0; +} +early_param("nobau", setup_nobau); -static unsigned long uv_mmask __read_mostly; +/* base pnode in this partition */ +static int uv_base_pnode __read_mostly; static DEFINE_PER_CPU(struct ptc_stats, ptcstats); static DEFINE_PER_CPU(struct bau_control, bau_control); +static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); /* - * Determine the first node on a blade. + * Determine the first node on a uvhub. 'Nodes' are used for kernel + * memory allocation. */ -static int __init blade_to_first_node(int blade) +static int __init uvhub_to_first_node(int uvhub) { int node, b; for_each_online_node(node) { b = uv_node_to_blade_id(node); - if (blade == b) + if (uvhub == b) return node; } - return -1; /* shouldn't happen */ + return -1; } /* - * Determine the apicid of the first cpu on a blade. + * Determine the apicid of the first cpu on a uvhub. */ -static int __init blade_to_first_apicid(int blade) +static int __init uvhub_to_first_apicid(int uvhub) { int cpu; for_each_present_cpu(cpu) - if (blade == uv_cpu_to_blade_id(cpu)) + if (uvhub == uv_cpu_to_blade_id(cpu)) return per_cpu(x86_cpu_to_apicid, cpu); return -1; } @@ -67,247 +157,894 @@ static int __init blade_to_first_apicid( * clear of the Timeout bit (as well) will free the resource. No reply will * be sent (the hardware will only do one reply per message). */ -static void uv_reply_to_message(int resource, - struct bau_payload_queue_entry *msg, - struct bau_msg_status *msp) +static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp, + int do_acknowledge) { unsigned long dw; + struct bau_pq_entry *msg; - dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource); + msg = mdp->msg; + if (!msg->canceled && do_acknowledge) { + dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec; + write_mmr_sw_ack(dw); + } msg->replied_to = 1; - msg->sw_ack_vector = 0; - if (msp) - msp->seen_by.bits = 0; - uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); + msg->swack_vec = 0; } /* - * Do all the things a cpu should do for a TLB shootdown message. - * Other cpu's may come here at the same time for this message. + * Process the receipt of a RETRY message */ -static void uv_bau_process_message(struct bau_payload_queue_entry *msg, - int msg_slot, int sw_ack_slot) +static void bau_process_retry_msg(struct msg_desc *mdp, + struct bau_control *bcp) { - unsigned long this_cpu_mask; - struct bau_msg_status *msp; - int cpu; + int i; + int cancel_count = 0; + unsigned long msg_res; + unsigned long mmr = 0; + struct bau_pq_entry *msg = mdp->msg; + struct bau_pq_entry *msg2; + struct ptc_stats *stat = bcp->statp; - msp = __get_cpu_var(bau_control).msg_statuses + msg_slot; - cpu = uv_blade_processor_id(); - msg->number_of_cpus = - uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id())); - this_cpu_mask = 1UL << cpu; - if (msp->seen_by.bits & this_cpu_mask) - return; - atomic_or_long(&msp->seen_by.bits, this_cpu_mask); + stat->d_retries++; + /* + * cancel any message from msg+1 to the retry itself + */ + for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) { + if (msg2 > mdp->queue_last) + msg2 = mdp->queue_first; + if (msg2 == msg) + break; + + /* same conditions for cancellation as do_reset */ + if ((msg2->replied_to == 0) && (msg2->canceled == 0) && + (msg2->swack_vec) && ((msg2->swack_vec & + msg->swack_vec) == 0) && + (msg2->sending_cpu == msg->sending_cpu) && + (msg2->msg_type != MSG_NOOP)) { + mmr = read_mmr_sw_ack(); + msg_res = msg2->swack_vec; + /* + * This is a message retry; clear the resources held + * by the previous message only if they timed out. + * If it has not timed out we have an unexpected + * situation to report. + */ + if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { + unsigned long mr; + /* + * Is the resource timed out? + * Make everyone ignore the cancelled message. + */ + msg2->canceled = 1; + stat->d_canceled++; + cancel_count++; + mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res; + write_mmr_sw_ack(mr); + } + } + } + if (!cancel_count) + stat->d_nocanceled++; +} - if (msg->replied_to == 1) - return; +/* + * Do all the things a cpu should do for a TLB shootdown message. + * Other cpu's may come here at the same time for this message. + */ +static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, + int do_acknowledge) +{ + short socket_ack_count = 0; + short *sp; + struct atomic_short *asp; + struct ptc_stats *stat = bcp->statp; + struct bau_pq_entry *msg = mdp->msg; + struct bau_control *smaster = bcp->socket_master; + /* + * This must be a normal message, or retry of a normal message + */ if (msg->address == TLB_FLUSH_ALL) { local_flush_tlb(); - __get_cpu_var(ptcstats).alltlb++; + stat->d_alltlb++; } else { __flush_tlb_one(msg->address); - __get_cpu_var(ptcstats).onetlb++; + stat->d_onetlb++; } + stat->d_requestee++; - __get_cpu_var(ptcstats).requestee++; + /* + * One cpu on each uvhub has the additional job on a RETRY + * of releasing the resource held by the message that is + * being retried. That message is identified by sending + * cpu number. + */ + if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master) + bau_process_retry_msg(mdp, bcp); + + /* + * This is a swack message, so we have to reply to it. + * Count each responding cpu on the socket. This avoids + * pinging the count's cache line back and forth between + * the sockets. + */ + sp = &smaster->socket_acknowledge_count[mdp->msg_slot]; + asp = (struct atomic_short *)sp; + socket_ack_count = atom_asr(1, asp); + if (socket_ack_count == bcp->cpus_in_socket) { + int msg_ack_count; + /* + * Both sockets dump their completed count total into + * the message's count. + */ + smaster->socket_acknowledge_count[mdp->msg_slot] = 0; + asp = (struct atomic_short *)&msg->acknowledge_count; + msg_ack_count = atom_asr(socket_ack_count, asp); + + if (msg_ack_count == bcp->cpus_in_uvhub) { + /* + * All cpus in uvhub saw it; reply + * (unless we are in the UV2 workaround) + */ + reply_to_message(mdp, bcp, do_acknowledge); + } + } - atomic_inc_short(&msg->acknowledge_count); - if (msg->number_of_cpus == msg->acknowledge_count) - uv_reply_to_message(sw_ack_slot, msg, msp); + return; +} + +/* + * Determine the first cpu on a pnode. + */ +static int pnode_to_first_cpu(int pnode, struct bau_control *smaster) +{ + int cpu; + struct hub_and_pnode *hpp; + + for_each_present_cpu(cpu) { + hpp = &smaster->thp[cpu]; + if (pnode == hpp->pnode) + return cpu; + } + return -1; } /* - * Examine the payload queue on one distribution node to see - * which messages have not been seen, and which cpu(s) have not seen them. + * Last resort when we get a large number of destination timeouts is + * to clear resources held by a given cpu. + * Do this with IPI so that all messages in the BAU message queue + * can be identified by their nonzero swack_vec field. * - * Returns the number of cpu's that have not responded. + * This is entered for a single cpu on the uvhub. + * The sender want's this uvhub to free a specific message's + * swack resources. */ -static int uv_examine_destination(struct bau_control *bau_tablesp, int sender) +static void do_reset(void *ptr) { - struct bau_payload_queue_entry *msg; - struct bau_msg_status *msp; - int count = 0; int i; - int j; + struct bau_control *bcp = &per_cpu(bau_control, smp_processor_id()); + struct reset_args *rap = (struct reset_args *)ptr; + struct bau_pq_entry *msg; + struct ptc_stats *stat = bcp->statp; - for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE; - msg++, i++) { - if ((msg->sending_cpu == sender) && (!msg->replied_to)) { - msp = bau_tablesp->msg_statuses + i; - printk(KERN_DEBUG - "blade %d: address:%#lx %d of %d, not cpu(s): ", - i, msg->address, msg->acknowledge_count, - msg->number_of_cpus); - for (j = 0; j < msg->number_of_cpus; j++) { - if (!((1L << j) & msp->seen_by.bits)) { - count++; - printk("%d ", j); - } + stat->d_resets++; + /* + * We're looking for the given sender, and + * will free its swack resource. + * If all cpu's finally responded after the timeout, its + * message 'replied_to' was set. + */ + for (msg = bcp->queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) { + unsigned long msg_res; + /* do_reset: same conditions for cancellation as + bau_process_retry_msg() */ + if ((msg->replied_to == 0) && + (msg->canceled == 0) && + (msg->sending_cpu == rap->sender) && + (msg->swack_vec) && + (msg->msg_type != MSG_NOOP)) { + unsigned long mmr; + unsigned long mr; + /* + * make everyone else ignore this message + */ + msg->canceled = 1; + /* + * only reset the resource if it is still pending + */ + mmr = read_mmr_sw_ack(); + msg_res = msg->swack_vec; + mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res; + if (mmr & msg_res) { + stat->d_rcanceled++; + write_mmr_sw_ack(mr); } - printk("\n"); } } - return count; + return; } /* - * Examine the payload queue on all the distribution nodes to see - * which messages have not been seen, and which cpu(s) have not seen them. - * - * Returns the number of cpu's that have not responded. + * Use IPI to get all target uvhubs to release resources held by + * a given sending cpu number. */ -static int uv_examine_destinations(struct bau_target_nodemask *distribution) +static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp) { - int sender; - int i; - int count = 0; - - sender = smp_processor_id(); - for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) { - if (!bau_node_isset(i, distribution)) + int pnode; + int apnode; + int maskbits; + int sender = bcp->cpu; + cpumask_t *mask = bcp->uvhub_master->cpumask; + struct bau_control *smaster = bcp->socket_master; + struct reset_args reset_args; + + reset_args.sender = sender; + cpus_clear(*mask); + /* find a single cpu for each uvhub in this distribution mask */ + maskbits = sizeof(struct pnmask) * BITSPERBYTE; + /* each bit is a pnode relative to the partition base pnode */ + for (pnode = 0; pnode < maskbits; pnode++) { + int cpu; + if (!bau_uvhub_isset(pnode, distribution)) continue; - count += uv_examine_destination(uv_bau_table_bases[i], sender); + apnode = pnode + bcp->partition_base_pnode; + cpu = pnode_to_first_cpu(apnode, smaster); + cpu_set(cpu, *mask); } - return count; + + /* IPI all cpus; preemption is already disabled */ + smp_call_function_many(mask, do_reset, (void *)&reset_args, 1); + return; +} + +static inline unsigned long cycles_2_us(unsigned long long cyc) +{ + unsigned long long ns; + unsigned long us; + int cpu = smp_processor_id(); + + ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR; + us = ns / 1000; + return us; } /* - * wait for completion of a broadcast message - * - * return COMPLETE, RETRY or GIVEUP + * wait for all cpus on this hub to finish their sends and go quiet + * leaves uvhub_quiesce set so that no new broadcasts are started by + * bau_flush_send_and_wait() */ -static int uv_wait_completion(struct bau_desc *bau_desc, - unsigned long mmr_offset, int right_shift) +static inline void quiesce_local_uvhub(struct bau_control *hmaster) +{ + atom_asr(1, (struct atomic_short *)&hmaster->uvhub_quiesce); +} + +/* + * mark this quiet-requestor as done + */ +static inline void end_uvhub_quiesce(struct bau_control *hmaster) +{ + atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce); +} + +static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift) { - int exams = 0; - long destination_timeouts = 0; - long source_timeouts = 0; unsigned long descriptor_status; - while ((descriptor_status = (((unsigned long) - uv_read_local_mmr(mmr_offset) >> - right_shift) & UV_ACT_STATUS_MASK)) != - DESC_STATUS_IDLE) { - if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { - source_timeouts++; - if (source_timeouts > SOURCE_TIMEOUT_LIMIT) - source_timeouts = 0; - __get_cpu_var(ptcstats).s_retry++; - return FLUSH_RETRY; - } + descriptor_status = uv_read_local_mmr(mmr_offset); + descriptor_status >>= right_shift; + descriptor_status &= UV_ACT_STATUS_MASK; + return descriptor_status; +} + +/* + * Wait for completion of a broadcast software ack message + * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP + */ +static int uv1_wait_completion(struct bau_desc *bau_desc, + unsigned long mmr_offset, int right_shift, + struct bau_control *bcp, long try) +{ + unsigned long descriptor_status; + cycles_t ttm; + struct ptc_stats *stat = bcp->statp; + + descriptor_status = uv1_read_status(mmr_offset, right_shift); + /* spin on the status MMR, waiting for it to go idle */ + while ((descriptor_status != DS_IDLE)) { /* - * spin here looking for progress at the destinations + * Our software ack messages may be blocked because + * there are no swack resources available. As long + * as none of them has timed out hardware will NACK + * our message and its state will stay IDLE. */ - if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) { - destination_timeouts++; - if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) { - /* - * returns number of cpus not responding - */ - if (uv_examine_destinations - (&bau_desc->distribution) == 0) { - __get_cpu_var(ptcstats).d_retry++; - return FLUSH_RETRY; + if (descriptor_status == DS_SOURCE_TIMEOUT) { + stat->s_stimeout++; + return FLUSH_GIVEUP; + } else if (descriptor_status == DS_DESTINATION_TIMEOUT) { + stat->s_dtimeout++; + ttm = get_cycles(); + + /* + * Our retries may be blocked by all destination + * swack resources being consumed, and a timeout + * pending. In that case hardware returns the + * ERROR that looks like a destination timeout. + */ + if (cycles_2_us(ttm - bcp->send_message) < timeout_us) { + bcp->conseccompletes = 0; + return FLUSH_RETRY_PLUGGED; + } + + bcp->conseccompletes = 0; + return FLUSH_RETRY_TIMEOUT; + } else { + /* + * descriptor_status is still BUSY + */ + cpu_relax(); + } + descriptor_status = uv1_read_status(mmr_offset, right_shift); + } + bcp->conseccompletes++; + return FLUSH_COMPLETE; +} + +/* + * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register. + */ +static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc) +{ + unsigned long descriptor_status; + unsigned long descriptor_status2; + + descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK); + descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL; + descriptor_status = (descriptor_status << 1) | descriptor_status2; + return descriptor_status; +} + +/* + * Return whether the status of the descriptor that is normally used for this + * cpu (the one indexed by its hub-relative cpu number) is busy. + * The status of the original 32 descriptors is always reflected in the 64 + * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0. + * The bit provided by the activation_status_2 register is irrelevant to + * the status if it is only being tested for busy or not busy. + */ +int normal_busy(struct bau_control *bcp) +{ + int cpu = bcp->uvhub_cpu; + int mmr_offset; + int right_shift; + + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; + right_shift = cpu * UV_ACT_STATUS_SIZE; + return (((((read_lmmr(mmr_offset) >> right_shift) & + UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY); +} + +/* + * Entered when a bau descriptor has gone into a permanent busy wait because + * of a hardware bug. + * Workaround the bug. + */ +int handle_uv2_busy(struct bau_control *bcp) +{ + int busy_one = bcp->using_desc; + int normal = bcp->uvhub_cpu; + int selected = -1; + int i; + unsigned long descriptor_status; + unsigned long status; + int mmr_offset; + struct bau_desc *bau_desc_old; + struct bau_desc *bau_desc_new; + struct bau_control *hmaster = bcp->uvhub_master; + struct ptc_stats *stat = bcp->statp; + cycles_t ttm; + + stat->s_uv2_wars++; + spin_lock(&hmaster->uvhub_lock); + /* try for the original first */ + if (busy_one != normal) { + if (!normal_busy(bcp)) + selected = normal; + } + if (selected < 0) { + /* can't use the normal, select an alternate */ + mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; + descriptor_status = read_lmmr(mmr_offset); + + /* scan available descriptors 32-63 */ + for (i = 0; i < UV_CPUS_PER_AS; i++) { + if ((hmaster->inuse_map & (1 << i)) == 0) { + status = ((descriptor_status >> + (i * UV_ACT_STATUS_SIZE)) & + UV_ACT_STATUS_MASK) << 1; + if (status != UV2H_DESC_BUSY) { + selected = i + UV_CPUS_PER_AS; + break; } - exams++; - if (exams >= uv_bau_retry_limit) { - printk(KERN_DEBUG - "uv_flush_tlb_others"); - printk("giving up on cpu %d\n", - smp_processor_id()); - return FLUSH_GIVEUP; + } + } + } + + if (busy_one != normal) + /* mark the busy alternate as not in-use */ + hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS)); + + if (selected >= 0) { + /* switch to the selected descriptor */ + if (selected != normal) { + /* set the selected alternate as in-use */ + hmaster->inuse_map |= + (1 << (selected - UV_CPUS_PER_AS)); + if (selected > stat->s_uv2_wars_hw) + stat->s_uv2_wars_hw = selected; + } + bau_desc_old = bcp->descriptor_base; + bau_desc_old += (ITEMS_PER_DESC * busy_one); + bcp->using_desc = selected; + bau_desc_new = bcp->descriptor_base; + bau_desc_new += (ITEMS_PER_DESC * selected); + *bau_desc_new = *bau_desc_old; + } else { + /* + * All are busy. Wait for the normal one for this cpu to + * free up. + */ + stat->s_uv2_war_waits++; + spin_unlock(&hmaster->uvhub_lock); + ttm = get_cycles(); + do { + cpu_relax(); + } while (normal_busy(bcp)); + spin_lock(&hmaster->uvhub_lock); + /* switch to the original descriptor */ + bcp->using_desc = normal; + bau_desc_old = bcp->descriptor_base; + bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc); + bcp->using_desc = (ITEMS_PER_DESC * normal); + bau_desc_new = bcp->descriptor_base; + bau_desc_new += (ITEMS_PER_DESC * normal); + *bau_desc_new = *bau_desc_old; /* copy the entire descriptor */ + } + spin_unlock(&hmaster->uvhub_lock); + return FLUSH_RETRY_BUSYBUG; +} + +static int uv2_wait_completion(struct bau_desc *bau_desc, + unsigned long mmr_offset, int right_shift, + struct bau_control *bcp, long try) +{ + unsigned long descriptor_stat; + cycles_t ttm; + int desc = bcp->using_desc; + long busy_reps = 0; + struct ptc_stats *stat = bcp->statp; + + descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc); + + /* spin on the status MMR, waiting for it to go idle */ + while (descriptor_stat != UV2H_DESC_IDLE) { + /* + * Our software ack messages may be blocked because + * there are no swack resources available. As long + * as none of them has timed out hardware will NACK + * our message and its state will stay IDLE. + */ + if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) || + (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) { + stat->s_stimeout++; + return FLUSH_GIVEUP; + } else if (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) { + stat->s_strongnacks++; + bcp->conseccompletes = 0; + return FLUSH_GIVEUP; + } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) { + stat->s_dtimeout++; + bcp->conseccompletes = 0; + return FLUSH_RETRY_TIMEOUT; + } else { + busy_reps++; + if (busy_reps > 1000000) { + /* not to hammer on the clock */ + busy_reps = 0; + ttm = get_cycles(); + if ((ttm - bcp->send_message) > + (bcp->clocks_per_100_usec)) { + return handle_uv2_busy(bcp); } - /* - * delays can hang the simulator - udelay(1000); - */ - destination_timeouts = 0; } + /* + * descriptor_stat is still BUSY + */ + cpu_relax(); } - cpu_relax(); + descriptor_stat = uv2_read_status(mmr_offset, right_shift, + desc); } + bcp->conseccompletes++; return FLUSH_COMPLETE; } -/** - * uv_flush_send_and_wait - * - * Send a broadcast and wait for a broadcast message to complete. - * - * The flush_mask contains the cpus the broadcast was sent to. - * - * Returns NULL if all remote flushing was done. The mask is zeroed. - * Returns @flush_mask if some remote flushing remains to be done. The - * mask will have some bits still set. +/* + * There are 2 status registers; each and array[32] of 2 bits. Set up for + * which register to read and position in that register based on cpu in + * current hub. */ -const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode, - struct bau_desc *bau_desc, - struct cpumask *flush_mask) +static int wait_completion(struct bau_desc *bau_desc, + struct bau_control *bcp, long try) { - int completion_status = 0; int right_shift; - int tries = 0; - int pnode; - int bit; unsigned long mmr_offset; - unsigned long index; - cycles_t time1; - cycles_t time2; + int desc = bcp->using_desc; - if (cpu < UV_CPUS_PER_ACT_STATUS) { + if (desc < UV_CPUS_PER_AS) { mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; - right_shift = cpu * UV_ACT_STATUS_SIZE; + right_shift = desc * UV_ACT_STATUS_SIZE; } else { mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1; - right_shift = - ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE); + right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE); } + + if (bcp->uvhub_version == 1) + return uv1_wait_completion(bau_desc, mmr_offset, right_shift, + bcp, try); + else + return uv2_wait_completion(bau_desc, mmr_offset, right_shift, + bcp, try); +} + +static inline cycles_t sec_2_cycles(unsigned long sec) +{ + unsigned long ns; + cycles_t cyc; + + ns = sec * 1000000000; + cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); + return cyc; +} + +/* + * Our retries are blocked by all destination sw ack resources being + * in use, and a timeout is pending. In that case hardware immediately + * returns the ERROR that looks like a destination timeout. + */ +static void destination_plugged(struct bau_desc *bau_desc, + struct bau_control *bcp, + struct bau_control *hmaster, struct ptc_stats *stat) +{ + udelay(bcp->plugged_delay); + bcp->plugged_tries++; + + if (bcp->plugged_tries >= bcp->plugsb4reset) { + bcp->plugged_tries = 0; + + quiesce_local_uvhub(hmaster); + + spin_lock(&hmaster->queue_lock); + reset_with_ipi(&bau_desc->distribution, bcp); + spin_unlock(&hmaster->queue_lock); + + end_uvhub_quiesce(hmaster); + + bcp->ipi_attempts++; + stat->s_resets_plug++; + } +} + +static void destination_timeout(struct bau_desc *bau_desc, + struct bau_control *bcp, struct bau_control *hmaster, + struct ptc_stats *stat) +{ + hmaster->max_concurr = 1; + bcp->timeout_tries++; + if (bcp->timeout_tries >= bcp->timeoutsb4reset) { + bcp->timeout_tries = 0; + + quiesce_local_uvhub(hmaster); + + spin_lock(&hmaster->queue_lock); + reset_with_ipi(&bau_desc->distribution, bcp); + spin_unlock(&hmaster->queue_lock); + + end_uvhub_quiesce(hmaster); + + bcp->ipi_attempts++; + stat->s_resets_timeout++; + } +} + +/* + * Completions are taking a very long time due to a congested numalink + * network. + */ +static void disable_for_congestion(struct bau_control *bcp, + struct ptc_stats *stat) +{ + /* let only one cpu do this disabling */ + spin_lock(&disable_lock); + + if (!baudisabled && bcp->period_requests && + ((bcp->period_time / bcp->period_requests) > congested_cycles)) { + int tcpu; + struct bau_control *tbcp; + /* it becomes this cpu's job to turn on the use of the + BAU again */ + baudisabled = 1; + bcp->set_bau_off = 1; + bcp->set_bau_on_time = get_cycles(); + bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period); + stat->s_bau_disabled++; + for_each_present_cpu(tcpu) { + tbcp = &per_cpu(bau_control, tcpu); + tbcp->baudisabled = 1; + } + } + + spin_unlock(&disable_lock); +} + +static void count_max_concurr(int stat, struct bau_control *bcp, + struct bau_control *hmaster) +{ + bcp->plugged_tries = 0; + bcp->timeout_tries = 0; + if (stat != FLUSH_COMPLETE) + return; + if (bcp->conseccompletes <= bcp->complete_threshold) + return; + if (hmaster->max_concurr >= hmaster->max_concurr_const) + return; + hmaster->max_concurr++; +} + +static void record_send_stats(cycles_t time1, cycles_t time2, + struct bau_control *bcp, struct ptc_stats *stat, + int completion_status, int try) +{ + cycles_t elapsed; + + if (time2 > time1) { + elapsed = time2 - time1; + stat->s_time += elapsed; + + if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { + bcp->period_requests++; + bcp->period_time += elapsed; + if ((elapsed > congested_cycles) && + (bcp->period_requests > bcp->cong_reps)) + disable_for_congestion(bcp, stat); + } + } else + stat->s_requestor--; + + if (completion_status == FLUSH_COMPLETE && try > 1) + stat->s_retriesok++; + else if (completion_status == FLUSH_GIVEUP) + stat->s_giveup++; +} + +/* + * Because of a uv1 hardware bug only a limited number of concurrent + * requests can be made. + */ +static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat) +{ + spinlock_t *lock = &hmaster->uvhub_lock; + atomic_t *v; + + v = &hmaster->active_descriptor_count; + if (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)) { + stat->s_throttles++; + do { + cpu_relax(); + } while (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)); + } +} + +/* + * Handle the completion status of a message send. + */ +static void handle_cmplt(int completion_status, struct bau_desc *bau_desc, + struct bau_control *bcp, struct bau_control *hmaster, + struct ptc_stats *stat) +{ + if (completion_status == FLUSH_RETRY_PLUGGED) + destination_plugged(bau_desc, bcp, hmaster, stat); + else if (completion_status == FLUSH_RETRY_TIMEOUT) + destination_timeout(bau_desc, bcp, hmaster, stat); +} + +/* + * Send a broadcast and wait for it to complete. + * + * The flush_mask contains the cpus the broadcast is to be sent to including + * cpus that are on the local uvhub. + * + * Returns 0 if all flushing represented in the mask was done. + * Returns 1 if it gives up entirely and the original cpu mask is to be + * returned to the kernel. + */ +int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp) +{ + int seq_number = 0; + int completion_stat = 0; + int uv1 = 0; + long try = 0; + unsigned long index; + cycles_t time1; + cycles_t time2; + struct ptc_stats *stat = bcp->statp; + struct bau_control *hmaster = bcp->uvhub_master; + struct uv1_bau_msg_header *uv1_hdr = NULL; + struct uv2_bau_msg_header *uv2_hdr = NULL; + struct bau_desc *bau_desc; + + if (bcp->uvhub_version == 1) + uv1_throttle(hmaster, stat); + + while (hmaster->uvhub_quiesce) + cpu_relax(); + time1 = get_cycles(); do { - tries++; - index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | - cpu; - uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); - completion_status = uv_wait_completion(bau_desc, mmr_offset, - right_shift); - } while (completion_status == FLUSH_RETRY); + bau_desc = bcp->descriptor_base; + bau_desc += (ITEMS_PER_DESC * bcp->using_desc); + if (bcp->uvhub_version == 1) { + uv1 = 1; + uv1_hdr = &bau_desc->header.uv1_hdr; + } else + uv2_hdr = &bau_desc->header.uv2_hdr; + if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) { + if (uv1) + uv1_hdr->msg_type = MSG_REGULAR; + else + uv2_hdr->msg_type = MSG_REGULAR; + seq_number = bcp->message_number++; + } else { + if (uv1) + uv1_hdr->msg_type = MSG_RETRY; + else + uv2_hdr->msg_type = MSG_RETRY; + stat->s_retry_messages++; + } + + if (uv1) + uv1_hdr->sequence = seq_number; + else + uv2_hdr->sequence = seq_number; + index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc; + bcp->send_message = get_cycles(); + + write_mmr_activation(index); + + try++; + completion_stat = wait_completion(bau_desc, bcp, try); + /* UV2: wait_completion() may change the bcp->using_desc */ + + handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat); + + if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { + bcp->ipi_attempts = 0; + completion_stat = FLUSH_GIVEUP; + break; + } + cpu_relax(); + } while ((completion_stat == FLUSH_RETRY_PLUGGED) || + (completion_stat == FLUSH_RETRY_BUSYBUG) || + (completion_stat == FLUSH_RETRY_TIMEOUT)); + time2 = get_cycles(); - __get_cpu_var(ptcstats).sflush += (time2 - time1); - if (tries > 1) - __get_cpu_var(ptcstats).retriesok++; - if (completion_status == FLUSH_GIVEUP) { - /* - * Cause the caller to do an IPI-style TLB shootdown on - * the cpu's, all of which are still in the mask. - */ - __get_cpu_var(ptcstats).ptc_i++; - return flush_mask; - } + count_max_concurr(completion_stat, bcp, hmaster); - /* - * Success, so clear the remote cpu's from the mask so we don't - * use the IPI method of shootdown on them. - */ - for_each_cpu(bit, flush_mask) { - pnode = uv_cpu_to_pnode(bit); - if (pnode == this_pnode) - continue; - cpumask_clear_cpu(bit, flush_mask); + while (hmaster->uvhub_quiesce) + cpu_relax(); + + atomic_dec(&hmaster->active_descriptor_count); + + record_send_stats(time1, time2, bcp, stat, completion_stat, try); + + if (completion_stat == FLUSH_GIVEUP) + /* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */ + return 1; + return 0; +} + +/* + * The BAU is disabled. When the disabled time period has expired, the cpu + * that disabled it must re-enable it. + * Return 0 if it is re-enabled for all cpus. + */ +static int check_enable(struct bau_control *bcp, struct ptc_stats *stat) +{ + int tcpu; + struct bau_control *tbcp; + + if (bcp->set_bau_off) { + if (get_cycles() >= bcp->set_bau_on_time) { + stat->s_bau_reenabled++; + baudisabled = 0; + for_each_present_cpu(tcpu) { + tbcp = &per_cpu(bau_control, tcpu); + tbcp->baudisabled = 0; + tbcp->period_requests = 0; + tbcp->period_time = 0; + } + return 0; + } } - if (!cpumask_empty(flush_mask)) - return flush_mask; - return NULL; + return -1; } -static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); +static void record_send_statistics(struct ptc_stats *stat, int locals, int hubs, + int remotes, struct bau_desc *bau_desc) +{ + stat->s_requestor++; + stat->s_ntargcpu += remotes + locals; + stat->s_ntargremotes += remotes; + stat->s_ntarglocals += locals; + + /* uvhub statistics */ + hubs = bau_uvhub_weight(&bau_desc->distribution); + if (locals) { + stat->s_ntarglocaluvhub++; + stat->s_ntargremoteuvhub += (hubs - 1); + } else + stat->s_ntargremoteuvhub += hubs; + + stat->s_ntarguvhub += hubs; + + if (hubs >= 16) + stat->s_ntarguvhub16++; + else if (hubs >= 8) + stat->s_ntarguvhub8++; + else if (hubs >= 4) + stat->s_ntarguvhub4++; + else if (hubs >= 2) + stat->s_ntarguvhub2++; + else + stat->s_ntarguvhub1++; +} + +/* + * Translate a cpu mask to the uvhub distribution mask in the BAU + * activation descriptor. + */ +static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp, + struct bau_desc *bau_desc, int *localsp, int *remotesp) +{ + int cpu; + int pnode; + int cnt = 0; + struct hub_and_pnode *hpp; -/** - * uv_flush_tlb_others - globally purge translation cache of a virtual - * address or all TLB's + for_each_cpu(cpu, flush_mask) { + /* + * The distribution vector is a bit map of pnodes, relative + * to the partition base pnode (and the partition base nasid + * in the header). + * Translate cpu to pnode and hub using a local memory array. + */ + hpp = &bcp->socket_master->thp[cpu]; + pnode = hpp->pnode - bcp->partition_base_pnode; + bau_uvhub_set(pnode, &bau_desc->distribution); + cnt++; + if (hpp->uvhub == bcp->uvhub) + (*localsp)++; + else + (*remotesp)++; + } + if (!cnt) + return 1; + return 0; +} + +/* + * globally purge translation cache of a virtual address or all TLB's * @cpumask: mask of all cpu's in which the address is to be removed * @mm: mm_struct containing virtual address range * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) @@ -321,8 +1058,8 @@ static DEFINE_PER_CPU(cpumask_var_t, uv_ * The caller has derived the cpumask from the mm_struct. This function * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) * - * The cpumask is converted into a nodemask of the nodes containing - * the cpus. + * The cpumask is converted into a uvhubmask of the uvhubs containing + * those cpus. * * Note that this function should be called with preemption disabled. * @@ -330,56 +1067,134 @@ static DEFINE_PER_CPU(cpumask_var_t, uv_ * Returns pointer to cpumask if some remote flushing remains to be * done. The returned pointer is valid till preemption is re-enabled. */ -const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long va, unsigned int cpu) +const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, + struct mm_struct *mm, unsigned long va, + unsigned int cpu) +{ + int locals = 0; + int remotes = 0; + int hubs = 0; + struct bau_desc *bau_desc; + struct cpumask *flush_mask; + struct ptc_stats *stat; + struct bau_control *bcp; + + /* kernel was booted 'nobau' */ + if (nobau) + return cpumask; + + bcp = &per_cpu(bau_control, cpu); + stat = bcp->statp; + + /* bau was disabled due to slow response */ + if (bcp->baudisabled) { + if (check_enable(bcp, stat)) + return cpumask; + } + + /* + * Each sending cpu has a per-cpu mask which it fills from the caller's + * cpu mask. All cpus are converted to uvhubs and copied to the + * activation descriptor. + */ + flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); + /* don't actually do a shootdown of the local cpu */ + cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); + + if (cpu_isset(cpu, *cpumask)) + stat->s_ntargself++; + + bau_desc = bcp->descriptor_base; + bau_desc += (ITEMS_PER_DESC * bcp->using_desc); + bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); + if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes)) + return NULL; + + record_send_statistics(stat, locals, hubs, remotes, bau_desc); + + bau_desc->payload.address = va; + bau_desc->payload.sending_cpu = cpu; + /* + * uv_flush_send_and_wait returns 0 if all cpu's were messaged, + * or 1 if it gave up and the original cpumask should be returned. + */ + if (!uv_flush_send_and_wait(flush_mask, bcp)) + return NULL; + else + return cpumask; +} + +/* + * Search the message queue for any 'other' message with the same software + * acknowledge resource bit vector. + */ +struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg, + struct bau_control *bcp, unsigned char swack_vec) { - struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask); - int i; - int bit; - int pnode; - int uv_cpu; - int this_pnode; - int locals = 0; - struct bau_desc *bau_desc; + struct bau_pq_entry *msg_next = msg + 1; - cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); + if (msg_next > bcp->queue_last) + msg_next = bcp->queue_first; + while ((msg_next->swack_vec != 0) && (msg_next != msg)) { + if (msg_next->swack_vec == swack_vec) + return msg_next; + msg_next++; + if (msg_next > bcp->queue_last) + msg_next = bcp->queue_first; + } + return NULL; +} - uv_cpu = uv_blade_processor_id(); - this_pnode = uv_hub_info->pnode; - bau_desc = __get_cpu_var(bau_control).descriptor_base; - bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu; - - bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); - - i = 0; - for_each_cpu(bit, flush_mask) { - pnode = uv_cpu_to_pnode(bit); - BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1)); - if (pnode == this_pnode) { - locals++; - continue; - } - bau_node_set(pnode - uv_partition_base_pnode, - &bau_desc->distribution); - i++; +/* + * UV2 needs to work around a bug in which an arriving message has not + * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register. + * Such a message must be ignored. + */ +void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp) +{ + unsigned long mmr_image; + unsigned char swack_vec; + struct bau_pq_entry *msg = mdp->msg; + struct bau_pq_entry *other_msg; + + mmr_image = read_mmr_sw_ack(); + swack_vec = msg->swack_vec; + + if ((swack_vec & mmr_image) == 0) { + /* + * This message was assigned a swack resource, but no + * reserved acknowlegment is pending. + * The bug has prevented this message from setting the MMR. + * And no other message has used the same sw_ack resource. + * Do the requested shootdown but do not reply to the msg. + * (the 0 means make no acknowledge) + */ + bau_process_message(mdp, bcp, 0); + return; } - if (i == 0) { + + /* + * Some message has set the MMR 'pending' bit; it might have been + * another message. Look for that message. + */ + other_msg = find_another_by_swack(msg, bcp, msg->swack_vec); + if (other_msg) { + /* There is another. Do not ack the current one. */ + bau_process_message(mdp, bcp, 0); /* - * no off_node flushing; return status for local node + * Let the natural processing of that message acknowledge + * it. Don't get the processing of sw_ack's out of order. */ - if (locals) - return flush_mask; - else - return NULL; + return; } - __get_cpu_var(ptcstats).requestor++; - __get_cpu_var(ptcstats).ntargeted += i; - bau_desc->payload.address = va; - bau_desc->payload.sending_cpu = cpu; + /* + * There is no other message using this sw_ack, so it is safe to + * acknowledge it. + */ + bau_process_message(mdp, bcp, 1); - return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask); + return; } /* @@ -388,128 +1203,113 @@ const struct cpumask *uv_flush_tlb_other * * We received a broadcast assist message. * - * Interrupts may have been disabled; this interrupt could represent + * Interrupts are disabled; this interrupt could represent * the receipt of several messages. * - * All cores/threads on this node get this interrupt. - * The last one to see it does the s/w ack. + * All cores/threads on this hub get this interrupt. + * The last one to see it does the software ack. * (the resource will not be freed until noninterruptable cpus see this - * interrupt; hardware will timeout the s/w ack and reply ERROR) + * interrupt; hardware may timeout the s/w ack and reply ERROR) */ void uv_bau_message_interrupt(struct pt_regs *regs) { - struct bau_payload_queue_entry *va_queue_first; - struct bau_payload_queue_entry *va_queue_last; - struct bau_payload_queue_entry *msg; - struct pt_regs *old_regs = set_irq_regs(regs); - cycles_t time1; - cycles_t time2; - int msg_slot; - int sw_ack_slot; - int fw; int count = 0; - unsigned long local_pnode; + cycles_t time_start; + struct bau_pq_entry *msg; + struct bau_control *bcp; + struct ptc_stats *stat; + struct msg_desc msgdesc; ack_APIC_irq(); - exit_idle(); - irq_enter(); + time_start = get_cycles(); - time1 = get_cycles(); - - local_pnode = uv_blade_to_pnode(uv_numa_blade_id()); + bcp = &per_cpu(bau_control, smp_processor_id()); + stat = bcp->statp; - va_queue_first = __get_cpu_var(bau_control).va_queue_first; - va_queue_last = __get_cpu_var(bau_control).va_queue_last; + msgdesc.queue_first = bcp->queue_first; + msgdesc.queue_last = bcp->queue_last; - msg = __get_cpu_var(bau_control).bau_msg_head; - while (msg->sw_ack_vector) { + msg = bcp->bau_msg_head; + while (msg->swack_vec) { count++; - fw = msg->sw_ack_vector; - msg_slot = msg - va_queue_first; - sw_ack_slot = ffs(fw) - 1; - uv_bau_process_message(msg, msg_slot, sw_ack_slot); + msgdesc.msg_slot = msg - msgdesc.queue_first; + msgdesc.msg = msg; + if (bcp->uvhub_version == 2) + process_uv2_message(&msgdesc, bcp); + else + bau_process_message(&msgdesc, bcp, 1); msg++; - if (msg > va_queue_last) - msg = va_queue_first; - __get_cpu_var(bau_control).bau_msg_head = msg; + if (msg > msgdesc.queue_last) + msg = msgdesc.queue_first; + bcp->bau_msg_head = msg; } + stat->d_time += (get_cycles() - time_start); if (!count) - __get_cpu_var(ptcstats).nomsg++; + stat->d_nomsg++; else if (count > 1) - __get_cpu_var(ptcstats).multmsg++; - - time2 = get_cycles(); - __get_cpu_var(ptcstats).dflush += (time2 - time1); - - irq_exit(); - set_irq_regs(old_regs); + stat->d_multmsg++; } /* - * uv_enable_timeouts - * - * Each target blade (i.e. blades that have cpu's) needs to have + * Each target uvhub (i.e. a uvhub that has cpu's) needs to have * shootdown message timeouts enabled. The timeout does not cause * an interrupt, but causes an error message to be returned to * the sender. */ -static void uv_enable_timeouts(void) +static void __init enable_timeouts(void) { - int blade; - int nblades; + int uvhub; + int nuvhubs; int pnode; unsigned long mmr_image; - nblades = uv_num_possible_blades(); + nuvhubs = uv_num_possible_blades(); - for (blade = 0; blade < nblades; blade++) { - if (!uv_blade_nr_possible_cpus(blade)) + for (uvhub = 0; uvhub < nuvhubs; uvhub++) { + if (!uv_blade_nr_possible_cpus(uvhub)) continue; - pnode = uv_blade_to_pnode(blade); - mmr_image = - uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL); + pnode = uv_blade_to_pnode(uvhub); + mmr_image = read_mmr_misc_control(pnode); /* * Set the timeout period and then lock it in, in three * steps; captures and locks in the period. * * To program the period, the SOFT_ACK_MODE must be off. */ - mmr_image &= ~((unsigned long)1 << - UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT); - uv_write_global_mmr64 - (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); + mmr_image &= ~(1L << SOFTACK_MSHIFT); + write_mmr_misc_control(pnode, mmr_image); /* * Set the 4-bit period. */ - mmr_image &= ~((unsigned long)0xf << - UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT); - mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD << - UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT); - uv_write_global_mmr64 - (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); + mmr_image &= ~((unsigned long)0xf << SOFTACK_PSHIFT); + mmr_image |= (SOFTACK_TIMEOUT_PERIOD << SOFTACK_PSHIFT); + write_mmr_misc_control(pnode, mmr_image); /* + * UV1: * Subsequent reversals of the timebase bit (3) cause an * immediate timeout of one or all INTD resources as * indicated in bits 2:0 (7 causes all of them to timeout). */ - mmr_image |= ((unsigned long)1 << - UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT); - uv_write_global_mmr64 - (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); + mmr_image |= (1L << SOFTACK_MSHIFT); + if (is_uv2_hub()) { + mmr_image &= ~(1L << UV2_LEG_SHFT); + mmr_image |= (1L << UV2_EXT_SHFT); + } + write_mmr_misc_control(pnode, mmr_image); } } -static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset) +static void *ptc_seq_start(struct seq_file *file, loff_t *offset) { if (*offset < num_possible_cpus()) return offset; return NULL; } -static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) +static void *ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) { (*offset)++; if (*offset < num_possible_cpus()) @@ -517,120 +1317,297 @@ static void *uv_ptc_seq_next(struct seq_ return NULL; } -static void uv_ptc_seq_stop(struct seq_file *file, void *data) +static void ptc_seq_stop(struct seq_file *file, void *data) { } +static inline unsigned long long usec_2_cycles(unsigned long microsec) +{ + unsigned long ns; + unsigned long long cyc; + + ns = microsec * 1000; + cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); + return cyc; +} + /* - * Display the statistics thru /proc - * data points to the cpu number + * Display the statistics thru /proc/sgi_uv/ptc_statistics + * 'data' points to the cpu number + * Note: see the descriptions in stat_description[]. */ -static int uv_ptc_seq_show(struct seq_file *file, void *data) +static int ptc_seq_show(struct seq_file *file, void *data) { struct ptc_stats *stat; int cpu; cpu = *(loff_t *)data; - if (!cpu) { seq_printf(file, - "# cpu requestor requestee one all sretry dretry ptc_i "); + "# cpu sent stime self locals remotes ncpus localhub "); + seq_printf(file, + "remotehub numuvhubs numuvhubs16 numuvhubs8 "); + seq_printf(file, + "numuvhubs4 numuvhubs2 numuvhubs1 dto snacks retries rok "); + seq_printf(file, + "resetp resett giveup sto bz throt swack recv rtime "); seq_printf(file, - "sw_ack sflush dflush sok dnomsg dmult starget\n"); + "all one mult none retry canc nocan reset rcan "); + seq_printf(file, + "disable enable wars warshw warwaits\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { stat = &per_cpu(ptcstats, cpu); - seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ", - cpu, stat->requestor, - stat->requestee, stat->onetlb, stat->alltlb, - stat->s_retry, stat->d_retry, stat->ptc_i); - seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n", - uv_read_global_mmr64(uv_cpu_to_pnode(cpu), - UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), - stat->sflush, stat->dflush, - stat->retriesok, stat->nomsg, - stat->multmsg, stat->ntargeted); - } + /* source side statistics */ + seq_printf(file, + "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + cpu, stat->s_requestor, cycles_2_us(stat->s_time), + stat->s_ntargself, stat->s_ntarglocals, + stat->s_ntargremotes, stat->s_ntargcpu, + stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, + stat->s_ntarguvhub, stat->s_ntarguvhub16); + seq_printf(file, "%ld %ld %ld %ld %ld %ld ", + stat->s_ntarguvhub8, stat->s_ntarguvhub4, + stat->s_ntarguvhub2, stat->s_ntarguvhub1, + stat->s_dtimeout, stat->s_strongnacks); + seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", + stat->s_retry_messages, stat->s_retriesok, + stat->s_resets_plug, stat->s_resets_timeout, + stat->s_giveup, stat->s_stimeout, + stat->s_busy, stat->s_throttles); + /* destination side statistics */ + seq_printf(file, + "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)), + stat->d_requestee, cycles_2_us(stat->d_time), + stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, + stat->d_nomsg, stat->d_retries, stat->d_canceled, + stat->d_nocanceled, stat->d_resets, + stat->d_rcanceled); + seq_printf(file, "%ld %ld %ld %ld %ld\n", + stat->s_bau_disabled, stat->s_bau_reenabled, + stat->s_uv2_wars, stat->s_uv2_wars_hw, + stat->s_uv2_war_waits); + } return 0; } /* + * Display the tunables thru debugfs + */ +static ssize_t tunables_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + char *buf; + int ret; + + buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", + "max_concur plugged_delay plugsb4reset", + "timeoutsb4reset ipi_reset_limit complete_threshold", + "congested_response_us congested_reps congested_period", + max_concurr, plugged_delay, plugsb4reset, + timeoutsb4reset, ipi_reset_limit, complete_threshold, + congested_respns_us, congested_reps, congested_period); + + if (!buf) + return -ENOMEM; + + ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); + kfree(buf); + return ret; +} + +/* + * handle a write to /proc/sgi_uv/ptc_statistics + * -1: reset the statistics * 0: display meaning of the statistics - * >0: retry limit */ -static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, - size_t count, loff_t *data) +static ssize_t ptc_proc_write(struct file *file, const char __user *user, + size_t count, loff_t *data) { - long newmode; + int cpu; + int i; + int elements; + long input_arg; char optstr[64]; + struct ptc_stats *stat; if (count == 0 || count > sizeof(optstr)) return -EINVAL; if (copy_from_user(optstr, user, count)) return -EFAULT; optstr[count - 1] = '\0'; - if (strict_strtoul(optstr, 10, &newmode) < 0) { + + if (strict_strtol(optstr, 10, &input_arg) < 0) { printk(KERN_DEBUG "%s is invalid\n", optstr); return -EINVAL; } - if (newmode == 0) { + if (input_arg == 0) { + elements = sizeof(stat_description)/sizeof(*stat_description); printk(KERN_DEBUG "# cpu: cpu number\n"); - printk(KERN_DEBUG - "requestor: times this cpu was the flush requestor\n"); - printk(KERN_DEBUG - "requestee: times this cpu was requested to flush its TLBs\n"); - printk(KERN_DEBUG - "one: times requested to flush a single address\n"); - printk(KERN_DEBUG - "all: times requested to flush all TLB's\n"); - printk(KERN_DEBUG - "sretry: number of retries of source-side timeouts\n"); - printk(KERN_DEBUG - "dretry: number of retries of destination-side timeouts\n"); - printk(KERN_DEBUG - "ptc_i: times UV fell through to IPI-style flushes\n"); - printk(KERN_DEBUG - "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); - printk(KERN_DEBUG - "sflush_us: cycles spent in uv_flush_tlb_others()\n"); - printk(KERN_DEBUG - "dflush_us: cycles spent in handling flush requests\n"); - printk(KERN_DEBUG "sok: successes on retry\n"); - printk(KERN_DEBUG "dnomsg: interrupts with no message\n"); - printk(KERN_DEBUG - "dmult: interrupts with multiple messages\n"); - printk(KERN_DEBUG "starget: nodes targeted\n"); - } else { - uv_bau_retry_limit = newmode; - printk(KERN_DEBUG "timeout retry limit:%d\n", - uv_bau_retry_limit); + printk(KERN_DEBUG "Sender statistics:\n"); + for (i = 0; i < elements; i++) + printk(KERN_DEBUG "%s\n", stat_description[i]); + } else if (input_arg == -1) { + for_each_present_cpu(cpu) { + stat = &per_cpu(ptcstats, cpu); + memset(stat, 0, sizeof(struct ptc_stats)); + } + } + + return count; +} + +static int local_atoi(const char *name) +{ + int val = 0; + + for (;; name++) { + switch (*name) { + case '0' ... '9': + val = 10*val+(*name-'0'); + break; + default: + return val; + } + } +} + +/* + * Parse the values written to /sys/kernel/debug/sgi_uv/bau_tunables. + * Zero values reset them to defaults. + */ +static int parse_tunables_write(struct bau_control *bcp, char *instr, + int count) +{ + char *p; + char *q; + int cnt = 0; + int val; + int e = sizeof(tunables) / sizeof(*tunables); + + p = instr + strspn(instr, WHITESPACE); + q = p; + for (; *p; p = q + strspn(q, WHITESPACE)) { + q = p + strcspn(p, WHITESPACE); + cnt++; + if (q == p) + break; + } + if (cnt != e) { + printk(KERN_INFO "bau tunable error: should be %d values\n", e); + return -EINVAL; + } + + p = instr + strspn(instr, WHITESPACE); + q = p; + for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) { + q = p + strcspn(p, WHITESPACE); + val = local_atoi(p); + switch (cnt) { + case 0: + if (val == 0) { + max_concurr = MAX_BAU_CONCURRENT; + max_concurr_const = MAX_BAU_CONCURRENT; + continue; + } + if (val < 1 || val > bcp->cpus_in_uvhub) { + printk(KERN_DEBUG + "Error: BAU max concurrent %d is invalid\n", + val); + return -EINVAL; + } + max_concurr = val; + max_concurr_const = val; + continue; + default: + if (val == 0) + *tunables[cnt].tunp = tunables[cnt].deflt; + else + *tunables[cnt].tunp = val; + continue; + } + if (q == p) + break; } + return 0; +} + +/* + * Handle a write to debugfs. (/sys/kernel/debug/sgi_uv/bau_tunables) + */ +static ssize_t tunables_write(struct file *file, const char __user *user, + size_t count, loff_t *data) +{ + int cpu; + int ret; + char instr[100]; + struct bau_control *bcp; + if (count == 0 || count > sizeof(instr)-1) + return -EINVAL; + if (copy_from_user(instr, user, count)) + return -EFAULT; + + instr[count] = '\0'; + + cpu = get_cpu(); + bcp = &per_cpu(bau_control, cpu); + ret = parse_tunables_write(bcp, instr, count); + put_cpu(); + if (ret) + return ret; + + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->max_concurr = max_concurr; + bcp->max_concurr_const = max_concurr; + bcp->plugged_delay = plugged_delay; + bcp->plugsb4reset = plugsb4reset; + bcp->timeoutsb4reset = timeoutsb4reset; + bcp->ipi_reset_limit = ipi_reset_limit; + bcp->complete_threshold = complete_threshold; + bcp->cong_response_us = congested_respns_us; + bcp->cong_reps = congested_reps; + bcp->cong_period = congested_period; + } return count; } static const struct seq_operations uv_ptc_seq_ops = { - .start = uv_ptc_seq_start, - .next = uv_ptc_seq_next, - .stop = uv_ptc_seq_stop, - .show = uv_ptc_seq_show + .start = ptc_seq_start, + .next = ptc_seq_next, + .stop = ptc_seq_stop, + .show = ptc_seq_show }; -static int uv_ptc_proc_open(struct inode *inode, struct file *file) +static int ptc_proc_open(struct inode *inode, struct file *file) { return seq_open(file, &uv_ptc_seq_ops); } +static int tunables_open(struct inode *inode, struct file *file) +{ + return 0; +} + static const struct file_operations proc_uv_ptc_operations = { - .open = uv_ptc_proc_open, + .open = ptc_proc_open, .read = seq_read, - .write = uv_ptc_proc_write, + .write = ptc_proc_write, .llseek = seq_lseek, .release = seq_release, }; +static const struct file_operations tunables_fops = { + .open = tunables_open, + .read = tunables_read, + .write = tunables_write, + .llseek = default_llseek, +}; + static int __init uv_ptc_init(void) { struct proc_dir_entry *proc_uv_ptc; @@ -645,222 +1622,502 @@ static int __init uv_ptc_init(void) UV_PTC_BASENAME); return -EINVAL; } + + tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL); + if (!tunables_dir) { + printk(KERN_ERR "unable to create debugfs directory %s\n", + UV_BAU_TUNABLES_DIR); + return -EINVAL; + } + tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, + tunables_dir, NULL, &tunables_fops); + if (!tunables_file) { + printk(KERN_ERR "unable to create debugfs file %s\n", + UV_BAU_TUNABLES_FILE); + return -EINVAL; + } return 0; } /* - * begin the initialization of the per-blade control structures + * Initialize the sending side's sending buffers. */ -static struct bau_control * __init uv_table_bases_init(int blade, int node) +static void activation_descriptor_init(int node, int pnode, int base_pnode) { int i; - struct bau_msg_status *msp; - struct bau_control *bau_tabp; - - bau_tabp = - kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node); - BUG_ON(!bau_tabp); - - bau_tabp->msg_statuses = - kmalloc_node(sizeof(struct bau_msg_status) * - DEST_Q_SIZE, GFP_KERNEL, node); - BUG_ON(!bau_tabp->msg_statuses); - - for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++) - bau_cpubits_clear(&msp->seen_by, (int) - uv_blade_nr_possible_cpus(blade)); + int cpu; + int uv1 = 0; + unsigned long gpa; + unsigned long m; + unsigned long n; + size_t dsize; + struct bau_desc *bau_desc; + struct bau_desc *bd2; + struct uv1_bau_msg_header *uv1_hdr; + struct uv2_bau_msg_header *uv2_hdr; + struct bau_control *bcp; - uv_bau_table_bases[blade] = bau_tabp; + /* + * each bau_desc is 64 bytes; there are 8 (ITEMS_PER_DESC) + * per cpu; and one per cpu on the uvhub (ADP_SZ) + */ + dsize = sizeof(struct bau_desc) * ADP_SZ * ITEMS_PER_DESC; + bau_desc = kmalloc_node(dsize, GFP_KERNEL, node); + BUG_ON(!bau_desc); + + gpa = uv_gpa(bau_desc); + n = uv_gpa_to_gnode(gpa); + m = uv_gpa_to_offset(gpa); + if (is_uv1_hub()) + uv1 = 1; - return bau_tabp; + /* the 14-bit pnode */ + write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m)); + /* + * Initializing all 8 (ITEMS_PER_DESC) descriptors for each + * cpu even though we only use the first one; one descriptor can + * describe a broadcast to 256 uv hubs. + */ + for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) { + memset(bd2, 0, sizeof(struct bau_desc)); + if (uv1) { + uv1_hdr = &bd2->header.uv1_hdr; + uv1_hdr->swack_flag = 1; + /* + * The base_dest_nasid set in the message header + * is the nasid of the first uvhub in the partition. + * The bit map will indicate destination pnode numbers + * relative to that base. They may not be consecutive + * if nasid striding is being used. + */ + uv1_hdr->base_dest_nasid = + UV_PNODE_TO_NASID(base_pnode); + uv1_hdr->dest_subnodeid = UV_LB_SUBNODEID; + uv1_hdr->command = UV_NET_ENDPOINT_INTD; + uv1_hdr->int_both = 1; + /* + * all others need to be set to zero: + * fairness chaining multilevel count replied_to + */ + } else { + uv2_hdr = &bd2->header.uv2_hdr; + uv2_hdr->swack_flag = 1; + uv2_hdr->base_dest_nasid = + UV_PNODE_TO_NASID(base_pnode); + uv2_hdr->dest_subnodeid = UV_LB_SUBNODEID; + uv2_hdr->command = UV_NET_ENDPOINT_INTD; + } + } + for_each_present_cpu(cpu) { + if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu))) + continue; + bcp = &per_cpu(bau_control, cpu); + bcp->descriptor_base = bau_desc; + } } /* - * finish the initialization of the per-blade control structures + * initialize the destination side's receiving buffers + * entered for each uvhub in the partition + * - node is first node (kernel memory notion) on the uvhub + * - pnode is the uvhub's physical identifier */ -static void __init -uv_table_bases_finish(int blade, - struct bau_control *bau_tablesp, - struct bau_desc *adp) +static void pq_init(int node, int pnode) { - struct bau_control *bcp; int cpu; + size_t plsize; + char *cp; + void *vp; + unsigned long pn; + unsigned long first; + unsigned long pn_first; + unsigned long last; + struct bau_pq_entry *pqp; + struct bau_control *bcp; + + plsize = (DEST_Q_SIZE + 1) * sizeof(struct bau_pq_entry); + vp = kmalloc_node(plsize, GFP_KERNEL, node); + pqp = (struct bau_pq_entry *)vp; + BUG_ON(!pqp); + + cp = (char *)pqp + 31; + pqp = (struct bau_pq_entry *)(((unsigned long)cp >> 5) << 5); for_each_present_cpu(cpu) { - if (blade != uv_cpu_to_blade_id(cpu)) + if (pnode != uv_cpu_to_pnode(cpu)) continue; - - bcp = (struct bau_control *)&per_cpu(bau_control, cpu); - bcp->bau_msg_head = bau_tablesp->va_queue_first; - bcp->va_queue_first = bau_tablesp->va_queue_first; - bcp->va_queue_last = bau_tablesp->va_queue_last; - bcp->msg_statuses = bau_tablesp->msg_statuses; - bcp->descriptor_base = adp; + /* for every cpu on this pnode: */ + bcp = &per_cpu(bau_control, cpu); + bcp->queue_first = pqp; + bcp->bau_msg_head = pqp; + bcp->queue_last = pqp + (DEST_Q_SIZE - 1); } + /* + * need the gnode of where the memory was really allocated + */ + pn = uv_gpa_to_gnode(uv_gpa(pqp)); + first = uv_physnodeaddr(pqp); + pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first; + last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)); + write_mmr_payload_first(pnode, pn_first); + write_mmr_payload_tail(pnode, first); + write_mmr_payload_last(pnode, last); + write_gmmr_sw_ack(pnode, 0xffffUL); + + /* in effect, all msg_type's are set to MSG_NOOP */ + memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE); } /* - * initialize the sending side's sending buffers + * Initialization of each UV hub's structures */ -static struct bau_desc * __init -uv_activation_descriptor_init(int node, int pnode) +static void __init init_uvhub(int uvhub, int vector, int base_pnode) { - int i; - unsigned long pa; - unsigned long m; - unsigned long n; - struct bau_desc *adp; - struct bau_desc *ad2; + int node; + int pnode; + unsigned long apicid; + + node = uvhub_to_first_node(uvhub); + pnode = uv_blade_to_pnode(uvhub); + + activation_descriptor_init(node, pnode, base_pnode); + pq_init(node, pnode); /* - * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) - * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade + * The below initialization can't be in firmware because the + * messaging IRQ will be determined by the OS. */ - adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* - UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); - BUG_ON(!adp); + apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits; + write_mmr_data_config(pnode, ((apicid << 32) | vector)); +} - pa = uv_gpa(adp); /* need the real nasid*/ - n = uv_gpa_to_pnode(pa); - m = pa & uv_mmask; +/* + * We will set BAU_MISC_CONTROL with a timeout period. + * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT. + * So the destination timeout period has to be calculated from them. + */ +static int calculate_destination_timeout(void) +{ + unsigned long mmr_image; + int mult1; + int mult2; + int index; + int base; + int ret; + unsigned long ts_ns; + + if (is_uv1_hub()) { + mult1 = SOFTACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; + mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); + index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; + mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); + mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; + base = timeout_base_ns[index]; + ts_ns = base * mult1 * mult2; + ret = ts_ns / 1000; + } else { + /* 4 bits 0/1 for 10/80us base, 3 bits of multiplier */ + mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL); + mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT; + if (mmr_image & (1L << UV2_ACK_UNITS_SHFT)) + base = 80; + else + base = 10; + mult1 = mmr_image & UV2_ACK_MASK; + ret = mult1 * base; + } + return ret; +} - uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, - (n << UV_DESC_BASE_PNODE_SHIFT | m)); +static void __init init_per_cpu_tunables(void) +{ + int cpu; + struct bau_control *bcp; - /* - * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each - * cpu even though we only use the first one; one descriptor can - * describe a broadcast to 256 nodes. - */ - for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); - i++, ad2++) { - memset(ad2, 0, sizeof(struct bau_desc)); - ad2->header.sw_ack_flag = 1; - /* - * base_dest_nodeid is the first node in the partition, so - * the bit map will indicate partition-relative node numbers. - * note that base_dest_nodeid is actually a nasid. - */ - ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; - ad2->header.dest_subnodeid = 0x10; /* the LB */ - ad2->header.command = UV_NET_ENDPOINT_INTD; - ad2->header.int_both = 1; - /* - * all others need to be set to zero: - * fairness chaining multilevel count replied_to - */ + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->baudisabled = 0; + bcp->statp = &per_cpu(ptcstats, cpu); + /* time interval to catch a hardware stay-busy bug */ + bcp->timeout_interval = usec_2_cycles(2*timeout_us); + bcp->max_concurr = max_concurr; + bcp->max_concurr_const = max_concurr; + bcp->plugged_delay = plugged_delay; + bcp->plugsb4reset = plugsb4reset; + bcp->timeoutsb4reset = timeoutsb4reset; + bcp->ipi_reset_limit = ipi_reset_limit; + bcp->complete_threshold = complete_threshold; + bcp->cong_response_us = congested_respns_us; + bcp->cong_reps = congested_reps; + bcp->cong_period = congested_period; + bcp->clocks_per_100_usec = usec_2_cycles(100); + spin_lock_init(&bcp->queue_lock); + spin_lock_init(&bcp->uvhub_lock); } - return adp; } /* - * initialize the destination side's receiving buffers + * Scan all cpus to collect blade and socket summaries. */ -static struct bau_payload_queue_entry * __init -uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp) +static int __init get_cpu_topology(int base_pnode, + struct uvhub_desc *uvhub_descs, + unsigned char *uvhub_mask) { - struct bau_payload_queue_entry *pqp; - unsigned long pa; - int pn; - char *cp; + int cpu; + int pnode; + int uvhub; + int socket; + struct bau_control *bcp; + struct uvhub_desc *bdp; + struct socket_desc *sdp; - pqp = (struct bau_payload_queue_entry *) kmalloc_node( - (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), - GFP_KERNEL, node); - BUG_ON(!pqp); + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); - cp = (char *)pqp + 31; - pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); - bau_tablesp->va_queue_first = pqp; - /* - * need the pnode of where the memory was really allocated - */ - pa = uv_gpa(pqp); - pn = uv_gpa_to_pnode(pa); - uv_write_global_mmr64(pnode, - UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, - ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | - uv_physnodeaddr(pqp)); - uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, - uv_physnodeaddr(pqp)); - bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1); - uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, - (unsigned long) - uv_physnodeaddr(bau_tablesp->va_queue_last)); - memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); + memset(bcp, 0, sizeof(struct bau_control)); + + pnode = uv_cpu_hub_info(cpu)->pnode; + if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) { + printk(KERN_EMERG + "cpu %d pnode %d-%d beyond %d; BAU disabled\n", + cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE); + return 1; + } - return pqp; + bcp->osnode = cpu_to_node(cpu); + bcp->partition_base_pnode = base_pnode; + + uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; + *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); + bdp = &uvhub_descs[uvhub]; + + bdp->num_cpus++; + bdp->uvhub = uvhub; + bdp->pnode = pnode; + + /* kludge: 'assuming' one node per socket, and assuming that + disabling a socket just leaves a gap in node numbers */ + socket = bcp->osnode & 1; + bdp->socket_mask |= (1 << socket); + sdp = &bdp->socket[socket]; + sdp->cpu_number[sdp->num_cpus] = cpu; + sdp->num_cpus++; + if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) { + printk(KERN_EMERG "%d cpus per socket invalid\n", + sdp->num_cpus); + return 1; + } + } + return 0; } /* - * Initialization of each UV blade's structures + * Each socket is to get a local array of pnodes/hubs. */ -static int __init uv_init_blade(int blade) +static void make_per_cpu_thp(struct bau_control *smaster) { - int node; - int pnode; - unsigned long pa; - unsigned long apicid; - struct bau_desc *adp; - struct bau_payload_queue_entry *pqp; - struct bau_control *bau_tablesp; - - node = blade_to_first_node(blade); - bau_tablesp = uv_table_bases_init(blade, node); - pnode = uv_blade_to_pnode(blade); - adp = uv_activation_descriptor_init(node, pnode); - pqp = uv_payload_queue_init(node, pnode, bau_tablesp); - uv_table_bases_finish(blade, bau_tablesp, adp); - /* - * the below initialization can't be in firmware because the - * messaging IRQ will be determined by the OS - */ - apicid = blade_to_first_apicid(blade); - pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); - if ((pa & 0xff) != UV_BAU_MESSAGE) { - uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, - ((apicid << 32) | UV_BAU_MESSAGE)); + int cpu; + size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus(); + + smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode); + memset(smaster->thp, 0, hpsz); + for_each_present_cpu(cpu) { + smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode; + smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; + } +} + +/* + * Each uvhub is to get a local cpumask. + */ +static void make_per_hub_cpumask(struct bau_control *hmaster) +{ + int sz = sizeof(cpumask_t); + + hmaster->cpumask = kzalloc_node(sz, GFP_KERNEL, hmaster->osnode); +} + +/* + * Initialize all the per_cpu information for the cpu's on a given socket, + * given what has been gathered into the socket_desc struct. + * And reports the chosen hub and socket masters back to the caller. + */ +static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp, + struct bau_control **smasterp, + struct bau_control **hmasterp) +{ + int i; + int cpu; + struct bau_control *bcp; + + for (i = 0; i < sdp->num_cpus; i++) { + cpu = sdp->cpu_number[i]; + bcp = &per_cpu(bau_control, cpu); + bcp->cpu = cpu; + if (i == 0) { + *smasterp = bcp; + if (!(*hmasterp)) + *hmasterp = bcp; + } + bcp->cpus_in_uvhub = bdp->num_cpus; + bcp->cpus_in_socket = sdp->num_cpus; + bcp->socket_master = *smasterp; + bcp->uvhub = bdp->uvhub; + if (is_uv1_hub()) + bcp->uvhub_version = 1; + else if (is_uv2_hub()) + bcp->uvhub_version = 2; + else { + printk(KERN_EMERG "uvhub version not 1 or 2\n"); + return 1; + } + bcp->uvhub_master = *hmasterp; + bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id; + bcp->using_desc = bcp->uvhub_cpu; + if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) { + printk(KERN_EMERG "%d cpus per uvhub invalid\n", + bcp->uvhub_cpu); + return 1; + } + } + return 0; +} + +/* + * Summarize the blade and socket topology into the per_cpu structures. + */ +static int __init summarize_uvhub_sockets(int nuvhubs, + struct uvhub_desc *uvhub_descs, + unsigned char *uvhub_mask) +{ + int socket; + int uvhub; + unsigned short socket_mask; + + for (uvhub = 0; uvhub < nuvhubs; uvhub++) { + struct uvhub_desc *bdp; + struct bau_control *smaster = NULL; + struct bau_control *hmaster = NULL; + + if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) + continue; + + bdp = &uvhub_descs[uvhub]; + socket_mask = bdp->socket_mask; + socket = 0; + while (socket_mask) { + struct socket_desc *sdp; + if ((socket_mask & 1)) { + sdp = &bdp->socket[socket]; + if (scan_sock(sdp, bdp, &smaster, &hmaster)) + return 1; + make_per_cpu_thp(smaster); + } + socket++; + socket_mask = (socket_mask >> 1); + } + make_per_hub_cpumask(hmaster); } return 0; } /* + * initialize the bau_control structure for each cpu + */ +static int __init init_per_cpu(int nuvhubs, int base_part_pnode) +{ + unsigned char *uvhub_mask; + void *vp; + struct uvhub_desc *uvhub_descs; + + timeout_us = calculate_destination_timeout(); + + vp = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); + uvhub_descs = (struct uvhub_desc *)vp; + memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); + uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); + + if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask)) + goto fail; + + if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask)) + goto fail; + + kfree(uvhub_descs); + kfree(uvhub_mask); + init_per_cpu_tunables(); + return 0; + +fail: + kfree(uvhub_descs); + kfree(uvhub_mask); + return 1; +} + +/* * Initialization of BAU-related structures */ static int __init uv_bau_init(void) { - int blade; - int nblades; + int uvhub; + int pnode; + int nuvhubs; int cur_cpu; + int cpus; + int vector; + cpumask_var_t *mask; if (!is_uv_system()) return 0; - for_each_possible_cpu(cur_cpu) - zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), - GFP_KERNEL, cpu_to_node(cur_cpu)); - - uv_bau_retry_limit = 1; - uv_mmask = (1UL << uv_hub_info->m_val) - 1; - nblades = uv_num_possible_blades(); - - uv_bau_table_bases = (struct bau_control **) - kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL); - BUG_ON(!uv_bau_table_bases); - - uv_partition_base_pnode = 0x7fffffff; - for (blade = 0; blade < nblades; blade++) - if (uv_blade_nr_possible_cpus(blade) && - (uv_blade_to_pnode(blade) < uv_partition_base_pnode)) - uv_partition_base_pnode = uv_blade_to_pnode(blade); - for (blade = 0; blade < nblades; blade++) - if (uv_blade_nr_possible_cpus(blade)) - uv_init_blade(blade); + if (nobau) + return 0; + + for_each_possible_cpu(cur_cpu) { + mask = &per_cpu(uv_flush_tlb_mask, cur_cpu); + zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); + } + + nuvhubs = uv_num_possible_blades(); + spin_lock_init(&disable_lock); + congested_cycles = usec_2_cycles(congested_respns_us); + + uv_base_pnode = 0x7fffffff; + for (uvhub = 0; uvhub < nuvhubs; uvhub++) { + cpus = uv_blade_nr_possible_cpus(uvhub); + if (cpus && (uv_blade_to_pnode(uvhub) < uv_base_pnode)) + uv_base_pnode = uv_blade_to_pnode(uvhub); + } + + enable_timeouts(); + + if (init_per_cpu(nuvhubs, uv_base_pnode)) { + nobau = 1; + return 0; + } - alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); - uv_enable_timeouts(); + vector = UV_BAU_MESSAGE; + for_each_possible_blade(uvhub) + if (uv_blade_nr_possible_cpus(uvhub)) + init_uvhub(uvhub, vector, uv_base_pnode); + + alloc_intr_gate(vector, uv_bau_message_intr1); + + for_each_possible_blade(uvhub) { + if (uv_blade_nr_possible_cpus(uvhub)) { + unsigned long val; + unsigned long mmr; + pnode = uv_blade_to_pnode(uvhub); + /* INIT the bau */ + val = 1L << 63; + write_gmmr_activation(pnode, val); + mmr = 1; /* should be 1 to broadcast to both sockets */ + if (!is_uv1_hub()) + write_mmr_data_broadcast(pnode, mmr); + } + } return 0; } -__initcall(uv_bau_init); -__initcall(uv_ptc_init); +core_initcall(uv_bau_init); +fs_initcall(uv_ptc_init); diff -uprN linux-2.6.32/arch/x86/kernel/trace_clock.c linux-2.6.32.ovz/arch/x86/kernel/trace_clock.c --- linux-2.6.32/arch/x86/kernel/trace_clock.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/trace_clock.c 2015-03-10 13:23:58.000000000 -0700 @@ -0,0 +1,21 @@ +/* + * X86 trace clocks + */ +#include +#include +#include + +/* + * trace_clock_x86_tsc(): A clock that is just the cycle counter. + * + * Unlike the other clocks, this is not in nanoseconds. + */ +u64 notrace trace_clock_x86_tsc(void) +{ + u64 ret; + + rdtsc_barrier(); + rdtscll(ret); + + return ret; +} diff -uprN linux-2.6.32/arch/x86/kernel/tracepoint.c linux-2.6.32.ovz/arch/x86/kernel/tracepoint.c --- linux-2.6.32/arch/x86/kernel/tracepoint.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/tracepoint.c 2015-03-10 13:24:02.000000000 -0700 @@ -0,0 +1,61 @@ +/* + * Code for supporting irq vector tracepoints. + * + * Copyright (C) 2013 Seiji Aguchi + * + */ +#include +#include +#include + +atomic_t trace_idt_ctr = ATOMIC_INIT(0); +struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) trace_idt_table }; + +#ifndef CONFIG_X86_64 +gate_desc trace_idt_table[NR_VECTORS] __page_aligned_data + = { { { { 0, 0 } } }, }; +#endif + +static int trace_irq_vector_refcount; +static DEFINE_MUTEX(irq_vector_mutex); + +static void set_trace_idt_ctr(int val) +{ + atomic_set(&trace_idt_ctr, val); + /* Ensure the trace_idt_ctr is set before sending IPI */ + wmb(); +} + +static void switch_idt(void *arg) +{ + unsigned long flags; + + local_irq_save(flags); + load_current_idt(); + local_irq_restore(flags); +} + +void trace_irq_vector_regfunc(void) +{ + mutex_lock(&irq_vector_mutex); + if (!trace_irq_vector_refcount) { + set_trace_idt_ctr(1); + smp_call_function(switch_idt, NULL, 0); + switch_idt(NULL); + } + trace_irq_vector_refcount++; + mutex_unlock(&irq_vector_mutex); +} + +void trace_irq_vector_unregfunc(void) +{ + mutex_lock(&irq_vector_mutex); + trace_irq_vector_refcount--; + if (!trace_irq_vector_refcount) { + set_trace_idt_ctr(0); + smp_call_function(switch_idt, NULL, 0); + switch_idt(NULL); + } + mutex_unlock(&irq_vector_mutex); +} diff -uprN linux-2.6.32/arch/x86/kernel/trampoline.c linux-2.6.32.ovz/arch/x86/kernel/trampoline.c --- linux-2.6.32/arch/x86/kernel/trampoline.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/trampoline.c 2015-03-10 13:22:14.000000000 -0700 @@ -1,6 +1,7 @@ #include #include +#include #include #if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) @@ -39,3 +40,20 @@ unsigned long __trampinit setup_trampoli memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); } + +void __init setup_trampoline_page_table(void) +{ +#ifdef CONFIG_X86_32 + /* Copy kernel address range */ + clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min_t(unsigned long, KERNEL_PGD_PTRS, + KERNEL_PGD_BOUNDARY)); + + /* Initialize low mappings */ + clone_pgd_range(trampoline_pg_dir, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min_t(unsigned long, KERNEL_PGD_PTRS, + KERNEL_PGD_BOUNDARY)); +#endif +} diff -uprN linux-2.6.32/arch/x86/kernel/traps.c linux-2.6.32.ovz/arch/x86/kernel/traps.c --- linux-2.6.32/arch/x86/kernel/traps.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/traps.c 2015-06-23 04:16:16.000000000 -0700 @@ -82,6 +82,8 @@ EXPORT_SYMBOL_GPL(used_vectors); static int ignore_nmis; +int unknown_nmi_panic; + static inline void conditional_sti(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) @@ -109,11 +111,74 @@ static inline void preempt_conditional_c } #ifdef CONFIG_X86_32 -static inline void -die_if_kernel(const char *str, struct pt_regs *regs, long err) +static inline int +__compare_user_cs_desc(const struct desc_struct *desc1, + const struct desc_struct *desc2) +{ + return ((desc1->limit0 != desc2->limit0) || + (desc1->limit != desc2->limit) || + (desc1->base0 != desc2->base0) || + (desc1->base1 != desc2->base1) || + (desc1->base2 != desc2->base2)); +} + +/* + * lazy-check for CS validity on exec-shield binaries: + * + * the original non-exec stack patch was written by + * Solar Designer . Thanks! + */ +static int +check_lazy_exec_limit(int cpu, struct pt_regs *regs, long error_code) { - if (!user_mode_vm(regs)) - die(str, regs, err); + struct desc_struct *desc1, *desc2; + struct vm_area_struct *vma; + unsigned long limit; + + if (current->mm == NULL) + return 0; + + limit = -1UL; + if (current->mm->context.exec_limit != -1UL) { + limit = PAGE_SIZE; + spin_lock(¤t->mm->page_table_lock); + for (vma = current->mm->mmap; vma; vma = vma->vm_next) + if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) + limit = vma->vm_end; + vma = get_gate_vma(current->mm); + if (vma && (vma->vm_flags & VM_EXEC) && (vma->vm_end > limit)) + limit = vma->vm_end; + spin_unlock(¤t->mm->page_table_lock); + if (limit >= TASK_SIZE) + limit = -1UL; + current->mm->context.exec_limit = limit; + } + set_user_cs(¤t->mm->context.user_cs, limit); + + desc1 = ¤t->mm->context.user_cs; + desc2 = get_cpu_gdt_table(cpu) + GDT_ENTRY_DEFAULT_USER_CS; + + if (__compare_user_cs_desc(desc1, desc2)) { + /* + * The CS was not in sync - reload it and retry the + * instruction. If the instruction still faults then + * we won't hit this branch next time around. + */ + if (print_fatal_signals >= 2) { + printk(KERN_ERR "#GPF fixup (%ld[seg:%lx]) at %08lx, CPU#%d.\n", + error_code, error_code/8, regs->ip, + smp_processor_id()); + printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x, CPU_cs: %08x/%08x.\n", + current->mm->context.exec_limit, + desc1->a, desc1->b, desc2->a, desc2->b); + } + + load_user_cs_desc(cpu, current->mm); + + return 1; + } + + return 0; } #endif @@ -220,23 +285,11 @@ DO_ERROR_INFO(6, SIGILL, "invalid opcode DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -#ifdef CONFIG_X86_32 DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -#endif DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) #ifdef CONFIG_X86_64 /* Runs on IST stack */ -dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) -{ - if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - 12, SIGBUS) == NOTIFY_STOP) - return; - preempt_conditional_sti(regs); - do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); -} - dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) { static const char str[] = "double fault"; @@ -257,6 +310,27 @@ dotraplinkage void do_double_fault(struc } #endif +static int check_cpuid_fault(struct pt_regs *regs, long error_code) +{ + unsigned long addr; + unsigned short opcode; + + if (error_code != 0) + return 0; + + addr = convert_ip_to_linear(current, regs); + if (get_user(opcode, (unsigned short __user *)addr)) + return 0; + + if (opcode != 0xa20f) + return 0; + + do_cpuid_fault(regs); + + regs->ip += 2; + return 1; +} + dotraplinkage void __kprobes do_general_protection(struct pt_regs *regs, long error_code) { @@ -273,6 +347,32 @@ do_general_protection(struct pt_regs *re if (!user_mode(regs)) goto gp_in_kernel; + if (check_cpuid_fault(regs, error_code)) + return; + +#ifdef CONFIG_X86_32 +{ + int cpu; + int ok; + + cpu = get_cpu(); + ok = check_lazy_exec_limit(cpu, regs, error_code); + put_cpu(); + + if (ok) + return; + + if (print_fatal_signals) { + printk(KERN_ERR "#GPF(%ld[seg:%lx]) at %08lx, CPU#%d.\n", + error_code, error_code/8, regs->ip, smp_processor_id()); + printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x.\n", + current->mm->context.exec_limit, + current->mm->context.user_cs.a, + current->mm->context.user_cs.b); + } +} +#endif /*CONFIG_X86_32*/ + tsk->thread.error_code = error_code; tsk->thread.trap_no = 13; @@ -308,9 +408,20 @@ gp_in_kernel: die("general protection fault", regs, error_code); } +static int __init setup_unknown_nmi_panic(char *str) +{ + unknown_nmi_panic = 1; + return 1; +} +__setup("unknown_nmi_panic", setup_unknown_nmi_panic); + static notrace __kprobes void mem_parity_error(unsigned char reason, struct pt_regs *regs) { + if (notify_die(DIE_NMIMEMPARITY, "nmi", regs, reason, 2, SIGINT) == + NOTIFY_STOP) + return; + printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", reason, smp_processor_id()); @@ -340,6 +451,10 @@ io_check_error(unsigned char reason, str { unsigned long i; + if (notify_die(DIE_NMIIOCHECK, "nmi", regs, reason, 2, SIGINT) == + NOTIFY_STOP) + return; + printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); @@ -379,7 +494,7 @@ unknown_nmi_error(unsigned char reason, reason, smp_processor_id()); printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); - if (panic_on_unrecovered_nmi) + if (unknown_nmi_panic || panic_on_unrecovered_nmi) panic("NMI: Not continuing"); printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); @@ -400,16 +515,27 @@ static notrace __kprobes void default_do if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; + #ifdef CONFIG_X86_LOCAL_APIC + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; + +#ifndef CONFIG_LOCKUP_DETECTOR /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. */ - if (nmi_watchdog_tick(regs, reason)) + if (nmi_watchdog_tick(regs, reason) + + do_nmi_show_regs(regs, cpu)) return; if (!do_nmi_callback(regs, cpu)) +#endif /* !CONFIG_LOCKUP_DETECTOR */ + if (!do_nmi_show_regs(regs, cpu)) unknown_nmi_error(reason, regs); #else + if (do_nmi_show_regs(regs, cpu)) + return; unknown_nmi_error(reason, regs); #endif @@ -538,12 +664,6 @@ dotraplinkage void __kprobes do_debug(st if (condition & DR_STEP && kmemcheck_trap(regs)) return; - /* - * The processor cleared BTF, so don't mark that we need it set. - */ - clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); - tsk->thread.debugctlmsr = 0; - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) return; @@ -561,6 +681,10 @@ dotraplinkage void __kprobes do_debug(st if (regs->flags & X86_VM_MASK) goto debug_vm86; #endif + /* + * The processor cleared BTF, so don't mark that we need it set. + */ + clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); /* Save debug status register where ptrace can see it */ tsk->thread.debugreg6 = condition; @@ -747,30 +871,14 @@ do_simd_coprocessor_error(struct pt_regs conditional_sti(regs); #ifdef CONFIG_X86_32 - if (cpu_has_xmm) { - /* Handle SIMD FPU exceptions on PIII+ processors. */ - ignore_fpu_irq = 1; - simd_math_error((void __user *)regs->ip); - return; - } - /* - * Handle strange cache flush from user space exception - * in all other cases. This is undocumented behaviour. - */ - if (regs->flags & X86_VM_MASK) { - handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); - return; - } - current->thread.trap_no = 19; - current->thread.error_code = error_code; - die_if_kernel("cache flush denied", regs, error_code); - force_sig(SIGSEGV, current); + ignore_fpu_irq = 1; #else if (!user_mode(regs) && kernel_math_error(regs, "kernel simd math error", 19)) return; - simd_math_error((void __user *)regs->ip); #endif + + simd_math_error((void __user *)regs->ip); } dotraplinkage void @@ -881,19 +989,37 @@ do_device_not_available(struct pt_regs * } #ifdef CONFIG_X86_32 +/* + * The fixup code for errors in iret jumps to here (iret_exc). It loses + * the original trap number and erorr code. The bogus trap 32 and error + * code 0 are what the vanilla kernel delivers via: + * DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1) + * + * NOTE: Because of the final "1" in the macro we need to enable interrupts. + * + * In case of a general protection fault in the iret instruction, we + * need to check for a lazy CS update for exec-shield. + */ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) { - siginfo_t info; + int ok; + int cpu; + local_irq_enable(); - info.si_signo = SIGILL; - info.si_errno = 0; - info.si_code = ILL_BADSTK; - info.si_addr = NULL; - if (notify_die(DIE_TRAP, "iret exception", - regs, error_code, 32, SIGILL) == NOTIFY_STOP) - return; - do_trap(32, SIGILL, "iret exception", regs, error_code, &info); + cpu = get_cpu(); + ok = check_lazy_exec_limit(cpu, regs, error_code); + put_cpu(); + + if (!ok && notify_die(DIE_TRAP, "iret exception", regs, + error_code, 32, SIGSEGV) != NOTIFY_STOP) { + siginfo_t info; + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = ILL_BADSTK; + info.si_addr = 0; + do_trap(32, SIGSEGV, "iret exception", regs, error_code, &info); + } } #endif @@ -927,7 +1053,7 @@ void __init trap_init(void) set_intr_gate(9, &coprocessor_segment_overrun); set_intr_gate(10, &invalid_TSS); set_intr_gate(11, &segment_not_present); - set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); + set_intr_gate(12, &stack_segment); set_intr_gate(13, &general_protection); set_intr_gate(14, &page_fault); set_intr_gate(15, &spurious_interrupt_bug); diff -uprN linux-2.6.32/arch/x86/kernel/tsc.c linux-2.6.32.ovz/arch/x86/kernel/tsc.c --- linux-2.6.32/arch/x86/kernel/tsc.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/tsc.c 2015-03-10 13:24:07.000000000 -0700 @@ -36,7 +36,15 @@ static int __read_mostly tsc_unstable; erroneous rdtsc usage on !cpu_has_tsc processors */ static int __read_mostly tsc_disabled = -1; -static int tsc_clocksource_reliable; +/* Intel Sandybridge(6,45) or "E5" processors have some initialization + * issues with the TSC. In order to debug these we may need to know the + * some additional information about the TSC at system boot time. + * This debug output can also be enabled by addind "tsc_init_debug" as a + * kernel parameter. + */ +static int __read_mostly tsc_init_debug = 0; + +int tsc_clocksource_reliable; /* * Scheduler clock - returns current time in nanosec units. */ @@ -113,6 +121,13 @@ static int __init tsc_setup(char *str) __setup("tsc=", tsc_setup); +static int __init tsc_init_debug_setup(char *str) +{ + tsc_init_debug = 1; + return 1; +} +__setup("tsc_init_debug", tsc_init_debug_setup); + #define MAX_RETRIES 5 #define SMI_TRESHOLD 50000 @@ -287,14 +302,15 @@ static inline int pit_verify_msb(unsigne static inline int pit_expect_msb(unsigned char val, u64 *tscp, unsigned long *deltap) { int count; - u64 tsc = 0; + u64 tsc = 0, prev_tsc = 0; for (count = 0; count < 50000; count++) { if (!pit_verify_msb(val)) break; + prev_tsc = tsc; tsc = get_cycles(); } - *deltap = get_cycles() - tsc; + *deltap = get_cycles() - prev_tsc; *tscp = tsc; /* @@ -308,9 +324,9 @@ static inline int pit_expect_msb(unsigne * How many MSB values do we want to see? We aim for * a maximum error rate of 500ppm (in practice the * real error is much smaller), but refuse to spend - * more than 25ms on it. + * more than 50ms on it. */ -#define MAX_QUICK_PIT_MS 25 +#define MAX_QUICK_PIT_MS 50 #define MAX_QUICK_PIT_ITERATIONS (MAX_QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) static unsigned long quick_pit_calibrate(void) @@ -380,15 +396,12 @@ success: * * As a result, we can depend on there not being * any odd delays anywhere, and the TSC reads are - * reliable (within the error). We also adjust the - * delta to the middle of the error bars, just - * because it looks nicer. + * reliable (within the error). * * kHz = ticks / time-in-seconds / 1000; * kHz = (t2 - t1) / (I * 256 / PIT_TICK_RATE) / 1000 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (I * 256 * 1000) */ - delta += (long)(d2 - d1)/2; delta *= PIT_TICK_RATE; do_div(delta, i*256*1000); printk("Fast TSC calibration using PIT\n"); @@ -618,14 +631,54 @@ static void set_cyc2ns_scale(unsigned lo ns_now = __cycles_2_ns(tsc_now); if (cpu_khz) { - *scale = (NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR)/cpu_khz; - *offset = ns_now - (tsc_now * *scale >> CYC2NS_SCALE_FACTOR); + *scale = ((NSEC_PER_MSEC << CYC2NS_SCALE_FACTOR) + + cpu_khz / 2) / cpu_khz; + *offset = ns_now - mult_frac(tsc_now, *scale, + (1UL << CYC2NS_SCALE_FACTOR)); } sched_clock_idle_wakeup_event(0); local_irq_restore(flags); } +static unsigned long long cyc2ns_suspend; + +void tsc_save_sched_clock_state(void) +{ + if (!sched_clock_stable) + return; + + cyc2ns_suspend = sched_clock(); +} + +/* + * Even on processors with invariant TSC, TSC gets reset in some the + * ACPI system sleep states. And in some systems BIOS seem to reinit TSC to + * arbitrary value (still sync'd across cpu's) during resume from such sleep + * states. To cope up with this, recompute the cyc2ns_offset for each cpu so + * that sched_clock() continues from the point where it was left off during + * suspend. + */ +void tsc_restore_sched_clock_state(void) +{ + unsigned long long offset; + unsigned long flags; + int cpu; + + if (!sched_clock_stable) + return; + + local_irq_save(flags); + + __get_cpu_var(cyc2ns_offset) = 0; + offset = cyc2ns_suspend - sched_clock(); + + for_each_possible_cpu(cpu) + per_cpu(cyc2ns_offset, cpu) = offset; + + local_irq_restore(flags); +} + #ifdef CONFIG_CPU_FREQ /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency @@ -763,6 +816,7 @@ void mark_tsc_unstable(char *reason) { if (!tsc_unstable) { tsc_unstable = 1; + sched_clock_stable = 0; printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); /* Change only the rating, when not registered */ if (clocksource_tsc.mult) @@ -829,6 +883,9 @@ __cpuinit int unsynchronized_tsc(void) if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) return 0; + + if (tsc_clocksource_reliable) + return 0; /* * Intel systems are normally all synchronized. * Exceptions must mark TSC as unstable: @@ -836,14 +893,92 @@ __cpuinit int unsynchronized_tsc(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { /* assume multi socket systems are not synchronized: */ if (num_possible_cpus() > 1) - tsc_unstable = 1; + return 1; } - return tsc_unstable; + return 0; +} + + +static void tsc_refine_calibration_work(struct work_struct *work); +static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work); +/** + * tsc_refine_calibration_work - Further refine tsc freq calibration + * @work - ignored. + * + * This functions uses delayed work over a period of a + * second to further refine the TSC freq value. Since this is + * timer based, instead of loop based, we don't block the boot + * process while this longer calibration is done. + * + * If there are any calibration anomolies (too many SMIs, etc), + * or the refined calibration is off by 1% of the fast early + * calibration, we throw out the new calibration and use the + * early calibration. + */ +static void tsc_refine_calibration_work(struct work_struct *work) +{ + static u64 tsc_start = -1, ref_start; + static int hpet; + u64 tsc_stop, ref_stop, delta; + unsigned long freq; + + /* Don't bother refining TSC on unstable systems */ + if (check_tsc_unstable()) + goto out; + + /* + * Since the work is started early in boot, we may be + * delayed the first time we expire. So set the workqueue + * again once we know timers are working. + */ + if (tsc_start == -1) { + /* + * Only set hpet once, to avoid mixing hardware + * if the hpet becomes enabled later. + */ + hpet = is_hpet_enabled(); + schedule_delayed_work(&tsc_irqwork, HZ); + tsc_start = tsc_read_refs(&ref_start, hpet); + return; + } + + tsc_stop = tsc_read_refs(&ref_stop, hpet); + + /* hpet or pmtimer available ? */ + if (!hpet && !ref_start && !ref_stop) + goto out; + + /* Check, whether the sampling was disturbed by an SMI */ + if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX) + goto out; + + delta = tsc_stop - tsc_start; + delta *= 1000000LL; + if (hpet) + freq = calc_hpet_ref(delta, ref_start, ref_stop); + else + freq = calc_pmtimer_ref(delta, ref_start, ref_stop); + + /* Make sure we're within 1% */ + if (abs(tsc_khz - freq) > tsc_khz/100) + goto out; + + tsc_khz = freq; + printk(KERN_INFO "Refined TSC clocksource calibration: " + "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000, + (unsigned long)tsc_khz % 1000); + +out: + clocksource_register_khz(&clocksource_tsc, tsc_khz); } -static void __init init_tsc_clocksource(void) + +static int __init init_tsc_clocksource(void) { + if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz) + return 0; + clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, clocksource_tsc.shift); if (tsc_clocksource_reliable) @@ -853,62 +988,14 @@ static void __init init_tsc_clocksource( clocksource_tsc.rating = 0; clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; } - clocksource_register(&clocksource_tsc); + schedule_delayed_work(&tsc_irqwork, 0); + return 0; } - -#ifdef CONFIG_X86_64 /* - * calibrate_cpu is used on systems with fixed rate TSCs to determine - * processor frequency + * We use device_initcall here, to ensure we run after the hpet + * is fully initialized, which may occur at fs_initcall time. */ -#define TICK_COUNT 100000000 -static unsigned long __init calibrate_cpu(void) -{ - int tsc_start, tsc_now; - int i, no_ctr_free; - unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; - unsigned long flags; - - for (i = 0; i < 4; i++) - if (avail_to_resrv_perfctr_nmi_bit(i)) - break; - no_ctr_free = (i == 4); - if (no_ctr_free) { - WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... " - "cpu_khz value may be incorrect.\n"); - i = 3; - rdmsrl(MSR_K7_EVNTSEL3, evntsel3); - wrmsrl(MSR_K7_EVNTSEL3, 0); - rdmsrl(MSR_K7_PERFCTR3, pmc3); - } else { - reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); - reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - local_irq_save(flags); - /* start measuring cycles, incrementing from 0 */ - wrmsrl(MSR_K7_PERFCTR0 + i, 0); - wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); - rdtscl(tsc_start); - do { - rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles(); - } while ((tsc_now - tsc_start) < TICK_COUNT); - - local_irq_restore(flags); - if (no_ctr_free) { - wrmsrl(MSR_K7_EVNTSEL3, 0); - wrmsrl(MSR_K7_PERFCTR3, pmc3); - wrmsrl(MSR_K7_EVNTSEL3, evntsel3); - } else { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - - return pmc_now * tsc_khz / (tsc_now - tsc_start); -} -#else -static inline unsigned long calibrate_cpu(void) { return cpu_khz; } -#endif +device_initcall(init_tsc_clocksource); void __init tsc_init(void) { @@ -920,6 +1007,15 @@ void __init tsc_init(void) if (!cpu_has_tsc) return; + if (tsc_init_debug || + ((boot_cpu_data.x86 == 6) && + ((boot_cpu_data.x86_model == 45) || + (boot_cpu_data.x86_model == 62)))) { + printk(KERN_INFO + "TSC: cpu family %d model %d, tsc initial value = %llx\n", + boot_cpu_data.x86, boot_cpu_data.x86_model, + (unsigned long long)get_cycles()); + } tsc_khz = x86_platform.calibrate_tsc(); cpu_khz = tsc_khz; @@ -928,10 +1024,6 @@ void __init tsc_init(void) return; } - if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) && - (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)) - cpu_khz = calibrate_cpu(); - printk("Detected %lu.%03lu MHz processor.\n", (unsigned long)cpu_khz / 1000, (unsigned long)cpu_khz % 1000); @@ -963,6 +1055,25 @@ void __init tsc_init(void) mark_tsc_unstable("TSCs unsynchronized"); check_system_tsc_reliable(); - init_tsc_clocksource(); } +#ifdef CONFIG_SMP +/* + * If we have a constant TSC and are using the TSC for the delay loop, + * we can skip clock calibration if another cpu in the same socket has already + * been calibrated. This assumes that CONSTANT_TSC applies to all + * cpus in the socket - this should be a safe assumption. + */ +unsigned long __cpuinit calibrate_delay_is_known(void) +{ + int i, cpu = smp_processor_id(); + + if (!tsc_disabled && !cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC)) + return 0; + + for_each_online_cpu(i) + if (cpu_data(i).phys_proc_id == cpu_data(cpu).phys_proc_id) + return cpu_data(i).loops_per_jiffy; + return 0; +} +#endif diff -uprN linux-2.6.32/arch/x86/kernel/tsc_sync.c linux-2.6.32.ovz/arch/x86/kernel/tsc_sync.c --- linux-2.6.32/arch/x86/kernel/tsc_sync.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/tsc_sync.c 2015-06-23 04:16:16.000000000 -0700 @@ -113,14 +113,17 @@ void __cpuinit check_tsc_sync_source(int if (unsynchronized_tsc()) return; - if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); + if (tsc_clocksource_reliable) { + if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) + pr_info( + "Skipped synchronization checks as TSC is reliable.\n"); return; } - pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:", - smp_processor_id(), cpu); - +#ifdef CONFIG_VE + /* TSC reset. kill whatever might rely on old values */ + VE_TASK_INFO(current)->wakeup_stamp = 0; +#endif /* * Reset it - in case this is a second bootup: */ @@ -142,12 +145,14 @@ void __cpuinit check_tsc_sync_source(int cpu_relax(); if (nr_warps) { - printk("\n"); + pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n", + smp_processor_id(), cpu); pr_warning("Measured %Ld cycles TSC warp between CPUs, " "turning off TSC clock.\n", max_warp); mark_tsc_unstable("check_tsc_sync_source failed"); } else { - printk(" passed.\n"); + pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n", + smp_processor_id(), cpu); } /* @@ -171,7 +176,7 @@ void __cpuinit check_tsc_sync_target(voi { int cpus = 2; - if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) + if (unsynchronized_tsc() || tsc_clocksource_reliable) return; /* diff -uprN linux-2.6.32/arch/x86/kernel/uv_irq.c linux-2.6.32.ovz/arch/x86/kernel/uv_irq.c --- linux-2.6.32/arch/x86/kernel/uv_irq.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/uv_irq.c 2015-03-10 13:23:11.000000000 -0700 @@ -9,10 +9,25 @@ */ #include +#include #include #include #include +#include + +/* MMR offset and pnode of hub sourcing interrupts for a given irq */ +struct uv_irq_2_mmr_pnode{ + struct rb_node list; + unsigned long offset; + int pnode; + int irq; +}; + +static DEFINE_SPINLOCK(uv_irq_lock); +static struct rb_root uv_irq_root; + +static int uv_set_irq_affinity(unsigned int, const struct cpumask *); static void uv_noop(unsigned int irq) { @@ -28,7 +43,7 @@ static void uv_ack_apic(unsigned int irq ack_APIC_irq(); } -struct irq_chip uv_irq_chip = { +static struct irq_chip uv_irq_chip = { .name = "UV-CORE", .startup = uv_noop_ret, .shutdown = uv_noop, @@ -39,25 +54,213 @@ struct irq_chip uv_irq_chip = { .unmask = uv_noop, .eoi = uv_ack_apic, .end = uv_noop, + .set_affinity = uv_set_irq_affinity, }; /* + * Add offset and pnode information of the hub sourcing interrupts to the + * rb tree for a specific irq. + */ +static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade) +{ + struct rb_node **link = &uv_irq_root.rb_node; + struct rb_node *parent = NULL; + struct uv_irq_2_mmr_pnode *n; + struct uv_irq_2_mmr_pnode *e; + unsigned long irqflags; + + n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL, + uv_blade_to_memory_nid(blade)); + if (!n) + return -ENOMEM; + + n->irq = irq; + n->offset = offset; + n->pnode = uv_blade_to_pnode(blade); + spin_lock_irqsave(&uv_irq_lock, irqflags); + /* Find the right place in the rbtree: */ + while (*link) { + parent = *link; + e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list); + + if (unlikely(irq == e->irq)) { + /* irq entry exists */ + e->pnode = uv_blade_to_pnode(blade); + e->offset = offset; + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + kfree(n); + return 0; + } + + if (irq < e->irq) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + /* Insert the node into the rbtree. */ + rb_link_node(&n->list, parent, link); + rb_insert_color(&n->list, &uv_irq_root); + + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return 0; +} + +/* Retrieve offset and pnode information from the rb tree for a specific irq */ +int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) +{ + struct uv_irq_2_mmr_pnode *e; + struct rb_node *n; + unsigned long irqflags; + + spin_lock_irqsave(&uv_irq_lock, irqflags); + n = uv_irq_root.rb_node; + while (n) { + e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); + + if (e->irq == irq) { + *offset = e->offset; + *pnode = e->pnode; + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return 0; + } + + if (irq < e->irq) + n = n->rb_left; + else + n = n->rb_right; + } + spin_unlock_irqrestore(&uv_irq_lock, irqflags); + return -1; +} + +/* + * Re-target the irq to the specified CPU and enable the specified MMR located + * on the specified blade to allow the sending of MSIs to the specified CPU. + */ +static int +arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, + unsigned long mmr_offset, int limit) +{ + const struct cpumask *eligible_cpu = cpumask_of(cpu); + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg; + int mmr_pnode; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + int err; + + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + cfg = irq_cfg(irq); + + err = assign_irq_vector(irq, cfg, eligible_cpu); + if (err != 0) + return err; + + if (limit == UV_AFFINITY_CPU) + desc->status |= IRQ_NO_BALANCING; + else + desc->status |= IRQ_MOVE_PCNTXT; + + set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, + irq_name); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = apic->cpu_mask_to_apicid(eligible_cpu); + + mmr_pnode = uv_blade_to_pnode(mmr_blade); + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + return irq; +} + +/* + * Disable the specified MMR located on the specified blade so that MSIs are + * longer allowed to be sent. + */ +static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset) +{ + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + + BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != + sizeof(unsigned long)); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + entry->mask = 1; + + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); +} + +static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_cfg *cfg = desc->chip_data; + unsigned int dest; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + unsigned long mmr_offset; + int mmr_pnode; + + if (set_desc_affinity(desc, mask, &dest)) + return -1; + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + + entry->vector = cfg->vector; + entry->delivery_mode = apic->irq_delivery_mode; + entry->dest_mode = apic->irq_dest_mode; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = dest; + + /* Get previously stored MMR and pnode of hub sourcing interrupts */ + if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode)) + return -1; + + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + if (cfg->move_in_progress) + send_cleanup_vector(cfg); + + return 0; +} + +/* * Set up a mapping of an available irq and vector, and enable the specified * MMR that defines the MSI that is to be sent to the specified CPU when an * interrupt is raised. */ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, - unsigned long mmr_offset) + unsigned long mmr_offset, int limit) { - int irq; - int ret; + int irq, ret; + + irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade)); - irq = create_irq(); if (irq <= 0) return -EBUSY; - ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); - if (ret != irq) + ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, + limit); + if (ret == irq) + uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); + else destroy_irq(irq); return ret; @@ -71,9 +274,28 @@ EXPORT_SYMBOL_GPL(uv_setup_irq); * * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). */ -void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) +void uv_teardown_irq(unsigned int irq) { - arch_disable_uv_irq(mmr_blade, mmr_offset); + struct uv_irq_2_mmr_pnode *e; + struct rb_node *n; + unsigned long irqflags; + + spin_lock_irqsave(&uv_irq_lock, irqflags); + n = uv_irq_root.rb_node; + while (n) { + e = rb_entry(n, struct uv_irq_2_mmr_pnode, list); + if (e->irq == irq) { + arch_disable_uv_irq(e->pnode, e->offset); + rb_erase(n, &uv_irq_root); + kfree(e); + break; + } + if (irq < e->irq) + n = n->rb_left; + else + n = n->rb_right; + } + spin_unlock_irqrestore(&uv_irq_lock, irqflags); destroy_irq(irq); } EXPORT_SYMBOL_GPL(uv_teardown_irq); diff -uprN linux-2.6.32/arch/x86/kernel/uv_sysfs.c linux-2.6.32.ovz/arch/x86/kernel/uv_sysfs.c --- linux-2.6.32/arch/x86/kernel/uv_sysfs.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/uv_sysfs.c 2015-03-10 13:21:31.000000000 -0700 @@ -54,19 +54,19 @@ static int __init sgi_uv_sysfs_init(void if (!sgi_uv_kobj) sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); if (!sgi_uv_kobj) { - printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n"); + printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n"); return -EINVAL; } ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); if (ret) { - printk(KERN_WARNING "sysfs_create_file partition_id failed \n"); + printk(KERN_WARNING "sysfs_create_file partition_id failed\n"); return ret; } ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); if (ret) { - printk(KERN_WARNING "sysfs_create_file coherence_id failed \n"); + printk(KERN_WARNING "sysfs_create_file coherence_id failed\n"); return ret; } diff -uprN linux-2.6.32/arch/x86/kernel/uv_time.c linux-2.6.32.ovz/arch/x86/kernel/uv_time.c --- linux-2.6.32/arch/x86/kernel/uv_time.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/uv_time.c 2015-03-10 13:24:02.000000000 -0700 @@ -15,7 +15,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * - * Copyright (c) 2009 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009-2013 Silicon Graphics, Inc. All Rights Reserved. * Copyright (c) Dimitri Sivanich */ #include @@ -36,7 +36,7 @@ static void uv_rtc_timer_setup(enum cloc static struct clocksource clocksource_uv = { .name = RTC_NAME, - .rating = 400, + .rating = 299, .read = uv_read_rtc, .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, .shift = 10, @@ -74,7 +74,7 @@ struct uv_rtc_timer_head { */ static struct uv_rtc_timer_head **blade_info __read_mostly; -static int uv_rtc_enable; +static int uv_rtc_evt_enable; /* * Hardware interface routines @@ -88,9 +88,10 @@ static void uv_rtc_send_IPI(int cpu) apicid = cpu_physical_id(cpu); pnode = uv_apicid_to_pnode(apicid); + apicid |= uv_apicid_hibits; val = (1UL << UVH_IPI_INT_SEND_SHFT) | (apicid << UVH_IPI_INT_APIC_ID_SHFT) | - (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); + (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT); uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } @@ -98,32 +99,45 @@ static void uv_rtc_send_IPI(int cpu) /* Check for an RTC interrupt pending */ static int uv_intr_pending(int pnode) { - return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & - UVH_EVENT_OCCURRED0_RTC1_MASK; + if (is_uv1_hub()) + return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & + UV1H_EVENT_OCCURRED0_RTC1_MASK; + else if (is_uvx_hub()) + return uv_read_global_mmr64(pnode, UVXH_EVENT_OCCURRED2) & + UVXH_EVENT_OCCURRED2_RTC_1_MASK; + return 0; } /* Setup interrupt and return non-zero if early expiration occurred. */ static int uv_setup_intr(int cpu, u64 expires) { u64 val; + unsigned long apicid = cpu_physical_id(cpu) | uv_apicid_hibits; int pnode = uv_cpu_to_pnode(cpu); uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, UVH_RTC1_INT_CONFIG_M_MASK); uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); - uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, - UVH_EVENT_OCCURRED0_RTC1_MASK); + if (is_uv1_hub()) + uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, + UV1H_EVENT_OCCURRED0_RTC1_MASK); + else + uv_write_global_mmr64(pnode, UVXH_EVENT_OCCURRED2_ALIAS, + UVXH_EVENT_OCCURRED2_RTC_1_MASK); - val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | - ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); + val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | + ((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); /* Set configuration */ uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); /* Initialize comparator value */ uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); - return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode)); + if (uv_read_rtc(NULL) <= expires) + return 0; + + return !uv_intr_pending(pnode); } /* @@ -223,6 +237,7 @@ static int uv_rtc_set_timer(int cpu, u64 next_cpu = head->next_cpu; *t = expires; + /* Will this one be next to go off? */ if (next_cpu < 0 || bcpu == next_cpu || expires < head->cpu[next_cpu].expires) { @@ -231,7 +246,7 @@ static int uv_rtc_set_timer(int cpu, u64 *t = ULLONG_MAX; uv_rtc_find_next_timer(head, pnode); spin_unlock_irqrestore(&head->lock, flags); - return 1; + return -ETIME; } } @@ -244,7 +259,7 @@ static int uv_rtc_set_timer(int cpu, u64 * * Returns 1 if this timer was pending. */ -static int uv_rtc_unset_timer(int cpu) +static int uv_rtc_unset_timer(int cpu, int force) { int pnode = uv_cpu_to_pnode(cpu); int bid = uv_cpu_to_blade_id(cpu); @@ -256,14 +271,15 @@ static int uv_rtc_unset_timer(int cpu) spin_lock_irqsave(&head->lock, flags); - if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) + if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force) rc = 1; - *t = ULLONG_MAX; - - /* Was the hardware setup for this timer? */ - if (head->next_cpu == bcpu) - uv_rtc_find_next_timer(head, pnode); + if (rc) { + *t = ULLONG_MAX; + /* Was the hardware setup for this timer? */ + if (head->next_cpu == bcpu) + uv_rtc_find_next_timer(head, pnode); + } spin_unlock_irqrestore(&head->lock, flags); @@ -277,10 +293,21 @@ static int uv_rtc_unset_timer(int cpu) /* * Read the RTC. + * + * Starting with HUB rev 2.0, the UV RTC register is replicated across all + * cachelines of it's own page. This allows faster simultaneous reads + * from a given socket. */ static cycle_t uv_read_rtc(struct clocksource *cs) { - return (cycle_t)uv_read_local_mmr(UVH_RTC); + unsigned long offset; + + if (uv_get_min_hub_revision_id() == 1) + offset = 0; + else + offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE; + + return (cycle_t)uv_read_local_mmr(UVH_RTC | offset); } /* @@ -310,32 +337,32 @@ static void uv_rtc_timer_setup(enum cloc break; case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: - uv_rtc_unset_timer(ced_cpu); + uv_rtc_unset_timer(ced_cpu, 1); break; } } static void uv_rtc_interrupt(void) { - struct clock_event_device *ced = &__get_cpu_var(cpu_ced); int cpu = smp_processor_id(); + struct clock_event_device *ced = &per_cpu(cpu_ced, cpu); if (!ced || !ced->event_handler) return; - if (uv_rtc_unset_timer(cpu) != 1) + if (uv_rtc_unset_timer(cpu, 0) != 1) return; ced->event_handler(ced); } -static int __init uv_enable_rtc(char *str) +static int __init uv_enable_evt_rtc(char *str) { - uv_rtc_enable = 1; + uv_rtc_evt_enable = 1; return 1; } -__setup("uvrtc", uv_enable_rtc); +__setup("uvrtcevt", uv_enable_evt_rtc); static __init void uv_rtc_register_clockevents(struct work_struct *dummy) { @@ -350,27 +377,28 @@ static __init int uv_rtc_setup_clock(voi { int rc; - if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) + if (!is_uv_system()) return -ENODEV; - generic_interrupt_extension = uv_rtc_interrupt; - clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, clocksource_uv.shift); rc = clocksource_register(&clocksource_uv); - if (rc) { - generic_interrupt_extension = NULL; + if (rc) + printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); + else + printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n", + sn_rtc_cycles_per_second/(unsigned long)1E6); + + if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback) return rc; - } /* Setup and register clockevents */ rc = uv_rtc_allocate_timers(); - if (rc) { - clocksource_unregister(&clocksource_uv); - generic_interrupt_extension = NULL; - return rc; - } + if (rc) + goto error; + + x86_platform_ipi_callback = uv_rtc_interrupt; clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, NSEC_PER_SEC, clock_event_device_uv.shift); @@ -383,11 +411,19 @@ static __init int uv_rtc_setup_clock(voi rc = schedule_on_each_cpu(uv_rtc_register_clockevents); if (rc) { - clocksource_unregister(&clocksource_uv); - generic_interrupt_extension = NULL; + x86_platform_ipi_callback = NULL; uv_rtc_deallocate_timers(); + goto error; } + printk(KERN_INFO "UV RTC clockevents registered\n"); + + return 0; + +error: + clocksource_unregister(&clocksource_uv); + printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc); + return rc; } arch_initcall(uv_rtc_setup_clock); diff -uprN linux-2.6.32/arch/x86/kernel/visws_quirks.c linux-2.6.32.ovz/arch/x86/kernel/visws_quirks.c --- linux-2.6.32/arch/x86/kernel/visws_quirks.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/visws_quirks.c 2015-03-10 13:24:12.000000000 -0700 @@ -508,7 +508,7 @@ static struct irq_chip cobalt_irq_type = */ static unsigned int startup_piix4_master_irq(unsigned int irq) { - init_8259A(0); + legacy_pic->init(0); return startup_cobalt_irq(irq); } @@ -609,7 +609,7 @@ static irqreturn_t piix4_master_intr(int handle_IRQ_event(realirq, desc->action); if (!(desc->status & IRQ_DISABLED)) - enable_8259A_irq(realirq); + legacy_pic->chip->unmask(realirq); return IRQ_HANDLED; @@ -628,6 +628,12 @@ static struct irqaction cascade_action = .name = "cascade", }; +static inline void set_piix4_virtual_irq_type(void) +{ + piix4_virtual_irq_type.shutdown = i8259A_chip.mask; + piix4_virtual_irq_type.enable = i8259A_chip.unmask; + piix4_virtual_irq_type.disable = i8259A_chip.mask; +} void init_VISWS_APIC_irqs(void) { @@ -653,6 +659,7 @@ void init_VISWS_APIC_irqs(void) desc->chip = &piix4_master_irq_type; } else if (i < CO_IRQ_APIC0) { + set_piix4_virtual_irq_type(); desc->chip = &piix4_virtual_irq_type; } else if (IS_CO_APIC(i)) { diff -uprN linux-2.6.32/arch/x86/kernel/vm86_32.c linux-2.6.32.ovz/arch/x86/kernel/vm86_32.c --- linux-2.6.32/arch/x86/kernel/vm86_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/vm86_32.c 2015-03-10 13:23:10.000000000 -0700 @@ -172,6 +172,7 @@ static void mark_screen_rdonly(struct mm spinlock_t *ptl; int i; + down_write(&mm->mmap_sem); pgd = pgd_offset(mm, 0xA0000); if (pgd_none_or_clear_bad(pgd)) goto out; @@ -179,6 +180,7 @@ static void mark_screen_rdonly(struct mm if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); + split_huge_page_pmd(mm, pmd); if (pmd_none_or_clear_bad(pmd)) goto out; pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); @@ -189,6 +191,7 @@ static void mark_screen_rdonly(struct mm } pte_unmap_unlock(pte, ptl); out: + up_write(&mm->mmap_sem); flush_tlb(); } @@ -335,9 +338,9 @@ static void do_sys_vm86(struct kernel_vm if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - /*call audit_syscall_exit since we do not exit via the normal paths */ + /*call __audit_syscall_exit since we do not exit via the normal paths */ if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(0), 0); + __audit_syscall_exit(1, 0); __asm__ __volatile__( "movl %0,%%esp\n\t" diff -uprN linux-2.6.32/arch/x86/kernel/vmi_32.c linux-2.6.32.ovz/arch/x86/kernel/vmi_32.c --- linux-2.6.32/arch/x86/kernel/vmi_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/vmi_32.c 2015-03-10 13:21:16.000000000 -0700 @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -266,30 +267,6 @@ static void vmi_nop(void) { } -#ifdef CONFIG_HIGHPTE -static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) -{ - void *va = kmap_atomic(page, type); - - /* - * Internally, the VMI ROM must map virtual addresses to physical - * addresses for processing MMU updates. By the time MMU updates - * are issued, this information is typically already lost. - * Fortunately, the VMI provides a cache of mapping slots for active - * page tables. - * - * We use slot zero for the linear mapping of physical memory, and - * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1. - * - * args: SLOT VA COUNT PFN - */ - BUG_ON(type != KM_PTE0 && type != KM_PTE1); - vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page)); - - return va; -} -#endif - static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) { vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); @@ -640,6 +617,12 @@ static inline int __init activate_vmi(vo u64 reloc; const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; + /* + * Prevent page tables from being allocated in highmem, even if + * CONFIG_HIGHPTE is enabled. + */ + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + if (call_vrom_func(vmi_rom, vmi_init) != 0) { printk(KERN_ERR "VMI ROM failed to initialize!"); return 0; @@ -778,10 +761,6 @@ static inline int __init activate_vmi(vo /* Set linear is needed in all cases */ vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); -#ifdef CONFIG_HIGHPTE - if (vmi_ops.set_linear_mapping) - pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte; -#endif /* * These MUST always be patched. Don't support indirect jumps diff -uprN linux-2.6.32/arch/x86/kernel/vmlinux.lds.S linux-2.6.32.ovz/arch/x86/kernel/vmlinux.lds.S --- linux-2.6.32/arch/x86/kernel/vmlinux.lds.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/vmlinux.lds.S 2015-06-23 04:16:16.000000000 -0700 @@ -41,6 +41,32 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) +/* + * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA + * we retain large page mappings for boundaries spanning kernel text, rodata + * and data sections. + * + * However, kernel identity mappings will have different RWX permissions + * to the pages mapping to text and to the pages padding (which are freed) the + * text section. Hence kernel identity mappings will be broken to smaller + * pages. For 64-bit, kernel text and kernel identity mappings are different, + * so we can enable protection checks that come with CONFIG_DEBUG_RODATA, + * as well as retain 2MB large page mappings for kernel text. + */ +#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE); + +#define X64_ALIGN_DEBUG_RODATA_END \ + . = ALIGN(HPAGE_SIZE); \ + __end_rodata_hpage_align = .; + +#else + +#define X64_ALIGN_DEBUG_RODATA_BEGIN +#define X64_ALIGN_DEBUG_RODATA_END + +#endif + PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ @@ -90,7 +116,9 @@ SECTIONS EXCEPTION_TABLE(16) :text = 0x9090 + X64_ALIGN_DEBUG_RODATA_BEGIN RO_DATA(PAGE_SIZE) + X64_ALIGN_DEBUG_RODATA_END /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { @@ -172,6 +200,11 @@ SECTIONS } jiffies = VVIRT(.jiffies); + .fence_wdog_jiffies64 : AT(VLOAD(.fence_wdog_jiffies64)) { + *(.fence_wdog_jiffies64) + } + fence_wdog_jiffies64 = VVIRT(.fence_wdog_jiffies64); + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } diff -uprN linux-2.6.32/arch/x86/kernel/vsyscall_64.c linux-2.6.32.ovz/arch/x86/kernel/vsyscall_64.c --- linux-2.6.32/arch/x86/kernel/vsyscall_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/vsyscall_64.c 2015-06-23 04:16:16.000000000 -0700 @@ -61,6 +61,7 @@ struct vsyscall_gtod_data __vsyscall_gto { .lock = SEQLOCK_UNLOCKED, .sysctl_enabled = 1, + .gettime_monotonic_enabled = 0, }; void update_vsyscall_tz(void) @@ -73,7 +74,8 @@ void update_vsyscall_tz(void) write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) +void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult) { unsigned long flags; @@ -82,11 +84,11 @@ void update_vsyscall(struct timespec *wa vsyscall_gtod_data.clock.vread = clock->vread; vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; vsyscall_gtod_data.clock.mask = clock->mask; - vsyscall_gtod_data.clock.mult = clock->mult; + vsyscall_gtod_data.clock.mult = mult; vsyscall_gtod_data.clock.shift = clock->shift; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_to_monotonic = *wtm; vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } @@ -233,6 +235,10 @@ static ctl_table kernel_table2[] = { .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, + { .procname = "vsyscall64_gettime_monotonic", + .data = &vsyscall_gtod_data.gettime_monotonic_enabled, .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec }, {} }; diff -uprN linux-2.6.32/arch/x86/kernel/x8664_ksyms_64.c linux-2.6.32.ovz/arch/x86/kernel/x8664_ksyms_64.c --- linux-2.6.32/arch/x86/kernel/x8664_ksyms_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/x8664_ksyms_64.c 2015-03-10 13:22:35.000000000 -0700 @@ -28,10 +28,32 @@ EXPORT_SYMBOL(__put_user_2); EXPORT_SYMBOL(__put_user_4); EXPORT_SYMBOL(__put_user_8); +__must_check unsigned long +copy_user_generic(void *to, const void *from, unsigned len) +{ + unsigned ret; + + alternative_call(copy_user_generic_unrolled, + copy_user_generic_string, + X86_FEATURE_REP_GOOD, + ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), + "=d" (len)), + "1" (to), "2" (from), "3" (len) + : "memory", "rcx", "r8", "r9", "r10", "r11"); + return ret; +} EXPORT_SYMBOL(copy_user_generic); +EXPORT_SYMBOL(copy_user_generic_string); +EXPORT_SYMBOL(copy_user_generic_unrolled); EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(copy_from_user); EXPORT_SYMBOL(copy_to_user); + +__must_check long __copy_from_user_inatomic(void *dst, const void __user *src, + unsigned size) +{ + return copy_user_generic(dst, (__force const void *)src, size); +} EXPORT_SYMBOL(__copy_from_user_inatomic); EXPORT_SYMBOL(copy_page); @@ -54,6 +76,7 @@ extern void *__memcpy(void *, const void EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); +EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(init_level4_pgt); diff -uprN linux-2.6.32/arch/x86/kernel/x86_init.c linux-2.6.32.ovz/arch/x86/kernel/x86_init.c --- linux-2.6.32/arch/x86/kernel/x86_init.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/x86_init.c 2015-03-10 13:23:09.000000000 -0700 @@ -13,6 +13,7 @@ #include #include #include +#include #include void __cpuinit x86_init_noop(void) { } @@ -26,7 +27,7 @@ void __init x86_init_pgd_noop(pgd_t *unu struct x86_init_ops x86_init __initdata = { .resources = { - .probe_roms = x86_init_noop, + .probe_roms = probe_roms, .reserve_resources = reserve_standard_io_resources, .memory_setup = default_machine_specific_memory_setup, }, @@ -65,11 +66,18 @@ struct x86_init_ops x86_init __initdata }; struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { + .early_percpu_clock_init = x86_init_noop, .setup_percpu_clockev = setup_secondary_APIC_clock, }; +static void default_nmi_init(void) { }; + struct x86_platform_ops x86_platform = { .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, + .is_untracked_pat_range = is_ISA_range, + .nmi_init = default_nmi_init, + .save_sched_clock_state = tsc_save_sched_clock_state, + .restore_sched_clock_state = tsc_restore_sched_clock_state, }; diff -uprN linux-2.6.32/arch/x86/kernel/xsave.c linux-2.6.32.ovz/arch/x86/kernel/xsave.c --- linux-2.6.32/arch/x86/kernel/xsave.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kernel/xsave.c 2015-03-10 13:23:53.000000000 -0700 @@ -21,6 +21,76 @@ struct _fpx_sw_bytes fx_sw_reserved; struct _fpx_sw_bytes fx_sw_reserved_ia32; #endif +static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; + +/* + * If a processor implementation discern that a processor state component is + * in its initialized state it may modify the corresponding bit in the + * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory + * layout in the case of xsaveopt. While presenting the xstate information to + * the user, we always ensure that the memory layout of a feature will be in + * the init state if the corresponding header bit is zero. This is to ensure + * that the user doesn't see some stale state in the memory layout during + * signal handling, debugging etc. + */ +void __sanitize_i387_state(struct task_struct *tsk) +{ + u64 xstate_bv; + int feature_bit = 0x2; + struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; + + if (!fx) + return; + + xstate_bv = tsk->thread.xstate->xsave.xsave_hdr.xstate_bv; + + /* + * None of the feature bits are in init state. So nothing else + * to do for us, as the memory layout is upto date. + */ + if ((xstate_bv & pcntxt_mask) == pcntxt_mask) + return; + + /* + * FP is in init state + */ + if (!(xstate_bv & XSTATE_FP)) { + fx->cwd = 0x37f; + fx->swd = 0; + fx->twd = 0; + fx->fop = 0; + fx->rip = 0; + fx->rdp = 0; + memset(&fx->st_space[0], 0, 128); + } + + /* + * SSE is in init state + */ + if (!(xstate_bv & XSTATE_SSE)) + memset(&fx->xmm_space[0], 0, 256); + + xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2; + + /* + * Update all the other memory layouts for which the corresponding + * header bit is in the init state. + */ + while (xstate_bv) { + if (xstate_bv & 0x1) { + int offset = xstate_offsets[feature_bit]; + int size = xstate_sizes[feature_bit]; + + memcpy(((void *) fx) + offset, + ((void *) init_xstate_buf) + offset, + size); + } + + xstate_bv >>= 1; + feature_bit++; + } +} + /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. @@ -99,7 +169,7 @@ int save_i387_xstate(void __user *buf) if (err) return err; - if (task_thread_info(tsk)->status & TS_XSAVE) + if (use_xsave()) err = xsave_user(buf); else err = fxsave_user(buf); @@ -116,7 +186,7 @@ int save_i387_xstate(void __user *buf) clear_used_math(); /* trigger finit */ - if (task_thread_info(tsk)->status & TS_XSAVE) { + if (use_xsave()) { struct _fpstate __user *fx = buf; struct _xstate __user *x = buf; u64 xstate_bv; @@ -225,7 +295,7 @@ int restore_i387_xstate(void __user *buf clts(); task_thread_info(current)->status |= TS_USEDFPU; } - if (task_thread_info(tsk)->status & TS_XSAVE) + if (use_xsave()) err = restore_user_xstate(buf); else err = fxrstor_checking((__force struct i387_fxsave_struct *) @@ -301,12 +371,55 @@ void __cpuinit xsave_init(void) } /* + * Record the offsets and sizes of different state managed by the xsave + * memory layout. + */ +static void setup_xstate_features(void) +{ + int eax, ebx, ecx, edx, leaf = 0x2; + + xstate_features = fls64(pcntxt_mask); + xstate_offsets = alloc_bootmem(xstate_features * sizeof(int)); + xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); + + do { + cpuid_count(0xd, leaf, &eax, &ebx, &ecx, &edx); + + if (eax == 0) + break; + + xstate_offsets[leaf] = ebx; + xstate_sizes[leaf] = eax; + + leaf++; + } while (1); +} + +/* * setup the xstate image representing the init state */ static void __init setup_xstate_init(void) { + setup_xstate_features(); + + /* + * Setup init_xstate_buf to represent the init state of + * all the features managed by the xsave + */ init_xstate_buf = alloc_bootmem(xstate_size); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; + + clts(); + /* + * Init all the features state with header_bv being 0x0 + */ + xrstor_state(init_xstate_buf, -1); + /* + * Dump the init state again. This is to identify the init state + * of any feature which is not represented by all zero's. + */ + xsave_state(init_xstate_buf, -1); + stts(); } /* @@ -337,6 +450,7 @@ void __ref xsave_cntxt_init(void) cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); xstate_size = ebx; + update_regset_xstate_info(xstate_size, pcntxt_mask); prepare_fx_sw_frame(); setup_xstate_init(); diff -uprN linux-2.6.32/arch/x86/kvm/emulate.c linux-2.6.32.ovz/arch/x86/kvm/emulate.c --- linux-2.6.32/arch/x86/kvm/emulate.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/emulate.c 2015-03-10 13:24:05.000000000 -0700 @@ -75,46 +75,57 @@ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ +/* Misc flags */ +#define Lock (1<<26) /* lock prefix is allowed for the instruction */ +#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ +#define No64 (1<<28) /* Source 2 operand type */ #define Src2None (0<<29) #define Src2CL (1<<29) #define Src2ImmByte (2<<29) #define Src2One (3<<29) #define Src2Imm16 (4<<29) +#define Src2Mem16 (5<<29) /* Used for Ep encoding. First argument has to be + in memory and second argument is located + immediately after the first one in memory. */ #define Src2Mask (7<<29) enum { Group1_80, Group1_81, Group1_82, Group1_83, Group1A, Group3_Byte, Group3, Group4, Group5, Group7, + Group8, Group9, }; static u32 opcode_table[256] = { /* 0x00 - 0x07 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x08 - 0x0F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + 0, 0, ImplicitOps | Stack | No64, 0, /* 0x10 - 0x17 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x18 - 0x1F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, /* 0x20 - 0x27 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x28 - 0x2F */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 0, 0, 0, 0, /* 0x30 - 0x37 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 0, 0, 0, 0, /* 0x38 - 0x3F */ @@ -149,7 +160,7 @@ static u32 opcode_table[256] = { Group | Group1_80, Group | Group1_81, Group | Group1_82, Group | Group1_83, ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -158,7 +169,7 @@ static u32 opcode_table[256] = { /* 0x90 - 0x97 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x98 - 0x9F */ - 0, 0, SrcImm | Src2Imm16, 0, + 0, 0, SrcImm | Src2Imm16 | No64, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, /* 0xA0 - 0xA7 */ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, @@ -185,7 +196,7 @@ static u32 opcode_table[256] = { ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, /* 0xC8 - 0xCF */ 0, 0, 0, ImplicitOps | Stack, - ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, + ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps, /* 0xD0 - 0xD7 */ ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, @@ -198,12 +209,12 @@ static u32 opcode_table[256] = { ByteOp | SrcImmUByte, SrcImmUByte, /* 0xE8 - 0xEF */ SrcImm | Stack, SrcImm | ImplicitOps, - SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, + SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, - ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, + ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, @@ -211,16 +222,20 @@ static u32 opcode_table[256] = { static u32 twobyte_table[256] = { /* 0x00 - 0x0F */ - 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, + 0, Group | GroupDual | Group7, 0, 0, + 0, ImplicitOps, ImplicitOps | Priv, 0, + ImplicitOps | Priv, ImplicitOps | Priv, 0, 0, + 0, ImplicitOps | ModRM, 0, 0, /* 0x10 - 0x1F */ 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, /* 0x20 - 0x2F */ - ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, + ModRM | ImplicitOps | Priv, ModRM | Priv, + ModRM | ImplicitOps | Priv, ModRM | Priv, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x30 - 0x3F */ - ImplicitOps, 0, ImplicitOps, 0, - ImplicitOps, ImplicitOps, 0, 0, + ImplicitOps | Priv, 0, ImplicitOps | Priv, 0, + ImplicitOps, ImplicitOps | Priv, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 - 0x47 */ DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, @@ -244,25 +259,29 @@ static u32 twobyte_table[256] = { /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xA0 - 0xA7 */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, + ImplicitOps | Stack, ImplicitOps | Stack, + 0, DstMem | SrcReg | ModRM | BitOp, DstMem | SrcReg | Src2ImmByte | ModRM, DstMem | SrcReg | Src2CL | ModRM, 0, 0, /* 0xA8 - 0xAF */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, + ImplicitOps | Stack, ImplicitOps | Stack, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, DstMem | SrcReg | Src2ImmByte | ModRM, DstMem | SrcReg | Src2CL | ModRM, ModRM, 0, /* 0xB0 - 0xB7 */ - ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, - DstMem | SrcReg | ModRM | BitOp, + ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, + 0, DstMem | SrcReg | ModRM | BitOp | Lock, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xB8 - 0xBF */ - 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, + 0, 0, + Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock, 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem16 | ModRM | Mov, /* 0xC0 - 0xCF */ - 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, + 0, 0, 0, DstMem | SrcReg | ModRM | Mov, + 0, 0, 0, Group | GroupDual | Group9, 0, 0, 0, 0, 0, 0, 0, 0, /* 0xD0 - 0xDF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -274,61 +293,93 @@ static u32 twobyte_table[256] = { static u32 group_table[] = { [Group1_80*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM | Lock, + ByteOp | DstMem | SrcImm | ModRM, [Group1_81*8] = - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, - DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM | Lock, + DstMem | SrcImm | ModRM, [Group1_82*8] = - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, - ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64 | Lock, + ByteOp | DstMem | SrcImm | ModRM | No64, [Group1_83*8] = - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, - DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM, [Group1A*8] = DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, [Group3_Byte*8] = ByteOp | SrcImm | DstMem | ModRM, 0, - ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, 0, 0, 0, 0, [Group3*8] = DstMem | SrcImm | ModRM, 0, - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, 0, 0, 0, 0, [Group4*8] = - ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock, 0, 0, 0, 0, 0, 0, [Group5*8] = - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock, + SrcMem | ModRM | Stack, 0, + SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps, SrcMem | ModRM | Stack, 0, - SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, [Group7*8] = - 0, 0, ModRM | SrcMem, ModRM | SrcMem, + 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv, SrcNone | ModRM | DstMem | Mov, 0, - SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, + SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv, + [Group8*8] = + 0, 0, 0, 0, + DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock, + DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock, + [Group9*8] = + 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0, }; static u32 group2_table[] = { [Group7*8] = - SrcNone | ModRM, 0, 0, SrcNone | ModRM, + SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM, SrcNone | ModRM | DstMem | Mov, 0, SrcMem16 | ModRM | Mov, 0, + [Group9*8] = + 0, 0, 0, 0, 0, 0, 0, 0, }; /* EFLAGS bit definitions. */ +#define EFLG_ID (1<<21) +#define EFLG_VIP (1<<20) +#define EFLG_VIF (1<<19) +#define EFLG_AC (1<<18) #define EFLG_VM (1<<17) #define EFLG_RF (1<<16) +#define EFLG_IOPL (3<<12) +#define EFLG_NT (1<<14) #define EFLG_OF (1<<11) #define EFLG_DF (1<<10) #define EFLG_IF (1<<9) +#define EFLG_TF (1<<8) #define EFLG_SF (1<<7) #define EFLG_ZF (1<<6) #define EFLG_AF (1<<4) @@ -597,29 +648,32 @@ static int do_fetch_insn_byte(struct x86 if (linear < fc->start || linear >= fc->end) { size = min(15UL, PAGE_SIZE - offset_in_page(linear)); - rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); - if (rc) + rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL); + if (rc != X86EMUL_CONTINUE) return rc; fc->start = linear; fc->end = linear + size; } *dest = fc->data[linear - fc->start]; - return 0; + return X86EMUL_CONTINUE; } static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long eip, void *dest, unsigned size) { - int rc = 0; + int rc; + /* x86 instructions are limited to 15 bytes. */ + if (eip + size - ctxt->decode.eip_orig > 15) + return X86EMUL_UNHANDLEABLE; eip += ctxt->cs_base; while (size--) { rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); - if (rc) + if (rc != X86EMUL_CONTINUE) return rc; } - return 0; + return X86EMUL_CONTINUE; } /* @@ -649,11 +703,11 @@ static int read_descriptor(struct x86_em op_bytes = 3; *address = 0; rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, - ctxt->vcpu); - if (rc) + ctxt->vcpu, NULL); + if (rc != X86EMUL_CONTINUE) return rc; rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, - ctxt->vcpu); + ctxt->vcpu, NULL); return rc; } @@ -730,7 +784,7 @@ static int decode_modrm(struct x86_emula struct decode_cache *c = &ctxt->decode; u8 sib; int index_reg = 0, base_reg = 0, scale; - int rc = 0; + int rc = X86EMUL_CONTINUE; if (c->rex_prefix) { c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ @@ -843,7 +897,7 @@ static int decode_abs(struct x86_emulate struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - int rc = 0; + int rc = X86EMUL_CONTINUE; switch (c->ad_bytes) { case 2: @@ -861,22 +915,30 @@ done: } int -x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) +x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, + void *insn, int insn_len) { struct decode_cache *c = &ctxt->decode; - int rc = 0; + int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; int def_op_bytes, def_ad_bytes, group; /* Shadow copy of register state. Committed on successful emulation. */ memset(c, 0, sizeof(struct decode_cache)); - c->eip = kvm_rip_read(ctxt->vcpu); + c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu); ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); + if (insn_len > 0) { + c->fetch.start = c->eip; + c->fetch.end = c->fetch.start + insn_len; + memcpy(c->fetch.data, insn, insn_len); + } + switch (mode) { case X86EMUL_MODE_REAL: + case X86EMUL_MODE_VM86: case X86EMUL_MODE_PROT16: def_op_bytes = def_ad_bytes = 2; break; @@ -988,7 +1050,7 @@ done_prefixes: rc = decode_modrm(ctxt, ops); else if (c->d & MemAbs) rc = decode_abs(ctxt, ops); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; if (!c->has_seg_override) @@ -1110,6 +1172,10 @@ done_prefixes: c->src2.bytes = 1; c->src2.val = 1; break; + case Src2Mem16: + c->src2.bytes = 2; + c->src2.type = OP_MEM; + break; } /* Decode and fetch the destination operand: register or memory. */ @@ -1186,16 +1252,81 @@ static int emulate_pop(struct x86_emulat return rc; } -static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int emulate_popf(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + void *dest, int len) +{ + int rc; + unsigned long val, change_mask; + int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu); + + rc = emulate_pop(ctxt, ops, &val, len); + if (rc != X86EMUL_CONTINUE) + return rc; + + change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF + | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; + + switch(ctxt->mode) { + case X86EMUL_MODE_PROT64: + case X86EMUL_MODE_PROT32: + case X86EMUL_MODE_PROT16: + if (cpl == 0) + change_mask |= EFLG_IOPL; + if (cpl <= iopl) + change_mask |= EFLG_IF; + break; + case X86EMUL_MODE_VM86: + if (iopl < 3) { + kvm_inject_gp(ctxt->vcpu, 0); + return X86EMUL_PROPAGATE_FAULT; + } + change_mask |= EFLG_IF; + break; + default: /* real mode */ + change_mask |= (EFLG_IOPL | EFLG_IF); + break; + } + + *(unsigned long *)dest = + (ctxt->eflags & ~change_mask) | (val & change_mask); + + return rc; +} + +static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) +{ + struct decode_cache *c = &ctxt->decode; + struct kvm_segment segment; + + kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg); + + c->src.val = segment.selector; + emulate_push(ctxt); +} + +static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) { struct decode_cache *c = &ctxt->decode; + unsigned long selector; int rc; - rc = emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); - if (rc != 0) + rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); + if (rc != X86EMUL_CONTINUE) return rc; - return 0; + + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg); + return rc; +} + +static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + struct decode_cache *c = &ctxt->decode; + + return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); } static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) @@ -1231,11 +1362,13 @@ static inline int emulate_grp3(struct x8 struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; - int rc = 0; + int rc = X86EMUL_CONTINUE; switch (c->modrm_reg) { case 0 ... 1: /* test */ emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); + /* Disable writeback. */ + c->dst.type = OP_NONE; break; case 2: /* not */ c->dst.val = ~c->dst.val; @@ -1278,7 +1411,7 @@ static inline int emulate_grp45(struct x emulate_push(ctxt); break; } - return 0; + return X86EMUL_CONTINUE; } static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, @@ -1309,7 +1442,7 @@ static inline int emulate_grp9(struct x8 return rc; ctxt->eflags |= EFLG_ZF; } - return 0; + return X86EMUL_CONTINUE; } static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, @@ -1320,14 +1453,14 @@ static int emulate_ret_far(struct x86_em unsigned long cs; rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); - if (rc) + if (rc != X86EMUL_CONTINUE) return rc; if (c->op_bytes == 4) c->eip = (u32)c->eip; rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); - if (rc) + if (rc != X86EMUL_CONTINUE) return rc; - rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); + rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS); return rc; } @@ -1380,7 +1513,7 @@ static inline int writeback(struct x86_e default: break; } - return 0; + return X86EMUL_CONTINUE; } static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask) @@ -1426,20 +1559,77 @@ setup_syscalls_segments(struct x86_emula ss->present = 1; } +static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops) +{ + u32 eax, ebx, ecx, edx; + + /* + * syscall should always be enabled in longmode - so only become + * vendor specific (cpuid) if other modes are active... + */ + if (ctxt->mode == X86EMUL_MODE_PROT64) + return true; + + eax = 0x00000000; + ecx = 0x00000000; + if (ops->get_cpuid(ctxt->vcpu, &eax, &ebx, &ecx, &edx)) { + /* + * Intel ("GenuineIntel") + * remark: Intel CPUs only support "syscall" in 64bit + * longmode. Also an 64bit guest with a + * 32bit compat-app running will #UD !! While this + * behaviour can be fixed (by emulating) into AMD + * response - CPUs of AMD can't behave like Intel. + */ + if (ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx && + ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx && + edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx) + return false; + + /* AMD ("AuthenticAMD") */ + if (ebx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ebx && + ecx == X86EMUL_CPUID_VENDOR_AuthenticAMD_ecx && + edx == X86EMUL_CPUID_VENDOR_AuthenticAMD_edx) + return true; + + /* AMD ("AMDisbetter!") */ + if (ebx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ebx && + ecx == X86EMUL_CPUID_VENDOR_AMDisbetterI_ecx && + edx == X86EMUL_CPUID_VENDOR_AMDisbetterI_edx) + return true; + } + + /* default: (not Intel, not AMD), apply Intel's stricter rules... */ + return false; +} + static int -emulate_syscall(struct x86_emulate_ctxt *ctxt) +emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) { struct decode_cache *c = &ctxt->decode; struct kvm_segment cs, ss; u64 msr_data; + u64 efer; /* syscall is not available in real mode */ - if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) - return -1; + if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) + return X86EMUL_UNHANDLEABLE; + + if (!(em_syscall_is_enabled(ctxt, ops))) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + return X86EMUL_PROPAGATE_FAULT; + } setup_syscalls_segments(ctxt, &cs, &ss); + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_EFER, &efer); + + if (!(efer & EFER_SCE)) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + return X86EMUL_PROPAGATE_FAULT; + } + kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); msr_data >>= 32; cs.selector = (u16)(msr_data & 0xfffc); @@ -1473,7 +1663,7 @@ emulate_syscall(struct x86_emulate_ctxt ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); } - return 0; + return X86EMUL_CONTINUE; } static int @@ -1483,22 +1673,17 @@ emulate_sysenter(struct x86_emulate_ctxt struct kvm_segment cs, ss; u64 msr_data; - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL || - !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { + /* inject #GP if in real mode */ + if (ctxt->mode == X86EMUL_MODE_REAL) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_UNHANDLEABLE; } /* XXX sysenter/sysexit have not been tested in 64bit mode. * Therefore, we inject an #UD. */ if (ctxt->mode == X86EMUL_MODE_PROT64) - return -1; + return X86EMUL_UNHANDLEABLE; setup_syscalls_segments(ctxt, &cs, &ss); @@ -1507,13 +1692,13 @@ emulate_sysenter(struct x86_emulate_ctxt case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } break; case X86EMUL_MODE_PROT64: if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } break; } @@ -1538,7 +1723,7 @@ emulate_sysenter(struct x86_emulate_ctxt kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; - return 0; + return X86EMUL_CONTINUE; } static int @@ -1549,21 +1734,11 @@ emulate_sysexit(struct x86_emulate_ctxt u64 msr_data; int usermode; - /* inject #UD if LOCK prefix is used */ - if (c->lock_prefix) - return -1; - - /* inject #GP if in real mode or paging is disabled */ - if (ctxt->mode == X86EMUL_MODE_REAL - || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) { - kvm_inject_gp(ctxt->vcpu, 0); - return -1; - } - - /* sysexit must be called from CPL 0 */ - if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) { + /* inject #GP if in real mode or Virtual 8086 mode */ + if (ctxt->mode == X86EMUL_MODE_REAL || + ctxt->mode == X86EMUL_MODE_VM86) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_UNHANDLEABLE; } setup_syscalls_segments(ctxt, &cs, &ss); @@ -1581,7 +1756,7 @@ emulate_sysexit(struct x86_emulate_ctxt cs.selector = (u16)(msr_data + 16); if ((msr_data & 0xfffc) == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } ss.selector = (u16)(msr_data + 24); break; @@ -1589,7 +1764,7 @@ emulate_sysexit(struct x86_emulate_ctxt cs.selector = (u16)(msr_data + 32); if (msr_data == 0x0) { kvm_inject_gp(ctxt->vcpu, 0); - return -1; + return X86EMUL_PROPAGATE_FAULT; } ss.selector = cs.selector + 8; cs.db = 0; @@ -1605,7 +1780,69 @@ emulate_sysexit(struct x86_emulate_ctxt c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; - return 0; + return X86EMUL_CONTINUE; +} + +static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt) +{ + int iopl; + if (ctxt->mode == X86EMUL_MODE_REAL) + return false; + if (ctxt->mode == X86EMUL_MODE_VM86) + return true; + iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl; +} + +static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + struct kvm_segment tr_seg; + int r; + u16 io_bitmap_ptr; + u8 perm, bit_idx = port & 0x7; + unsigned mask = (1 << len) - 1; + + kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR); + if (tr_seg.unusable) + return false; + if (tr_seg.limit < 103) + return false; + r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, + NULL); + if (r != X86EMUL_CONTINUE) + return false; + if (io_bitmap_ptr + port/8 > tr_seg.limit) + return false; + r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1, + ctxt->vcpu, NULL); + if (r != X86EMUL_CONTINUE) + return false; + if ((perm >> bit_idx) & mask) + return false; + return true; +} + +static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, + u16 port, u16 len) +{ + if (emulator_bad_iopl(ctxt)) + if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) + return false; + return true; +} + +static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 data) +{ + struct msr_data msr; + + msr.data = data; + msr.index = msr_index; + msr.host_initiated = false; + return kvm_set_msr(ctxt->vcpu, &msr); } int @@ -1617,7 +1854,7 @@ x86_emulate_insn(struct x86_emulate_ctxt struct decode_cache *c = &ctxt->decode; unsigned int port; int io_dir_in; - int rc = 0; + int rc = X86EMUL_CONTINUE; ctxt->interruptibility = 0; @@ -1629,6 +1866,23 @@ x86_emulate_insn(struct x86_emulate_ctxt memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); saved_eip = c->eip; + if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + + /* LOCK prefix is allowed only with some instructions */ + if (c->lock_prefix && !(c->d & Lock)) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; + } + + /* Privileged instruction can be executed only in CPL=0 */ + if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) memop = c->modrm_ea; @@ -1674,6 +1928,17 @@ x86_emulate_insn(struct x86_emulate_ctxt c->src.orig_val = c->src.val; } + if (c->src2.type == OP_MEM) { + c->src2.ptr = (unsigned long *)(memop + c->src.bytes); + c->src2.val = 0; + rc = ops->read_emulated((unsigned long)c->src2.ptr, + &c->src2.val, + c->src2.bytes, + ctxt->vcpu); + if (rc != X86EMUL_CONTINUE) + goto done; + } + if ((c->d & DstMask) == ImplicitOps) goto special_insn; @@ -1707,18 +1972,45 @@ special_insn: add: /* add */ emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); break; + case 0x06: /* push es */ + emulate_push_sreg(ctxt, VCPU_SREG_ES); + break; + case 0x07: /* pop es */ + rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0x08 ... 0x0d: or: /* or */ emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); break; + case 0x0e: /* push cs */ + emulate_push_sreg(ctxt, VCPU_SREG_CS); + break; case 0x10 ... 0x15: adc: /* adc */ emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); break; + case 0x16: /* push ss */ + emulate_push_sreg(ctxt, VCPU_SREG_SS); + break; + case 0x17: /* pop ss */ + rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0x18 ... 0x1d: sbb: /* sbb */ emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); break; + case 0x1e: /* push ds */ + emulate_push_sreg(ctxt, VCPU_SREG_DS); + break; + case 0x1f: /* pop ds */ + rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0x20 ... 0x25: and: /* and */ emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); @@ -1747,7 +2039,7 @@ special_insn: case 0x58 ... 0x5f: /* pop reg */ pop_instruction: rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0x63: /* movsxd */ @@ -1761,7 +2053,12 @@ special_insn: break; case 0x6c: /* insb */ case 0x6d: /* insw/insd */ - if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio_string(ctxt->vcpu, 1, (c->d & ByteOp) ? 1 : c->op_bytes, c->rep_prefix ? @@ -1777,7 +2074,12 @@ special_insn: return 0; case 0x6e: /* outsb */ case 0x6f: /* outsw/outsd */ - if (kvm_emulate_pio_string(ctxt->vcpu, NULL, + if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio_string(ctxt->vcpu, 0, (c->d & ByteOp) ? 1 : c->op_bytes, c->rep_prefix ? @@ -1818,6 +2120,8 @@ special_insn: break; case 0x84 ... 0x85: emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); + /* Disable writeback. */ + c->dst.type = OP_NONE; break; case 0x86 ... 0x87: /* xchg */ xchg: @@ -1863,32 +2167,26 @@ special_insn: break; case 0x8e: { /* mov seg, r/m16 */ uint16_t sel; - int type_bits; - int err; sel = c->src.val; - if (c->modrm_reg == VCPU_SREG_SS) - toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); - if (c->modrm_reg <= 5) { - type_bits = (c->modrm_reg == 1) ? 9 : 1; - err = kvm_load_segment_descriptor(ctxt->vcpu, sel, - type_bits, c->modrm_reg); - } else { - printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n", - c->modrm); - goto cannot_emulate; + if (c->modrm_reg == VCPU_SREG_CS || + c->modrm_reg > VCPU_SREG_GS) { + kvm_queue_exception(ctxt->vcpu, UD_VECTOR); + goto done; } - if (err < 0) - goto cannot_emulate; + if (c->modrm_reg == VCPU_SREG_SS) + toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS); + + rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg); c->dst.type = OP_NONE; /* Disable writeback. */ break; } case 0x8f: /* pop (sole member of Grp1a) */ rc = emulate_grp1a(ctxt, ops); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0x90: /* nop / xchg r8,rax */ @@ -1910,7 +2208,10 @@ special_insn: c->dst.type = OP_REG; c->dst.ptr = (unsigned long *) &ctxt->eflags; c->dst.bytes = c->op_bytes; - goto pop_instruction; + rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0xa0 ... 0xa1: /* mov */ c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; c->dst.val = c->src.val; @@ -2017,7 +2318,7 @@ special_insn: break; case 0xcb: /* ret far */ rc = emulate_ret_far(ctxt, ops); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0xd0 ... 0xd1: /* Grp2 */ @@ -2048,11 +2349,10 @@ special_insn: case 0xe9: /* jmp rel */ goto jmp; case 0xea: /* jmp far */ - if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, - VCPU_SREG_CS) < 0) { - DPRINTF("jmp far: Failed to load CS descriptor\n"); - goto cannot_emulate; - } + jump_far: + if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, + VCPU_SREG_CS)) + goto done; c->eip = c->src.val; break; @@ -2070,7 +2370,13 @@ special_insn: case 0xef: /* out (e/r)ax,dx */ port = c->regs[VCPU_REGS_RDX]; io_dir_in = 0; - do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, + do_io: + if (!emulator_io_permited(ctxt, ops, port, + (c->d & ByteOp) ? 1 : c->op_bytes)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } + if (kvm_emulate_pio(ctxt->vcpu, io_dir_in, (c->d & ByteOp) ? 1 : c->op_bytes, port) != 0) { c->eip = saved_eip; @@ -2087,7 +2393,7 @@ special_insn: break; case 0xf6 ... 0xf7: /* Grp3 */ rc = emulate_grp3(ctxt, ops); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; case 0xf8: /* clc */ @@ -2095,13 +2401,21 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xfa: /* cli */ - ctxt->eflags &= ~X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + ctxt->eflags &= ~X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfb: /* sti */ - toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); - ctxt->eflags |= X86_EFLAGS_IF; - c->dst.type = OP_NONE; /* Disable writeback. */ + if (emulator_bad_iopl(ctxt)) + kvm_inject_gp(ctxt->vcpu, 0); + else { + toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); + ctxt->eflags |= X86_EFLAGS_IF; + c->dst.type = OP_NONE; /* Disable writeback. */ + } break; case 0xfc: /* cld */ ctxt->eflags &= ~EFLG_DF; @@ -2111,16 +2425,21 @@ special_insn: ctxt->eflags |= EFLG_DF; c->dst.type = OP_NONE; /* Disable writeback. */ break; - case 0xfe ... 0xff: /* Grp4/Grp5 */ + case 0xfe: /* Grp4 */ + grp45: rc = emulate_grp45(ctxt, ops); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; break; + case 0xff: /* Grp5 */ + if (c->modrm_reg == 5) + goto jump_far; + goto grp45; } writeback: rc = writeback(ctxt, ops); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; /* Commit shadow register state. */ @@ -2146,7 +2465,7 @@ twobyte_insn: goto cannot_emulate; rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; /* Let the processor re-execute the fixed hypercall */ @@ -2157,7 +2476,7 @@ twobyte_insn: case 2: /* lgdt */ rc = read_descriptor(ctxt, ops, c->src.ptr, &size, &address, c->op_bytes); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; realmode_lgdt(ctxt->vcpu, size, address); /* Disable writeback. */ @@ -2168,7 +2487,7 @@ twobyte_insn: switch (c->modrm_rm) { case 1: rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; break; default: @@ -2178,7 +2497,7 @@ twobyte_insn: rc = read_descriptor(ctxt, ops, c->src.ptr, &size, &address, c->op_bytes); - if (rc) + if (rc != X86EMUL_CONTINUE) goto done; realmode_lidt(ctxt->vcpu, size, address); } @@ -2204,8 +2523,9 @@ twobyte_insn: } break; case 0x05: /* syscall */ - if (emulate_syscall(ctxt) == -1) - goto cannot_emulate; + rc = emulate_syscall(ctxt, ops); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; @@ -2229,33 +2549,34 @@ twobyte_insn: case 0x21: /* mov from dr to reg */ if (c->modrm_mod != 3) goto cannot_emulate; - rc = emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]); - if (rc) + if (emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm])) goto cannot_emulate; + rc = X86EMUL_CONTINUE; c->dst.type = OP_NONE; /* no writeback */ break; case 0x22: /* mov reg, cr */ if (c->modrm_mod != 3) goto cannot_emulate; - realmode_set_cr(ctxt->vcpu, - c->modrm_reg, c->modrm_val, &ctxt->eflags); + if (realmode_set_cr(ctxt->vcpu, + c->modrm_reg, c->modrm_val, &ctxt->eflags)) { + kvm_inject_gp(ctxt->vcpu, 0); + goto done; + } c->dst.type = OP_NONE; break; case 0x23: /* mov from reg to dr */ if (c->modrm_mod != 3) goto cannot_emulate; - rc = emulator_set_dr(ctxt, c->modrm_reg, - c->regs[c->modrm_rm]); - if (rc) + if (emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm])) goto cannot_emulate; + rc = X86EMUL_CONTINUE; c->dst.type = OP_NONE; /* no writeback */ break; case 0x30: /* wrmsr */ msr_data = (u32)c->regs[VCPU_REGS_RAX] | ((u64)c->regs[VCPU_REGS_RDX] << 32); - rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); - if (rc) { + if (emulator_set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) { kvm_inject_gp(ctxt->vcpu, 0); c->eip = kvm_rip_read(ctxt->vcpu); } @@ -2264,8 +2585,7 @@ twobyte_insn: break; case 0x32: /* rdmsr */ - rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); - if (rc) { + if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { kvm_inject_gp(ctxt->vcpu, 0); c->eip = kvm_rip_read(ctxt->vcpu); } else { @@ -2276,14 +2596,16 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x34: /* sysenter */ - if (emulate_sysenter(ctxt) == -1) - goto cannot_emulate; + rc = emulate_sysenter(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; case 0x35: /* sysexit */ - if (emulate_sysexit(ctxt) == -1) - goto cannot_emulate; + rc = emulate_sysexit(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; else goto writeback; break; @@ -2297,6 +2619,14 @@ twobyte_insn: jmp_rel(c, c->src.val); c->dst.type = OP_NONE; break; + case 0xa0: /* push fs */ + emulate_push_sreg(ctxt, VCPU_SREG_FS); + break; + case 0xa1: /* pop fs */ + rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0xa3: bt: /* bt */ c->dst.type = OP_NONE; @@ -2308,6 +2638,14 @@ twobyte_insn: case 0xa5: /* shld cl, r, r/m */ emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); break; + case 0xa8: /* push gs */ + emulate_push_sreg(ctxt, VCPU_SREG_GS); + break; + case 0xa9: /* pop gs */ + rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); + if (rc != X86EMUL_CONTINUE) + goto done; + break; case 0xab: bts: /* bts */ /* only subword offset */ @@ -2378,7 +2716,7 @@ twobyte_insn: break; case 0xc7: /* Grp9 (cmpxchg8b) */ rc = emulate_grp9(ctxt, ops, memop); - if (rc != 0) + if (rc != X86EMUL_CONTINUE) goto done; c->dst.type = OP_NONE; break; diff -uprN linux-2.6.32/arch/x86/kvm/i8254.c linux-2.6.32.ovz/arch/x86/kvm/i8254.c --- linux-2.6.32/arch/x86/kvm/i8254.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/i8254.c 2015-03-10 13:24:15.000000000 -0700 @@ -30,6 +30,7 @@ */ #include +#include #include "irq.h" #include "i8254.h" @@ -240,9 +241,20 @@ static void kvm_pit_ack_irq(struct kvm_i { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); + int value; + spin_lock(&ps->inject_lock); - if (atomic_dec_return(&ps->pit_timer.pending) < 0) + value = atomic_dec_return(&ps->pit_timer.pending); + if (value < 0) + /* spurious acks can be generated if, for example, the + * PIC is being reset. Handle it gracefully here + */ atomic_inc(&ps->pit_timer.pending); + else if (value > 0) + /* in this case, we had multiple outstanding pit interrupts + * that we needed to inject. Reinject + */ + queue_work(ps->pit->wq, &ps->pit->expired); ps->irq_ack = 1; spin_unlock(&ps->inject_lock); } @@ -255,15 +267,17 @@ void __kvm_migrate_pit_timer(struct kvm_ if (!kvm_vcpu_is_bsp(vcpu) || !pit) return; + mutex_lock(&pit->pit_state.lock); timer = &pit->pit_state.pit_timer.timer; if (hrtimer_cancel(timer)) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + mutex_unlock(&pit->pit_state.lock); } -static void destroy_pit_timer(struct kvm_timer *pt) +static void destroy_pit_timer(struct kvm_pit *pit) { - pr_debug("pit: execute del timer!\n"); - hrtimer_cancel(&pt->timer); + hrtimer_cancel(&pit->pit_state.pit_timer.timer); + cancel_work_sync(&pit->expired); } static bool kpit_is_periodic(struct kvm_timer *ktimer) @@ -277,24 +291,82 @@ static struct kvm_timer_ops kpit_ops = { .is_periodic = kpit_is_periodic, }; -static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) +static void pit_do_work(struct work_struct *work) +{ + struct kvm_pit *pit = container_of(work, struct kvm_pit, expired); + struct kvm *kvm = pit->kvm; + struct kvm_vcpu *vcpu; + int i; + struct kvm_kpit_state *ps = &pit->pit_state; + int inject = 0; + + /* Try to inject pending interrupts when + * last one has been acked. + */ + spin_lock(&ps->inject_lock); + if (ps->irq_ack) { + ps->irq_ack = 0; + inject = 1; + } + spin_unlock(&ps->inject_lock); + if (inject) { + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); + kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); + + /* + * Provides NMI watchdog support via Virtual Wire mode. + * The route is: PIT -> PIC -> LVT0 in NMI mode. + * + * Note: Our Virtual Wire implementation is simplified, only + * propagating PIT interrupts to all VCPUs when they have set + * LVT0 to NMI delivery. Other PIC interrupts are just sent to + * VCPU0, and only if its LVT0 is in EXTINT mode. + */ + if (kvm->arch.vapics_in_nmi_mode > 0) + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_apic_nmi_wd_deliver(vcpu); + } +} + +static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) { + struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer); + struct kvm_pit *pt = ktimer->kvm->arch.vpit; + + if (ktimer->reinject || !atomic_read(&ktimer->pending)) { + atomic_inc(&ktimer->pending); + queue_work(pt->wq, &pt->expired); + } + + if (ktimer->t_ops->is_periodic(ktimer)) { + hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); + return HRTIMER_RESTART; + } else + return HRTIMER_NORESTART; +} + +static void create_pit_timer(struct kvm *kvm, u32 val, int is_period) +{ + struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state; struct kvm_timer *pt = &ps->pit_timer; s64 interval; + if (!irqchip_in_kernel(kvm)) + return; + interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); pr_debug("pit: create pit timer, interval is %llu nsec\n", interval); /* TODO The new value only affected after the retriggered */ hrtimer_cancel(&pt->timer); + cancel_work_sync(&ps->pit->expired); pt->period = interval; ps->is_periodic = is_period; - pt->timer.function = kvm_timer_fn; + pt->timer.function = pit_timer_fn; pt->t_ops = &kpit_ops; pt->kvm = ps->pit->kvm; - pt->vcpu = pt->kvm->bsp_vcpu; atomic_set(&pt->pending, 0); ps->irq_ack = 1; @@ -333,17 +405,17 @@ static void pit_load_count(struct kvm *k /* FIXME: enhance mode 4 precision */ case 4: if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)) { - create_pit_timer(ps, val, 0); + create_pit_timer(kvm, val, 0); } break; case 2: case 3: if (!(ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)){ - create_pit_timer(ps, val, 1); + create_pit_timer(kvm, val, 1); } break; default: - destroy_pit_timer(&ps->pit_timer); + destroy_pit_timer(kvm->arch.vpit); } } @@ -465,6 +537,9 @@ static int pit_ioport_read(struct kvm_io return -EOPNOTSUPP; addr &= KVM_PIT_CHANNEL_MASK; + if (addr == 3) + return 0; + s = &pit_state->channels[addr]; mutex_lock(&pit_state->lock); @@ -600,7 +675,7 @@ static const struct kvm_io_device_ops sp .write = speaker_ioport_write, }; -/* Caller must have writers lock on slots_lock */ +/* Caller must hold slots_lock */ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) { struct kvm_pit *pit; @@ -621,6 +696,13 @@ struct kvm_pit *kvm_create_pit(struct kv mutex_lock(&pit->pit_state.lock); spin_lock_init(&pit->pit_state.inject_lock); + pit->wq = create_singlethread_workqueue("kvm-pit-wq"); + if (!pit->wq) { + kfree(pit); + return NULL; + } + INIT_WORK(&pit->expired, pit_do_work); + kvm->arch.vpit = pit; pit->kvm = kvm; @@ -640,13 +722,13 @@ struct kvm_pit *kvm_create_pit(struct kv kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_iodevice_init(&pit->dev, &pit_dev_ops); - ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev); if (ret < 0) goto fail; if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); - ret = __kvm_io_bus_register_dev(&kvm->pio_bus, + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); if (ret < 0) goto fail_unregister; @@ -655,7 +737,7 @@ struct kvm_pit *kvm_create_pit(struct kv return pit; fail_unregister: - __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev); fail: if (pit->irq_source_id >= 0) @@ -670,6 +752,9 @@ void kvm_free_pit(struct kvm *kvm) struct hrtimer *timer; if (kvm->arch.vpit) { + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &kvm->arch.vpit->speaker_dev); kvm_unregister_irq_mask_notifier(kvm, 0, &kvm->arch.vpit->mask_notifier); kvm_unregister_irq_ack_notifier(kvm, @@ -677,56 +762,10 @@ void kvm_free_pit(struct kvm *kvm) mutex_lock(&kvm->arch.vpit->pit_state.lock); timer = &kvm->arch.vpit->pit_state.pit_timer.timer; hrtimer_cancel(timer); + cancel_work_sync(&kvm->arch.vpit->expired); kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id); mutex_unlock(&kvm->arch.vpit->pit_state.lock); + destroy_workqueue(kvm->arch.vpit->wq); kfree(kvm->arch.vpit); } } - -static void __inject_pit_timer_intr(struct kvm *kvm) -{ - struct kvm_vcpu *vcpu; - int i; - - mutex_lock(&kvm->irq_lock); - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); - kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); - mutex_unlock(&kvm->irq_lock); - - /* - * Provides NMI watchdog support via Virtual Wire mode. - * The route is: PIT -> PIC -> LVT0 in NMI mode. - * - * Note: Our Virtual Wire implementation is simplified, only - * propagating PIT interrupts to all VCPUs when they have set - * LVT0 to NMI delivery. Other PIC interrupts are just sent to - * VCPU0, and only if its LVT0 is in EXTINT mode. - */ - if (kvm->arch.vapics_in_nmi_mode > 0) - kvm_for_each_vcpu(i, vcpu, kvm) - kvm_apic_nmi_wd_deliver(vcpu); -} - -void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) -{ - struct kvm_pit *pit = vcpu->kvm->arch.vpit; - struct kvm *kvm = vcpu->kvm; - struct kvm_kpit_state *ps; - - if (pit) { - int inject = 0; - ps = &pit->pit_state; - - /* Try to inject pending interrupts when - * last one has been acked. - */ - spin_lock(&ps->inject_lock); - if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { - ps->irq_ack = 0; - inject = 1; - } - spin_unlock(&ps->inject_lock); - if (inject) - __inject_pit_timer_intr(kvm); - } -} diff -uprN linux-2.6.32/arch/x86/kvm/i8254.h linux-2.6.32.ovz/arch/x86/kvm/i8254.h --- linux-2.6.32/arch/x86/kvm/i8254.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/i8254.h 2015-03-10 13:21:36.000000000 -0700 @@ -40,6 +40,8 @@ struct kvm_pit { struct kvm_kpit_state pit_state; int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; + struct workqueue_struct *wq; + struct work_struct expired; }; #define KVM_PIT_BASE_ADDRESS 0x40 diff -uprN linux-2.6.32/arch/x86/kvm/i8259.c linux-2.6.32.ovz/arch/x86/kvm/i8259.c --- linux-2.6.32/arch/x86/kvm/i8259.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/i8259.c 2015-03-10 13:23:44.000000000 -0700 @@ -32,22 +32,62 @@ #include #include "trace.h" +static void pic_lock(struct kvm_pic *s) + __acquires(&s->lock) +{ + spin_lock(&s->lock); +} + +static void pic_unlock(struct kvm_pic *s) + __releases(&s->lock) +{ + bool wakeup = s->wakeup_needed; + struct kvm_vcpu *vcpu, *found = NULL; + int i; + + s->wakeup_needed = false; + + spin_unlock(&s->lock); + + if (wakeup) { + kvm_for_each_vcpu(i, vcpu, s->kvm) { + if (kvm_apic_accept_pic_intr(vcpu)) { + found = vcpu; + break; + } + } + + if (!found) + found = s->kvm->bsp_vcpu; + + kvm_vcpu_kick(found); + } +} + static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); s->isr_ack |= (1 << irq); if (s != &s->pics_state->pics[0]) irq += 8; + /* + * We are dropping lock while calling ack notifiers since ack + * notifier callbacks for assigned devices call into PIC recursively. + * Other interrupt may be delivered to PIC while lock is dropped but + * it should be safe since PIC state is already updated at this stage. + */ + pic_unlock(s->pics_state); kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); + pic_lock(s->pics_state); } void kvm_pic_clear_isr_ack(struct kvm *kvm) { struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + pic_lock(s); s->pics[0].isr_ack = 0xff; s->pics[1].isr_ack = 0xff; - spin_unlock(&s->lock); + pic_unlock(s); } /* @@ -148,44 +188,57 @@ static void pic_update_irq(struct kvm_pi void kvm_pic_update_irq(struct kvm_pic *s) { - spin_lock(&s->lock); + pic_lock(s); pic_update_irq(s); - spin_unlock(&s->lock); + pic_unlock(s); } -int kvm_pic_set_irq(void *opaque, int irq, int level) +int kvm_pic_set_irq(struct kvm_pic *s, int irq, int irq_source_id, int level) { - struct kvm_pic *s = opaque; int ret = -1; - spin_lock(&s->lock); + pic_lock(s); if (irq >= 0 && irq < PIC_NUM_PINS) { - ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); + int irq_level = __kvm_irq_line_state(&s->irq_states[irq], + irq_source_id, level); + ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, irq_level); pic_update_irq(s); trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, s->pics[irq >> 3].imr, ret == 0); } - spin_unlock(&s->lock); + pic_unlock(s); return ret; } +void kvm_pic_clear_all(struct kvm_pic *s, int irq_source_id) +{ + int i; + + pic_lock(s); + for (i = 0; i < PIC_NUM_PINS; i++) + __clear_bit(irq_source_id, &s->irq_states[i]); + pic_unlock(s); +} + /* * acknowledge interrupt 'irq' */ static inline void pic_intack(struct kvm_kpic_state *s, int irq) { s->isr |= 1 << irq; - if (s->auto_eoi) { - if (s->rotate_on_auto_eoi) - s->priority_add = (irq + 1) & 7; - pic_clear_isr(s, irq); - } /* * We don't clear a level sensitive interrupt here */ if (!(s->elcr & (1 << irq))) s->irr &= ~(1 << irq); + + if (s->auto_eoi) { + if (s->rotate_on_auto_eoi) + s->priority_add = (irq + 1) & 7; + pic_clear_isr(s, irq); + } + } int kvm_pic_read_irq(struct kvm *kvm) @@ -193,7 +246,7 @@ int kvm_pic_read_irq(struct kvm *kvm) int irq, irq2, intno; struct kvm_pic *s = pic_irqchip(kvm); - spin_lock(&s->lock); + pic_lock(s); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { pic_intack(&s->pics[0], irq); @@ -218,29 +271,18 @@ int kvm_pic_read_irq(struct kvm *kvm) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); - spin_unlock(&s->lock); + pic_unlock(s); return intno; } void kvm_pic_reset(struct kvm_kpic_state *s) { - int irq, irqbase, n; + int irq; struct kvm *kvm = s->pics_state->irq_request_opaque; struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; + u8 irr = s->irr, isr = s->imr; - if (s == &s->pics_state->pics[0]) - irqbase = 0; - else - irqbase = 8; - - for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { - if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) - if (s->irr & (1 << irq) || s->isr & (1 << irq)) { - n = irq + irqbase; - kvm_notify_acked_irq(kvm, SELECT_PIC(n), n); - } - } s->last_irr = 0; s->irr = 0; s->imr = 0; @@ -256,6 +298,13 @@ void kvm_pic_reset(struct kvm_kpic_state s->rotate_on_auto_eoi = 0; s->special_fully_nested_mode = 0; s->init4 = 0; + + for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { + if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) + if (irr & (1 << irq) || isr & (1 << irq)) { + pic_clear_isr(s, irq); + } + } } static void pic_ioport_write(void *opaque, u32 addr, u32 val) @@ -298,9 +347,9 @@ static void pic_ioport_write(void *opaqu priority = get_priority(s, s->isr); if (priority != 8) { irq = (priority + s->priority_add) & 7; - pic_clear_isr(s, irq); if (cmd == 5) s->priority_add = (irq + 1) & 7; + pic_clear_isr(s, irq); pic_update_irq(s->pics_state); } break; @@ -325,10 +374,20 @@ static void pic_ioport_write(void *opaqu } } else switch (s->init_state) { - case 0: /* normal mode */ + case 0: { /* normal mode */ + u8 imr_diff = s->imr ^ val, + off = (s == &s->pics_state->pics[0]) ? 0 : 8; s->imr = val; + for (irq = 0; irq < PIC_NUM_PINS/2; irq++) + if (imr_diff & (1 << irq)) + kvm_fire_mask_notifiers( + s->pics_state->kvm, + SELECT_PIC(irq + off), + irq + off, + !!(s->imr & (1 << irq))); pic_update_irq(s->pics_state); break; + } case 1: s->irq_base = val & 0xf8; s->init_state = 2; @@ -436,7 +495,7 @@ static int picdev_write(struct kvm_io_de printk(KERN_ERR "PIC: non byte write\n"); return 0; } - spin_lock(&s->lock); + pic_lock(s); switch (addr) { case 0x20: case 0x21: @@ -449,7 +508,7 @@ static int picdev_write(struct kvm_io_de elcr_ioport_write(&s->pics[addr & 1], addr, data); break; } - spin_unlock(&s->lock); + pic_unlock(s); return 0; } @@ -466,7 +525,7 @@ static int picdev_read(struct kvm_io_dev printk(KERN_ERR "PIC: non byte read\n"); return 0; } - spin_lock(&s->lock); + pic_lock(s); switch (addr) { case 0x20: case 0x21: @@ -480,7 +539,7 @@ static int picdev_read(struct kvm_io_dev break; } *(unsigned char *)val = data; - spin_unlock(&s->lock); + pic_unlock(s); return 0; } @@ -497,7 +556,7 @@ static void pic_irq_request(void *opaque s->output = level; if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { s->pics[0].isr_ack &= ~(1 << irq); - kvm_vcpu_kick(vcpu); + s->wakeup_needed = true; } } @@ -522,12 +581,16 @@ struct kvm_pic *kvm_create_pic(struct kv s->irq_request_opaque = kvm; s->pics[0].pics_state = s; s->pics[1].pics_state = s; + s->pics[0].isr_ack = 0xff; + s->pics[1].isr_ack = 0xff; /* * Initialize PIO device */ kvm_iodevice_init(&s->dev, &picdev_ops); - ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); + mutex_lock(&kvm->slots_lock); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); + mutex_unlock(&kvm->slots_lock); if (ret < 0) { kfree(s); return NULL; diff -uprN linux-2.6.32/arch/x86/kvm/irq.c linux-2.6.32.ovz/arch/x86/kvm/irq.c --- linux-2.6.32/arch/x86/kvm/irq.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/irq.c 2015-03-10 13:21:36.000000000 -0700 @@ -89,7 +89,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt) void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu) { kvm_inject_apic_timer_irqs(vcpu); - kvm_inject_pit_timer_irqs(vcpu); /* TODO: PIT, RTC etc. */ } EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); diff -uprN linux-2.6.32/arch/x86/kvm/irq.h linux-2.6.32.ovz/arch/x86/kvm/irq.h --- linux-2.6.32/arch/x86/kvm/irq.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/irq.h 2015-03-10 13:21:47.000000000 -0700 @@ -45,7 +45,6 @@ struct kvm_kpic_state { u8 irr; /* interrupt request register */ u8 imr; /* interrupt mask register */ u8 isr; /* interrupt service register */ - u8 isr_ack; /* interrupt ack detection */ u8 priority_add; /* highest irq priority */ u8 irq_base; u8 read_reg_select; @@ -58,11 +57,13 @@ struct kvm_kpic_state { u8 init4; /* true if 4 byte init */ u8 elcr; /* PIIX edge/trigger selection */ u8 elcr_mask; + u8 isr_ack; /* interrupt ack detection */ struct kvm_pic *pics_state; }; struct kvm_pic { spinlock_t lock; + bool wakeup_needed; unsigned pending_acks; struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ @@ -71,6 +72,7 @@ struct kvm_pic { int output; /* intr from master PIC */ struct kvm_io_device dev; void (*ack_notifier)(void *opaque, int irq); + unsigned long irq_states[16]; }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); @@ -85,7 +87,11 @@ static inline struct kvm_pic *pic_irqchi static inline int irqchip_in_kernel(struct kvm *kvm) { - return pic_irqchip(kvm) != NULL; + int ret; + + ret = (pic_irqchip(kvm) != NULL); + smp_rmb(); + return ret; } void kvm_pic_reset(struct kvm_kpic_state *s); diff -uprN linux-2.6.32/arch/x86/kvm/Kconfig linux-2.6.32.ovz/arch/x86/kvm/Kconfig --- linux-2.6.32/arch/x86/kvm/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/Kconfig 2015-03-10 13:24:00.000000000 -0700 @@ -25,9 +25,13 @@ config KVM select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES + select HAVE_KVM_CPU_RELAX_INTERCEPT select HAVE_KVM_IRQCHIP select HAVE_KVM_EVENTFD select KVM_APIC_ARCHITECTURE + select USER_RETURN_NOTIFIER + select PERF_EVENTS + select TASK_DELAY_ACCT ---help--- Support hosting fully virtualized guest machines using hardware virtualization extensions. You will need a fairly recent @@ -64,6 +68,7 @@ config KVM_AMD # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. +source drivers/vhost/Kconfig source drivers/lguest/Kconfig source drivers/virtio/Kconfig diff -uprN linux-2.6.32/arch/x86/kvm/kvm_cache_regs.h linux-2.6.32.ovz/arch/x86/kvm/kvm_cache_regs.h --- linux-2.6.32/arch/x86/kvm/kvm_cache_regs.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/kvm_cache_regs.h 2015-03-10 13:23:24.000000000 -0700 @@ -38,4 +38,34 @@ static inline u64 kvm_pdptr_read(struct return vcpu->arch.pdptrs[index]; } +static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + if (mask & vcpu->arch.cr0_guest_owned_bits) + kvm_x86_ops->decache_cr0_guest_bits(vcpu); + return vcpu->arch.cr0 & mask; +} + +static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr0_bits(vcpu, ~0UL); +} + +static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu) +{ + return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u) + | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); +} + +static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) +{ + if (mask & vcpu->arch.cr4_guest_owned_bits) + kvm_x86_ops->decache_cr4_guest_bits(vcpu); + return vcpu->arch.cr4 & mask; +} + +static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) +{ + return kvm_read_cr4_bits(vcpu, ~0UL); +} + #endif diff -uprN linux-2.6.32/arch/x86/kvm/kvm_timer.h linux-2.6.32.ovz/arch/x86/kvm/kvm_timer.h --- linux-2.6.32/arch/x86/kvm/kvm_timer.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/kvm_timer.h 2015-03-10 13:23:26.000000000 -0700 @@ -2,6 +2,8 @@ struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ + u32 timer_mode_mask; + u64 tscdeadline; atomic_t pending; /* accumulated triggered timers */ bool reinject; struct kvm_timer_ops *t_ops; diff -uprN linux-2.6.32/arch/x86/kvm/lapic.c linux-2.6.32.ovz/arch/x86/kvm/lapic.c --- linux-2.6.32/arch/x86/kvm/lapic.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/lapic.c 2015-03-10 13:24:15.000000000 -0700 @@ -67,6 +67,9 @@ #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) +static unsigned int min_timer_period_us = 500; +module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); + static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) { return *((u32 *) (apic->regs + reg_off)); @@ -87,6 +90,11 @@ static inline int apic_test_and_clear_ve return test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } +static inline int apic_test_vector(int vec, void *bitmap) +{ + return test_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + static inline void apic_set_vector(int vec, void *bitmap) { set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); @@ -97,6 +105,16 @@ static inline void apic_clear_vector(int clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); } +static inline int __apic_test_and_set_vector(int vec, void *bitmap) +{ + return __test_and_set_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + +static inline int __apic_test_and_clear_vector(int vec, void *bitmap) +{ + return __test_and_clear_bit(VEC_POS(vec), (bitmap) + REG_POS(vec)); +} + static inline int apic_hw_enabled(struct kvm_lapic *apic) { return (apic)->vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE; @@ -134,9 +152,23 @@ static inline int apic_lvt_vector(struct return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; } +static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) +{ + return ((apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT); +} + static inline int apic_lvtt_period(struct kvm_lapic *apic) { - return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; + return ((apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC); +} + +static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) +{ + return ((apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask) == + APIC_LVT_TIMER_TSCDEADLINE); } static inline int apic_lvt_nmi_mode(u32 lvt_val) @@ -165,7 +197,7 @@ static inline int apic_x2apic_mode(struc } static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { - LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ + LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ LVT_MASK | APIC_MODE_MASK, /* LVTPC */ LINT_MASK, LINT_MASK, /* LVT0-1 */ @@ -186,6 +218,16 @@ static int find_highest_vector(void *bit return fls(word[word_offset << 2]) - 1 + (word_offset << 5); } +static u8 count_vectors(void *bitmap) +{ + u32 *word = bitmap; + int word_offset; + u8 count = 0; + for (word_offset = 0; word_offset < MAX_APIC_VECTOR >> 5; ++word_offset) + count += hweight32(word[word_offset << 2]); + return count; +} + static inline int apic_test_and_set_irr(int vec, struct kvm_lapic *apic) { apic->irr_pending = true; @@ -218,6 +260,27 @@ static inline void apic_clear_irr(int ve apic->irr_pending = true; } +static inline void apic_set_isr(int vec, struct kvm_lapic *apic) +{ + if (!__apic_test_and_set_vector(vec, apic->regs + APIC_ISR)) + ++apic->isr_count; + BUG_ON(apic->isr_count > MAX_APIC_VECTOR); + /* + * ISR (in service register) bit is set when injecting an interrupt. + * The highest vector is injected. Thus the latest bit set matches + * the highest bit in ISR. + */ + apic->highest_isr_cache = vec; +} + +static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) +{ + if (__apic_test_and_clear_vector(vec, apic->regs + APIC_ISR)) + --apic->isr_count; + BUG_ON(apic->isr_count < 0); + apic->highest_isr_cache = -1; +} + int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -240,15 +303,90 @@ static int __apic_accept_irq(struct kvm_ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq) { + int result; struct kvm_lapic *apic = vcpu->arch.apic; - return __apic_accept_irq(apic, irq->delivery_mode, irq->vector, + result = __apic_accept_irq(apic, irq->delivery_mode, irq->vector, irq->level, irq->trig_mode); + + if (!kvm_vcpu_is_bsp(apic->vcpu) && !irq->trig_mode) + result = 0; + + return result; +} + +static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) +{ + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); +} + +static int pv_eoi_put_user_atomic(struct kvm_vcpu *vcpu, u8 val) +{ + return kvm_write_guest_cached_atomic(vcpu->kvm, + &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); +} + +static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) +{ + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, + sizeof(*val)); +} + +static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pv_eoi.msr_val & KVM_MSR_ENABLED; +} + +static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) +{ + u8 val; + if (pv_eoi_get_user(vcpu, &val) < 0) + apic_debug("Can't read EOI MSR value: 0x%llx\n", + (unsigned long long)vcpi->arch.pv_eoi.msr_val); + return val & 0x1; +} + +static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) +{ + /* + * In RHEL6 only, this can get called in atomic context, so use _atomic + * to avoid schedule in atomic error. + * If the atomic access fails nothing was written into guest memory + * (it's a single byte write so no partial failures are possible). + * This means we did not set KVM_PV_EOI_ENABLED: guest will detect this + * by executing test-and-clear on the faulting memory + * and fallback to (slower) non-PV EOI notification handled in + * apic_set_eoi(). The detection in guest will cause this memory to be + * faulted in, and PV EOI will get re-enabled on the next interrupt. + */ + if (pv_eoi_put_user_atomic(vcpu, KVM_PV_EOI_ENABLED) < 0) + apic_debug("Can't set EOI MSR value: 0x%llx\n", + (unsigned long long)vcpi->arch.pv_eoi.msr_val); + else + __set_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); +} + +static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) +{ + if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { + apic_debug("Can't clear EOI MSR value: 0x%llx\n", + (unsigned long long)vcpi->arch.pv_eoi.msr_val); + return; + } + __clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention); } static inline int apic_find_highest_isr(struct kvm_lapic *apic) { int result; + if (!apic->isr_count) + return -1; + if (likely(apic->highest_isr_cache != -1)) + return apic->highest_isr_cache; result = find_highest_vector(apic->regs + APIC_ISR); ASSERT(result == -1 || result >= 16); @@ -374,6 +512,12 @@ static int __apic_accept_irq(struct kvm_ if (unlikely(!apic_enabled(apic))) break; + if (trig_mode) { + apic_debug("level trig mode for vector %d", vector); + apic_set_vector(vector, apic->regs + APIC_TMR); + } else + apic_clear_vector(vector, apic->regs + APIC_TMR); + result = !apic_test_and_set_irr(vector, apic); trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector, !result); @@ -384,11 +528,6 @@ static int __apic_accept_irq(struct kvm_ break; } - if (trig_mode) { - apic_debug("level trig mode for vector %d", vector); - apic_set_vector(vector, apic->regs + APIC_TMR); - } else - apic_clear_vector(vector, apic->regs + APIC_TMR); kvm_vcpu_kick(vcpu); break; @@ -453,29 +592,32 @@ int kvm_apic_compare_prio(struct kvm_vcp return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio; } -static void apic_set_eoi(struct kvm_lapic *apic) +static int apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); - int trigger_mode; + + trace_kvm_eoi(apic, vector); + /* * Not every write EOI will has corresponding ISR, * one example is when Kernel check timer on setup_IO_APIC */ if (vector == -1) - return; + return vector; - apic_clear_vector(vector, apic->regs + APIC_ISR); + apic_clear_isr(vector, apic); apic_update_ppr(apic); - if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) - trigger_mode = IOAPIC_LEVEL_TRIG; - else - trigger_mode = IOAPIC_EDGE_TRIG; - if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) { - mutex_lock(&apic->vcpu->kvm->irq_lock); + if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) && + kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) { + int trigger_mode; + if (apic_test_vector(vector, apic->regs + APIC_TMR)) + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); - mutex_unlock(&apic->vcpu->kvm->irq_lock); } + return vector; } static void apic_send_ipi(struct kvm_lapic *apic) @@ -504,9 +646,7 @@ static void apic_send_ipi(struct kvm_lap irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, irq.vector); - mutex_lock(&apic->vcpu->kvm->irq_lock); kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); - mutex_unlock(&apic->vcpu->kvm->irq_lock); } static u32 apic_get_tmcct(struct kvm_lapic *apic) @@ -518,7 +658,8 @@ static u32 apic_get_tmcct(struct kvm_lap ASSERT(apic != NULL); /* if initial count is 0, current count should also be 0 */ - if (apic_get_reg(apic, APIC_TMICT) == 0) + if (apic_get_reg(apic, APIC_TMICT) == 0 || + apic->lapic_timer.period == 0) return 0; remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); @@ -568,6 +709,9 @@ static u32 __apic_read(struct kvm_lapic break; case APIC_TMCCT: /* Timer CCR */ + if (apic_lvtt_tscdeadline(apic)) + return 0; + val = apic_get_tmcct(apic); break; @@ -662,29 +806,40 @@ static void update_divide_count(struct k static void start_apic_timer(struct kvm_lapic *apic) { - ktime_t now = apic->lapic_timer.timer.base->get_time(); - - apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) * - APIC_BUS_CYCLE_NS * apic->divide_count; + ktime_t now; atomic_set(&apic->lapic_timer.pending, 0); - if (!apic->lapic_timer.period) - return; - /* - * Do not allow the guest to program periodic timers with small - * interval, since the hrtimers are not throttled by the host - * scheduler. - */ - if (apic_lvtt_period(apic)) { - if (apic->lapic_timer.period < NSEC_PER_MSEC/2) - apic->lapic_timer.period = NSEC_PER_MSEC/2; - } + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { + /* lapic timer in oneshot or peroidic mode */ + now = apic->lapic_timer.timer.base->get_time(); + apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) + * APIC_BUS_CYCLE_NS * apic->divide_count; + + if (!apic->lapic_timer.period) + return; + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (apic_lvtt_period(apic)) { + s64 min_period = min_timer_period_us * 1000LL; - hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, apic->lapic_timer.period), - HRTIMER_MODE_ABS); + if (apic->lapic_timer.period < min_period) { + pr_info_ratelimited( + "kvm: vcpu %i: requested %lld ns " + "lapic timer period limited to %lld ns\n", + apic->vcpu->vcpu_id, + apic->lapic_timer.period, min_period); + apic->lapic_timer.period = min_period; + } + } - apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" + hrtimer_start(&apic->lapic_timer.timer, + ktime_add_ns(now, apic->lapic_timer.period), + HRTIMER_MODE_ABS); + + apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" PRIx64 ", " "timer initial count 0x%x, period %lldns, " "expire @ 0x%016" PRIx64 ".\n", __func__, @@ -693,6 +848,30 @@ static void start_apic_timer(struct kvm_ apic->lapic_timer.period, ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); + } else if (apic_lvtt_tscdeadline(apic)) { + /* lapic timer in tsc deadline mode */ + u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; + u64 ns = 0; + struct kvm_vcpu *vcpu = apic->vcpu; + unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu); + unsigned long flags; + + if (unlikely(!tscdeadline || !this_tsc_khz)) + return; + + local_irq_save(flags); + + now = apic->lapic_timer.timer.base->get_time(); + kvm_get_msr(vcpu, MSR_IA32_TSC, &guest_tsc); + if (likely(tscdeadline > guest_tsc)) { + ns = (tscdeadline - guest_tsc) * 1000000ULL; + do_div(ns, this_tsc_khz); + } + hrtimer_start(&apic->lapic_timer.timer, + ktime_add_ns(now, ns), HRTIMER_MODE_ABS); + + local_irq_restore(flags); + } } static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) @@ -780,7 +959,6 @@ static int apic_reg_write(struct kvm_lap case APIC_LVT0: apic_manage_nmi_watchdog(apic, val); - case APIC_LVTT: case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: @@ -794,7 +972,22 @@ static int apic_reg_write(struct kvm_lap break; + case APIC_LVTT: + if ((apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask) != + (val & apic->lapic_timer.timer_mode_mask)) + hrtimer_cancel(&apic->lapic_timer.timer); + + if (!apic_sw_enabled(apic)) + val |= APIC_LVT_MASKED; + val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); + apic_set_reg(apic, APIC_LVTT, val); + break; + case APIC_TMICT: + if (apic_lvtt_tscdeadline(apic)) + break; + hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); @@ -881,6 +1074,32 @@ void kvm_free_lapic(struct kvm_vcpu *vcp *---------------------------------------------------------------------- */ +u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + if (!apic) + return 0; + + if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) + return 0; + + return apic->lapic_timer.tscdeadline; +} + +void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + if (!apic) + return; + + if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) + return; + + hrtimer_cancel(&apic->lapic_timer.timer); + apic->lapic_timer.tscdeadline = data; + start_apic_timer(apic); +} + void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { struct kvm_lapic *apic = vcpu->arch.apic; @@ -968,13 +1187,17 @@ void kvm_lapic_reset(struct kvm_vcpu *vc apic_set_reg(apic, APIC_TMR + 0x10 * i, 0); } apic->irr_pending = false; + apic->isr_count = 0; + apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); if (kvm_vcpu_is_bsp(vcpu)) vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); vcpu->arch.apic_arb_prio = 0; + vcpu->arch.apic_attention = 0; apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, @@ -1015,7 +1238,7 @@ int apic_has_pending_timer(struct kvm_vc return 0; } -static int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) +int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type) { u32 reg = apic_get_reg(apic, lvt_type); int vector, mode, trig_mode; @@ -1110,13 +1333,11 @@ int kvm_apic_accept_pic_intr(struct kvm_ u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0); int r = 0; - if (kvm_vcpu_is_bsp(vcpu)) { - if (!apic_hw_enabled(vcpu->arch.apic)) - r = 1; - if ((lvt0 & APIC_LVT_MASKED) == 0 && - GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) - r = 1; - } + if (!apic_hw_enabled(vcpu->arch.apic)) + r = 1; + if ((lvt0 & APIC_LVT_MASKED) == 0 && + GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT) + r = 1; return r; } @@ -1138,7 +1359,7 @@ int kvm_get_apic_interrupt(struct kvm_vc if (vector == -1) return -1; - apic_set_vector(vector, apic->regs + APIC_ISR); + apic_set_isr(vector, apic); apic_update_ppr(apic); apic_clear_irr(vector, apic); return vector; @@ -1156,6 +1377,9 @@ void kvm_apic_post_state_restore(struct hrtimer_cancel(&apic->lapic_timer.timer); update_divide_count(apic); start_apic_timer(apic); + apic->irr_pending = true; + apic->isr_count = count_vectors(apic->regs + APIC_ISR); + apic->highest_isr_cache = -1; } void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) @@ -1171,32 +1395,104 @@ void __kvm_migrate_apic_timer(struct kvm hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } +/* + * apic_sync_pv_eoi_from_guest - called on vmexit or cancel interrupt + * + * Detect whether guest triggered PV EOI since the + * last entry. If yes, set EOI on guests's behalf. + * Clear PV EOI in guest memory in any case. + */ +static void apic_sync_pv_eoi_from_guest(struct kvm_vcpu *vcpu, + struct kvm_lapic *apic) +{ + bool pending; + int vector; + /* + * PV EOI state is derived from KVM_APIC_PV_EOI_PENDING in host + * and KVM_PV_EOI_ENABLED in guest memory as follows: + * + * KVM_APIC_PV_EOI_PENDING is unset: + * -> host disabled PV EOI. + * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is set: + * -> host enabled PV EOI, guest did not execute EOI yet. + * KVM_APIC_PV_EOI_PENDING is set, KVM_PV_EOI_ENABLED is unset: + * -> host enabled PV EOI, guest executed EOI. + */ + BUG_ON(!pv_eoi_enabled(vcpu)); + pending = pv_eoi_get_pending(vcpu); + /* + * Clear pending bit in any case: it will be set again on vmentry. + * While this might not be ideal from performance point of view, + * this makes sure pv eoi is only enabled when we know it's safe. + */ + pv_eoi_clr_pending(vcpu); + if (pending) + return; + vector = apic_set_eoi(apic); + trace_kvm_pv_eoi(apic, vector); +} + void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) { u32 data; - void *vapic; - if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) + if (test_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention)) + apic_sync_pv_eoi_from_guest(vcpu, vcpu->arch.apic); + + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); - data = *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)); - kunmap_atomic(vapic, KM_USER0); + kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32)); apic_set_tpr(vcpu->arch.apic, data & 0xff); } -void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) +/* + * apic_sync_pv_eoi_to_guest - called before vmentry + * + * Detect whether it's safe to enable PV EOI and + * if yes do so. + */ +static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu, + struct kvm_lapic *apic) +{ + if (!pv_eoi_enabled(vcpu) || + /* IRR set or many bits in ISR: could be nested. */ + apic->irr_pending || + /* Cache not set: could be safe but we don't bother. */ + apic->highest_isr_cache == -1 || + /* Need EOI to update ioapic. */ + kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) { + /* + * PV EOI was disabled by apic_sync_pv_eoi_from_guest + * so we need not do anything here. + */ + return; + } + + pv_eoi_set_pending(apic->vcpu); +} + +void kvm_lapic_prefault_vapic(struct kvm_vcpu *vcpu) +{ + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) + return; + + kvm_fault_in_guest_cached_writable(vcpu->kvm, &vcpu->arch.apic->vapic_cache); +} + +int kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) { u32 data, tpr; int max_irr, max_isr; - struct kvm_lapic *apic; - void *vapic; + struct kvm_lapic *apic = vcpu->arch.apic; - if (!irqchip_in_kernel(vcpu->kvm) || !vcpu->arch.apic->vapic_addr) - return; + apic_sync_pv_eoi_to_guest(vcpu, apic); + + if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) + return 0; - apic = vcpu->arch.apic; tpr = apic_get_reg(apic, APIC_TASKPRI) & 0xff; max_irr = apic_find_highest_irr(apic); if (max_irr < 0) @@ -1206,17 +1502,26 @@ void kvm_lapic_sync_to_vapic(struct kvm_ max_isr = 0; data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - vapic = kmap_atomic(vcpu->arch.apic->vapic_page, KM_USER0); - *(u32 *)(vapic + offset_in_page(vcpu->arch.apic->vapic_addr)) = data; - kunmap_atomic(vapic, KM_USER0); + return kvm_write_guest_cached_atomic(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32)); } -void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) +int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { - if (!irqchip_in_kernel(vcpu->kvm)) - return; + if (vapic_addr) { + if (offset_in_page(vapic_addr) > PAGE_SIZE - 4) + return -EINVAL; + + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.apic->vapic_cache, + vapic_addr)) + return -EINVAL; + __set_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); + } else + __clear_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention); vcpu->arch.apic->vapic_addr = vapic_addr; + return 0; } int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data) @@ -1250,3 +1555,16 @@ int kvm_x2apic_msr_read(struct kvm_vcpu return 0; } + +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) +{ + u64 addr = data & ~KVM_MSR_ENABLED; + if (!IS_ALIGNED(addr, 4)) + return 1; + + vcpu->arch.pv_eoi.msr_val = data; + if (!pv_eoi_enabled(vcpu)) + return 0; + return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, + addr); +} diff -uprN linux-2.6.32/arch/x86/kvm/lapic.h linux-2.6.32.ovz/arch/x86/kvm/lapic.h --- linux-2.6.32/arch/x86/kvm/lapic.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/lapic.h 2015-03-10 13:24:15.000000000 -0700 @@ -13,10 +13,19 @@ struct kvm_lapic { u32 divide_count; struct kvm_vcpu *vcpu; bool irr_pending; + /* Number of bits set in ISR. */ + s16 isr_count; + /* The highest vector set in ISR; if -1 - invalid, must scan ISR. */ + int highest_isr_cache; + /** + * APIC register page. The layout matches the register layout seen by + * the guest 1:1, because it is accessed by the vmx microcode. + * Note: Only one register, the TPR, is used by the microcode. + */ struct page *regs_page; void *regs; gpa_t vapic_addr; - struct page *vapic_page; + struct gfn_to_hva_cache vapic_cache; }; int kvm_create_lapic(struct kvm_vcpu *vcpu); void kvm_free_lapic(struct kvm_vcpu *vcpu); @@ -34,6 +43,7 @@ void kvm_apic_set_version(struct kvm_vcp int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq); +int kvm_apic_local_deliver(struct kvm_lapic *apic, int lvt_type); u64 kvm_get_apic_base(struct kvm_vcpu *vcpu); void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data); @@ -42,10 +52,15 @@ int kvm_lapic_enabled(struct kvm_vcpu *v bool kvm_apic_present(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); -void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); +u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); +void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); + +int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); -void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); +int kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); +void kvm_lapic_prefault_vapic(struct kvm_vcpu *vcpu); int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); +int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data); #endif diff -uprN linux-2.6.32/arch/x86/kvm/Makefile linux-2.6.32.ovz/arch/x86/kvm/Makefile --- linux-2.6.32/arch/x86/kvm/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/Makefile 2015-03-10 13:23:08.000000000 -0700 @@ -10,7 +10,7 @@ kvm-y += $(addprefix ../../../virt/kvm kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ - i8254.o timer.o + i8254.o timer.o pmu.o kvm-intel-y += vmx.o kvm-amd-y += svm.o diff -uprN linux-2.6.32/arch/x86/kvm/mmu.c linux-2.6.32.ovz/arch/x86/kvm/mmu.c --- linux-2.6.32/arch/x86/kvm/mmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/mmu.c 2015-03-10 13:24:08.000000000 -0700 @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include #include @@ -84,7 +86,7 @@ module_param(oos_shadow, bool, 0644); } #endif -#define PT_FIRST_AVAIL_BITS_SHIFT 9 +#define PT_FIRST_AVAIL_BITS_SHIFT 10 #define PT64_SECOND_AVAIL_BITS_SHIFT 52 #define VALID_PAGE(x) ((x) != INVALID_PAGE) @@ -136,16 +138,6 @@ module_param(oos_shadow, bool, 0644); #define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ | PT64_NX_MASK) -#define PFERR_PRESENT_MASK (1U << 0) -#define PFERR_WRITE_MASK (1U << 1) -#define PFERR_USER_MASK (1U << 2) -#define PFERR_RSVD_MASK (1U << 3) -#define PFERR_FETCH_MASK (1U << 4) - -#define PT_PDPE_LEVEL 3 -#define PT_DIRECTORY_LEVEL 2 -#define PT_PAGE_TABLE_LEVEL 1 - #define RMAP_EXT 4 #define ACC_EXEC_MASK 1 @@ -188,6 +180,7 @@ typedef int (*mmu_parent_walk_fn) (struc static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; static struct kmem_cache *mmu_page_header_cache; +static struct percpu_counter kvm_total_used_mmu_pages; static u64 __read_mostly shadow_trap_nonpresent_pte; static u64 __read_mostly shadow_notrap_nonpresent_pte; @@ -227,9 +220,9 @@ void kvm_mmu_set_mask_ptes(u64 user_mask } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); -static int is_write_protection(struct kvm_vcpu *vcpu) +static bool is_write_protection(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr0 & X86_CR0_WP; + return kvm_read_cr0_bits(vcpu, X86_CR0_WP); } static int is_cpuid_PSE36(void) @@ -470,24 +463,10 @@ static int has_wrprotected_page(struct k static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { - unsigned long page_size = PAGE_SIZE; - struct vm_area_struct *vma; - unsigned long addr; + unsigned long page_size; int i, ret = 0; - addr = gfn_to_hva(kvm, gfn); - if (kvm_is_error_hva(addr)) - return page_size; - - down_read(¤t->mm->mmap_sem); - vma = find_vma(current->mm, addr); - if (!vma) - goto out; - - page_size = vma_kernel_pagesize(vma); - -out: - up_read(¤t->mm->mmap_sem); + page_size = kvm_host_page_size(kvm, gfn); for (i = PT_PAGE_TABLE_LEVEL; i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { @@ -503,8 +482,7 @@ out: static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) { struct kvm_memory_slot *slot; - int host_level; - int level = PT_PAGE_TABLE_LEVEL; + int host_level, level, max_level; slot = gfn_to_memslot(vcpu->kvm, large_gfn); if (slot && slot->dirty_bitmap) @@ -515,11 +493,12 @@ static int mapping_level(struct kvm_vcpu if (host_level == PT_PAGE_TABLE_LEVEL) return host_level; - for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) { + max_level = kvm_x86_ops->get_lpage_level() < host_level ? + kvm_x86_ops->get_lpage_level() : host_level; + for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) break; - } return level - 1; } @@ -808,20 +787,15 @@ static int kvm_handle_hva(struct kvm *kv { int i, j; int retval = 0; + struct kvm_memslots *slots; - /* - * If mmap_sem isn't taken, we can look the memslots with only - * the mmu_lock by skipping over the slots with userspace_addr == 0. - */ - for (i = 0; i < kvm->nmemslots; i++) { - struct kvm_memory_slot *memslot = &kvm->memslots[i]; + slots = rcu_dereference(kvm->memslots); + + for (i = 0; i < slots->nmemslots; i++) { + struct kvm_memory_slot *memslot = &slots->memslots[i]; unsigned long start = memslot->userspace_addr; unsigned long end; - /* mmu_lock protects userspace_addr */ - if (!start) - continue; - end = start + (memslot->npages << PAGE_SHIFT); if (hva >= start && hva < end) { gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; @@ -830,8 +804,12 @@ static int kvm_handle_hva(struct kvm *kv data); for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { - int idx = gfn_offset; - idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); + unsigned long idx; + int nr; + + nr = KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL+j); + idx = (memslot->base_gfn+gfn_offset) / nr - + memslot->base_gfn / nr; retval |= handler(kvm, &memslot->lpage_info[j][idx].rmap_pde, data); @@ -858,19 +836,27 @@ static int kvm_age_rmapp(struct kvm *kvm u64 *spte; int young = 0; - /* always return old for EPT */ + /* + * In case of absence of EPT Access and Dirty Bits supports, + * emulate the accessed bit for EPT, by checking if this page has + * an EPT mapping, and clearing it if it does. On the next access, + * a new EPT mapping will be established. + * This has some overhead, but not as much as the cost of swapping + * out actively used pages or breaking up actively used hugepages. + */ if (!shadow_accessed_mask) - return 0; + return kvm_unmap_rmapp(kvm, rmapp, data); spte = rmap_next(kvm, rmapp, NULL); while (spte) { int _young; u64 _spte = *spte; - BUG_ON(!(_spte & PT_PRESENT_MASK)); - _young = _spte & PT_ACCESSED_MASK; + BUG_ON(!is_shadow_present_pte(_spte)); + _young = _spte & shadow_accessed_mask; if (_young) { young = 1; - clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); + clear_bit((ffs(shadow_accessed_mask) - 1), + (unsigned long *)spte); } spte = rmap_next(kvm, rmapp, spte); } @@ -914,6 +900,18 @@ static int is_empty_shadow_page(u64 *spt } #endif +/* + * This value is the sum of all of the kvm instances's + * kvm->arch.n_used_mmu_pages values. We need a global, + * aggregate version in order to make the slab shrinker + * faster + */ +static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) +{ + kvm->arch.n_used_mmu_pages += nr; + percpu_counter_add(&kvm_total_used_mmu_pages, nr); +} + static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) { ASSERT(is_empty_shadow_page(sp->spt)); @@ -921,7 +919,7 @@ static void kvm_mmu_free_page(struct kvm __free_page(virt_to_page(sp->spt)); __free_page(virt_to_page(sp->gfns)); kfree(sp); - ++kvm->arch.n_free_mmu_pages; + kvm_mod_used_mmu_pages(kvm, -1); } static unsigned kvm_page_table_hashfn(gfn_t gfn) @@ -943,7 +941,7 @@ static struct kvm_mmu_page *kvm_mmu_allo bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); sp->multimapped = 0; sp->parent_pte = parent_pte; - --vcpu->kvm->arch.n_free_mmu_pages; + kvm_mod_used_mmu_pages(vcpu->kvm, +1); return sp; } @@ -1504,8 +1502,8 @@ static int mmu_zap_unsync_children(struc for_each_sp(pages, sp, parents, i) { kvm_mmu_zap_page(kvm, sp); mmu_pages_clear_parents(&parents); + zapped++; } - zapped += pages.nr; kvm_mmu_pages_init(parent, &parents, &pages); } @@ -1542,35 +1540,27 @@ static int kvm_mmu_zap_page(struct kvm * * Changing the number of mmu pages allocated to the vm * Note: if kvm_nr_mmu_pages is too small, you will get dead lock */ -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) { - int used_pages; - - used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; - used_pages = max(0, used_pages); - /* * If we set the number of mmu pages to be smaller be than the * number of actived pages , we must to free some mmu pages before we * change the value */ - if (used_pages > kvm_nr_mmu_pages) { - while (used_pages > kvm_nr_mmu_pages) { + if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) { + while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages && + !list_empty(&kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *page; page = container_of(kvm->arch.active_mmu_pages.prev, struct kvm_mmu_page, link); kvm_mmu_zap_page(kvm, page); - used_pages--; } - kvm->arch.n_free_mmu_pages = 0; + goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages; } - else - kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages - - kvm->arch.n_alloc_mmu_pages; - kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; + kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages; } static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) @@ -1610,14 +1600,15 @@ static void mmu_unshadow(struct kvm *kvm && !sp->role.invalid) { pgprintk("%s: zap %lx %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); + if (kvm_mmu_zap_page(kvm, sp)) + nn = bucket->first; } } } static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) { - int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); + int slot = memslot_id(kvm, gfn); struct kvm_mmu_page *sp = page_header(__pa(pte)); __set_bit(slot, sp->slot_bitmap); @@ -1641,7 +1632,7 @@ struct page *gva_to_page(struct kvm_vcpu { struct page *page; - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); if (gpa == UNMAPPED_GVA) return NULL; @@ -1842,12 +1833,25 @@ static int set_spte(struct kvm_vcpu *vcp if (level > PT_PAGE_TABLE_LEVEL && has_wrprotected_page(vcpu->kvm, gfn, level)) { ret = 1; + rmap_remove(vcpu->kvm, sptep); spte = shadow_trap_nonpresent_pte; goto set_pte; } spte |= PT_WRITABLE_MASK; + if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) { + spte &= ~PT_USER_MASK; + + /* + * If we converted a user page to a kernel page, + * so that the kernel can write to it when cr0.wp=0, + * then we should prevent the kernel from executing it + * if SMEP is enabled. + */ + if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) + spte |= PT64_NX_MASK; + } /* * Optimization: for pte sync, if spte was writable the hash * lookup is unnecessary (and expensive). Write protection @@ -1903,6 +1907,8 @@ static void mmu_set_spte(struct kvm_vcpu child = page_header(pte & PT64_BASE_ADDR_MASK); mmu_page_remove_parent_pte(child, sptep); + __set_spte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); } else if (pfn != spte_to_pfn(*sptep)) { pgprintk("hfn old %lx new %lx\n", spte_to_pfn(*sptep), pfn); @@ -1966,6 +1972,16 @@ static int __direct_map(struct kvm_vcpu break; } + if (is_shadow_present_pte(*iterator.sptep) && + !is_large_pte(*iterator.sptep)) + continue; + + if (is_large_pte(*iterator.sptep)) { + rmap_remove(vcpu->kvm, iterator.sptep); + __set_spte(iterator.sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); + } + if (*iterator.sptep == shadow_trap_nonpresent_pte) { pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr, @@ -1986,6 +2002,31 @@ static int __direct_map(struct kvm_vcpu return pt_write; } +static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk) +{ + siginfo_t info; + + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_MCEERR_AR; + info.si_addr = (void __user *)address; + info.si_addr_lsb = PAGE_SHIFT; + + send_sig_info(SIGBUS, &info, tsk); +} + +static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) +{ + kvm_release_pfn_clean(pfn); + if (is_hwpoison_pfn(pfn)) { + kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current); + return 0; + } else if (is_fault_pfn(pfn)) + return -EFAULT; + + return 1; +} + static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) { int r; @@ -1993,6 +2034,8 @@ static int nonpaging_map(struct kvm_vcpu pfn_t pfn; unsigned long mmu_seq; + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); level = mapping_level(vcpu, gfn); /* @@ -2004,15 +2047,11 @@ static int nonpaging_map(struct kvm_vcpu gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); /* mmio */ - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return 1; - } + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) @@ -2092,21 +2131,24 @@ static int mmu_alloc_roots(struct kvm_vc hpa_t root = vcpu->arch.mmu.root_hpa; ASSERT(!VALID_PAGE(root)); - if (tdp_enabled) - direct = 1; if (mmu_check_root(vcpu, root_gfn)) return 1; + spin_lock(&vcpu->kvm->mmu_lock); + if (tdp_enabled) { + direct = 1; + root_gfn = 0; + } + kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu.root_hpa = root; return 0; } direct = !is_paging(vcpu); - if (tdp_enabled) - direct = 1; for (i = 0; i < 4; ++i) { hpa_t root = vcpu->arch.mmu.pae_root[i]; @@ -2122,11 +2164,19 @@ static int mmu_alloc_roots(struct kvm_vc root_gfn = 0; if (mmu_check_root(vcpu, root_gfn)) return 1; + if (tdp_enabled) { + direct = 1; + root_gfn = i << 30; + } + spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_free_some_pages(vcpu); sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL, direct, ACC_ALL, NULL); root = __pa(sp->spt); ++sp->root_count; + spin_unlock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; } vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); @@ -2164,8 +2214,11 @@ void kvm_mmu_sync_roots(struct kvm_vcpu spin_unlock(&vcpu->kvm->mmu_lock); } -static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, + u32 access, u32 *error) { + if (error) + *error = 0; return vaddr; } @@ -2205,17 +2258,15 @@ static int tdp_page_fault(struct kvm_vcp if (r) return r; + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); level = mapping_level(vcpu, gfn); gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - if (is_error_pfn(pfn)) { - kvm_release_pfn_clean(pfn); - return 1; - } + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; @@ -2433,6 +2484,7 @@ static int init_kvm_tdp_mmu(struct kvm_v static int init_kvm_softmmu(struct kvm_vcpu *vcpu) { int r; + bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); ASSERT(vcpu); ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); @@ -2447,6 +2499,9 @@ static int init_kvm_softmmu(struct kvm_v r = paging32_init_context(vcpu); vcpu->arch.mmu.base_role.glevels = vcpu->arch.mmu.root_level; + vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); + vcpu->arch.mmu.base_role.smep_andnot_wp + = smep && !is_write_protection(vcpu); return r; } @@ -2484,9 +2539,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) r = mmu_topup_memory_caches(vcpu); if (r) goto out; - spin_lock(&vcpu->kvm->mmu_lock); - kvm_mmu_free_some_pages(vcpu); r = mmu_alloc_roots(vcpu); + spin_lock(&vcpu->kvm->mmu_lock); mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); if (r) @@ -2749,7 +2803,7 @@ int kvm_mmu_unprotect_page_virt(struct k if (tdp_enabled) return 0; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); spin_lock(&vcpu->kvm->mmu_lock); r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); @@ -2760,7 +2814,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES && + while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES && !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { struct kvm_mmu_page *sp; @@ -2771,7 +2825,8 @@ void __kvm_mmu_free_some_pages(struct kv } } -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, + void *insn, int insn_len) { int r; enum emulation_result er; @@ -2789,7 +2844,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu * if (r) goto out; - er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); + er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len); switch (er) { case EMULATE_DONE: @@ -2800,6 +2855,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu * case EMULATE_FAIL: vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 0; return 0; default: BUG(); @@ -2885,7 +2941,8 @@ void kvm_mmu_destroy(struct kvm_vcpu *vc mmu_free_memory_caches(vcpu); } -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot, + bool destroy_large_pages) { struct kvm_mmu_page *sp; @@ -2896,6 +2953,22 @@ void kvm_mmu_slot_remove_write_access(st if (!test_bit(slot, sp->slot_bitmap)) continue; + if (sp->role.level != PT_PAGE_TABLE_LEVEL) { + if (!destroy_large_pages) + continue; + + pt = sp->spt; + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (is_shadow_present_pte(pt[i]) && + is_large_pte(pt[i])) { + --kvm->stat.lpages; + rmap_remove(kvm, &pt[i]); + __set_spte(&pt[i], + shadow_trap_nonpresent_pte); + } + } + continue; + } pt = sp->spt; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) /* avoid RMW */ @@ -2928,39 +3001,38 @@ static void kvm_mmu_remove_one_alloc_mmu kvm_mmu_zap_page(kvm, page); } -static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) +static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) { struct kvm *kvm; struct kvm *kvm_freed = NULL; - int cache_count = 0; + + if (nr_to_scan == 0) + goto out; spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { - int npages; + int idx; - if (!down_read_trylock(&kvm->slots_lock)) - continue; + idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); - npages = kvm->arch.n_alloc_mmu_pages - - kvm->arch.n_free_mmu_pages; - cache_count += npages; - if (!kvm_freed && nr_to_scan > 0 && npages > 0) { + if (!kvm_freed && nr_to_scan > 0 && + kvm->arch.n_used_mmu_pages > 0) { kvm_mmu_remove_one_alloc_mmu_page(kvm); - cache_count--; kvm_freed = kvm; } nr_to_scan--; spin_unlock(&kvm->mmu_lock); - up_read(&kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, idx); } if (kvm_freed) list_move_tail(&kvm_freed->vm_list, &vm_list); spin_unlock(&kvm_lock); - return cache_count; +out: + return percpu_counter_read_positive(&kvm_total_used_mmu_pages); } static struct shrinker mmu_shrinker = { @@ -2981,6 +3053,7 @@ static void mmu_destroy_caches(void) void kvm_mmu_module_exit(void) { mmu_destroy_caches(); + percpu_counter_destroy(&kvm_total_used_mmu_pages); unregister_shrinker(&mmu_shrinker); } @@ -3003,6 +3076,9 @@ int kvm_mmu_module_init(void) if (!mmu_page_header_cache) goto nomem; + if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) + goto nomem; + register_shrinker(&mmu_shrinker); return 0; @@ -3020,9 +3096,11 @@ unsigned int kvm_mmu_calculate_mmu_pages int i; unsigned int nr_mmu_pages; unsigned int nr_pages = 0; + struct kvm_memslots *slots; - for (i = 0; i < kvm->nmemslots; i++) - nr_pages += kvm->memslots[i].npages; + slots = rcu_dereference(kvm->memslots); + for (i = 0; i < slots->nmemslots; i++) + nr_pages += slots->memslots[i].npages; nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, @@ -3074,7 +3152,7 @@ static int kvm_pv_mmu_write(struct kvm_v static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) { - kvm_set_cr3(vcpu, vcpu->arch.cr3); + (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); return 1; } @@ -3247,7 +3325,7 @@ static void audit_mappings_page(struct k if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) audit_mappings_page(vcpu, ent, va, level - 1); else { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); + gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL); gfn_t gfn = gpa >> PAGE_SHIFT; pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; @@ -3292,10 +3370,12 @@ static void audit_mappings(struct kvm_vc static int count_rmaps(struct kvm_vcpu *vcpu) { int nmaps = 0; - int i, j, k; + int i, j, k, idx; + idx = srcu_read_lock(&kvm->srcu); + slots = rcu_dereference(kvm->memslots); for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { - struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; + struct kvm_memory_slot *m = &slots->memslots[i]; struct kvm_rmap_desc *d; for (j = 0; j < m->npages; ++j) { @@ -3318,6 +3398,7 @@ static int count_rmaps(struct kvm_vcpu * } } } + srcu_read_unlock(&kvm->srcu, idx); return nmaps; } diff -uprN linux-2.6.32/arch/x86/kvm/mmu.h linux-2.6.32.ovz/arch/x86/kvm/mmu.h --- linux-2.6.32/arch/x86/kvm/mmu.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/mmu.h 2015-03-10 13:24:05.000000000 -0700 @@ -2,6 +2,7 @@ #define __KVM_X86_MMU_H #include +#include "kvm_cache_regs.h" #define PT64_PT_BITS 9 #define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) @@ -37,11 +38,27 @@ #define PT32_ROOT_LEVEL 2 #define PT32E_ROOT_LEVEL 3 +#define PFERR_PRESENT_MASK (1U << 0) +#define PFERR_WRITE_MASK (1U << 1) +#define PFERR_USER_MASK (1U << 2) +#define PFERR_RSVD_MASK (1U << 3) +#define PFERR_FETCH_MASK (1U << 4) + +#define PT_PDPE_LEVEL 3 +#define PT_DIRECTORY_LEVEL 2 +#define PT_PAGE_TABLE_LEVEL 1 + int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); +static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) +{ + return kvm->arch.n_max_mmu_pages - + kvm->arch.n_used_mmu_pages; +} + static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { - if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) + if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES)) __kvm_mmu_free_some_pages(vcpu); } @@ -64,17 +81,17 @@ static inline int is_long_mode(struct kv static inline int is_pae(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr4 & X86_CR4_PAE; + return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); } static inline int is_pse(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr4 & X86_CR4_PSE; + return kvm_read_cr4_bits(vcpu, X86_CR4_PSE); } static inline int is_paging(struct kvm_vcpu *vcpu) { - return vcpu->arch.cr0 & X86_CR0_PG; + return kvm_read_cr0_bits(vcpu, X86_CR0_PG); } static inline int is_present_gpte(unsigned long pte) diff -uprN linux-2.6.32/arch/x86/kvm/paging_tmpl.h linux-2.6.32.ovz/arch/x86/kvm/paging_tmpl.h --- linux-2.6.32/arch/x86/kvm/paging_tmpl.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/paging_tmpl.h 2015-03-10 13:24:08.000000000 -0700 @@ -150,7 +150,9 @@ walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); + if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) + goto not_present; + trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) @@ -195,6 +197,12 @@ walk: is_long_mode(vcpu))) { int lvl = walker->level; + /* check if the kernel is fetching from user page */ + if ((pte_access & PT_USER_MASK) && + kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) + if (fetch_fault && !user_fault) + goto access_error; + walker->gfn = gpte_to_gfn_lvl(pte, lvl); walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; @@ -242,7 +250,8 @@ err: walker->error_code |= PFERR_WRITE_MASK; if (user_fault) walker->error_code |= PFERR_USER_MASK; - if (fetch_fault) + if (fetch_fault && (is_nx(vcpu) || + kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))) walker->error_code |= PFERR_FETCH_MASK; if (rsvd_fault) walker->error_code |= PFERR_RSVD_MASK; @@ -334,6 +343,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu /* advance table_gfn when emulating 1gb pages with 4k */ if (delta == 0) table_gfn += PT_INDEX(addr, level); + access &= gw->pte_access; } else { direct = 0; table_gfn = gw->table_gfn[level - 2]; @@ -412,21 +422,18 @@ static int FNAME(page_fault)(struct kvm_ return 0; } + mmu_seq = vcpu->kvm->mmu_notifier_seq; + smp_rmb(); if (walker.level >= PT_DIRECTORY_LEVEL) { level = min(walker.level, mapping_level(vcpu, walker.gfn)); walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); } - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); /* mmio */ - if (is_error_pfn(pfn)) { - pgprintk("gfn %lx is mmio\n", walker.gfn); - kvm_release_pfn_clean(pfn); - return 1; - } + if (is_error_pfn(pfn)) + return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn); spin_lock(&vcpu->kvm->mmu_lock); if (mmu_notifier_retry(vcpu, mmu_seq)) @@ -455,8 +462,6 @@ out_unlock: static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) { struct kvm_shadow_walk_iterator iterator; - pt_element_t gpte; - gpa_t pte_gpa = -1; int level; u64 *sptep; int need_flush = 0; @@ -471,10 +476,6 @@ static void FNAME(invlpg)(struct kvm_vcp if (level == PT_PAGE_TABLE_LEVEL || ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { - struct kvm_mmu_page *sp = page_header(__pa(sptep)); - - pte_gpa = (sp->gfn << PAGE_SHIFT); - pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t); if (is_shadow_present_pte(*sptep)) { rmap_remove(vcpu->kvm, sptep); @@ -493,32 +494,25 @@ static void FNAME(invlpg)(struct kvm_vcp if (need_flush) kvm_flush_remote_tlbs(vcpu->kvm); spin_unlock(&vcpu->kvm->mmu_lock); - - if (pte_gpa == -1) - return; - if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, - sizeof(pt_element_t))) - return; - if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) { - if (mmu_topup_memory_caches(vcpu)) - return; - kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte, - sizeof(pt_element_t), 0); - } } -static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) +static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, + u32 *error) { struct guest_walker walker; gpa_t gpa = UNMAPPED_GVA; int r; - r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); + r = FNAME(walk_addr)(&walker, vcpu, vaddr, + !!(access & PFERR_WRITE_MASK), + !!(access & PFERR_USER_MASK), + !!(access & PFERR_FETCH_MASK)); if (r) { gpa = gfn_to_gpa(walker.gfn); gpa |= vaddr & ~PAGE_MASK; - } + } else if (error) + *error = walker.error_code; return gpa; } diff -uprN linux-2.6.32/arch/x86/kvm/pmu.c linux-2.6.32.ovz/arch/x86/kvm/pmu.c --- linux-2.6.32/arch/x86/kvm/pmu.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/pmu.c 2015-03-10 13:24:08.000000000 -0700 @@ -0,0 +1,544 @@ +/* + * Kernel-based Virtual Machine -- Performane Monitoring Unit support + * + * Copyright 2011 Red Hat, Inc. and/or its affiliates. + * + * Authors: + * Avi Kivity + * Gleb Natapov + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#include +#include +#include +#include "x86.h" +#include "lapic.h" + +static struct kvm_arch_event_perf_mapping { + u8 eventsel; + u8 unit_mask; + unsigned event_type; + bool inexact; +} arch_events[] = { + /* Index must match CPUID 0x0A.EBX bit vector */ + [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, + [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, + [2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES }, + [3] = { 0x2e, 0x4f, PERF_COUNT_HW_CACHE_REFERENCES }, + [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, + [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, +}; + +/* mapping between fixed pmc index and arch_events array */ +int fixed_pmc_events[] = {1, 0, 7}; + +static bool pmc_is_gp(struct kvm_pmc *pmc) +{ + return pmc->type == KVM_PMC_GP; +} + +static inline u64 pmc_bitmask(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; + + return pmu->counter_bitmask[pmc->type]; +} + +static inline bool pmc_enabled(struct kvm_pmc *pmc) +{ + struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; + return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl); +} + +static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr, + u32 base) +{ + if (msr >= base && msr < base + pmu->nr_arch_gp_counters) + return &pmu->gp_counters[msr - base]; + return NULL; +} + +static inline struct kvm_pmc *get_fixed_pmc(struct kvm_pmu *pmu, u32 msr) +{ + int base = MSR_CORE_PERF_FIXED_CTR0; + if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) + return &pmu->fixed_counters[msr - base]; + return NULL; +} + +static inline struct kvm_pmc *get_fixed_pmc_idx(struct kvm_pmu *pmu, int idx) +{ + return get_fixed_pmc(pmu, MSR_CORE_PERF_FIXED_CTR0 + idx); +} + +static struct kvm_pmc *global_idx_to_pmc(struct kvm_pmu *pmu, int idx) +{ + if (idx < INTEL_PMC_IDX_FIXED) + return get_gp_pmc(pmu, MSR_P6_EVNTSEL0 + idx, MSR_P6_EVNTSEL0); + else + return get_fixed_pmc_idx(pmu, idx - INTEL_PMC_IDX_FIXED); +} + +void kvm_deliver_pmi(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.apic) + kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC); +} + +static void trigger_pmi(struct irq_work *irq_work) +{ + struct kvm_pmu *pmu = container_of(irq_work, struct kvm_pmu, + irq_work); + struct kvm_vcpu *vcpu = container_of(pmu, struct kvm_vcpu, + arch.pmu); + + kvm_deliver_pmi(vcpu); +} + +static void kvm_perf_overflow(struct perf_event *perf_event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct kvm_pmc *pmc = perf_event->overflow_handler_context; + struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; + __set_bit(pmc->idx, (unsigned long *)&pmu->global_status); +} + +static void kvm_perf_overflow_intr(struct perf_event *perf_event, + struct perf_sample_data *data, struct pt_regs *regs) +{ + struct kvm_pmc *pmc = perf_event->overflow_handler_context; + struct kvm_pmu *pmu = &pmc->vcpu->arch.pmu; + if (!test_and_set_bit(pmc->idx, (unsigned long *)&pmu->reprogram_pmi)) { + kvm_perf_overflow(perf_event, data, regs); + set_bit(KVM_REQ_PMU, &pmc->vcpu->requests); + /* + * Inject PMI. If vcpu was in a guest mode during NMI PMI + * can be ejected on a guest mode re-entry. Otherwise we can't + * be sure that vcpu wasn't executing hlt instruction at the + * time of vmexit and is not going to re-enter guest mode until, + * woken up. So we should wake it, but this is impossible from + * NMI context. Do it from irq work instead. + */ + if (!kvm_is_in_guest()) + irq_work_queue(&pmc->vcpu->arch.pmu.irq_work); + else + set_bit(KVM_REQ_PMI, &pmc->vcpu->requests); + } +} + +static u64 read_pmc(struct kvm_pmc *pmc) +{ + u64 counter, enabled, running; + + counter = pmc->counter; + + if (pmc->perf_event) + counter += perf_event_read_value(pmc->perf_event, + &enabled, &running); + + /* FIXME: Scaling needed? */ + + return counter & pmc_bitmask(pmc); +} + +static void stop_counter(struct kvm_pmc *pmc) +{ + if (pmc->perf_event) { + pmc->counter = read_pmc(pmc); + perf_event_release_kernel(pmc->perf_event); + pmc->perf_event = NULL; + } +} + +static void reprogram_counter(struct kvm_pmc *pmc, u32 type, + unsigned config, bool exclude_user, bool exclude_kernel, + bool intr) +{ + struct perf_event *event; + struct perf_event_attr attr = { + .type = type, + .size = sizeof(attr), + .pinned = true, + .exclude_idle = true, + .exclude_host = 1, + .exclude_user = exclude_user, + .exclude_kernel = exclude_kernel, + .config = config, + }; + + attr.sample_period = (-pmc->counter) & pmc_bitmask(pmc); + + event = perf_event_create_kernel_counter(&attr, -1, current, + intr ? kvm_perf_overflow_intr : + kvm_perf_overflow, pmc); + if (IS_ERR(event)) { + printk_once("kvm: pmu event creation failed %ld\n", + PTR_ERR(event)); + return; + } + + pmc->perf_event = event; + clear_bit(pmc->idx, (unsigned long*)&pmc->vcpu->arch.pmu.reprogram_pmi); +} + +static unsigned find_arch_event(struct kvm_pmu *pmu, u8 event_select, + u8 unit_mask) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(arch_events); i++) + if (arch_events[i].eventsel == event_select + && arch_events[i].unit_mask == unit_mask + && (pmu->available_event_types & (1 << i))) + break; + + if (i == ARRAY_SIZE(arch_events)) + return PERF_COUNT_HW_MAX; + + return arch_events[i].event_type; +} + +static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) +{ + unsigned config, type = PERF_TYPE_RAW; + u8 event_select, unit_mask; + + if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) + printk_once("kvm pmu: pin control bit is ignored\n"); + + pmc->eventsel = eventsel; + + stop_counter(pmc); + + if (!(eventsel & ARCH_PERFMON_EVENTSEL_ENABLE) || !pmc_enabled(pmc)) + return; + + event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; + unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; + + if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | + ARCH_PERFMON_EVENTSEL_INV | + ARCH_PERFMON_EVENTSEL_CMASK))) { + config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, + unit_mask); + if (config != PERF_COUNT_HW_MAX) + type = PERF_TYPE_HARDWARE; + } + + if (type == PERF_TYPE_RAW) + config = eventsel & X86_RAW_EVENT_MASK; + + reprogram_counter(pmc, type, config, + !(eventsel & ARCH_PERFMON_EVENTSEL_USR), + !(eventsel & ARCH_PERFMON_EVENTSEL_OS), + eventsel & ARCH_PERFMON_EVENTSEL_INT); +} + +static void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 en_pmi, int idx) +{ + unsigned en = en_pmi & 0x3; + bool pmi = en_pmi & 0x8; + + stop_counter(pmc); + + if (!en || !pmc_enabled(pmc)) + return; + + reprogram_counter(pmc, PERF_TYPE_HARDWARE, + arch_events[fixed_pmc_events[idx]].event_type, + !(en & 0x2), /* exclude user */ + !(en & 0x1), /* exclude kernel */ + pmi); +} + +static inline u8 fixed_en_pmi(u64 ctrl, int idx) +{ + return (ctrl >> (idx * 4)) & 0xf; +} + +static void reprogram_fixed_counters(struct kvm_pmu *pmu, u64 data) +{ + int i; + + for (i = 0; i < pmu->nr_arch_fixed_counters; i++) { + u8 en_pmi = fixed_en_pmi(data, i); + struct kvm_pmc *pmc = get_fixed_pmc_idx(pmu, i); + + if (fixed_en_pmi(pmu->fixed_ctr_ctrl, i) == en_pmi) + continue; + + reprogram_fixed_counter(pmc, en_pmi, i); + } + + pmu->fixed_ctr_ctrl = data; +} + +static void reprogram_idx(struct kvm_pmu *pmu, int idx) +{ + struct kvm_pmc *pmc = global_idx_to_pmc(pmu, idx); + + if (!pmc) + return; + + if (pmc_is_gp(pmc)) + reprogram_gp_counter(pmc, pmc->eventsel); + else { + int fidx = idx - INTEL_PMC_IDX_FIXED; + reprogram_fixed_counter(pmc, + fixed_en_pmi(pmu->fixed_ctr_ctrl, fidx), fidx); + } +} + +static void global_ctrl_changed(struct kvm_pmu *pmu, u64 data) +{ + int bit; + u64 diff = pmu->global_ctrl ^ data; + + pmu->global_ctrl = data; + + for_each_bit(bit, (unsigned long *)&diff, X86_PMC_IDX_MAX) + reprogram_idx(pmu, bit); +} + +bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + int ret; + + switch (msr) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + case MSR_CORE_PERF_GLOBAL_STATUS: + case MSR_CORE_PERF_GLOBAL_CTRL: + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + ret = pmu->version > 1; + break; + default: + ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) + || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) + || get_fixed_pmc(pmu, msr); + break; + } + return ret; +} + +int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc; + + switch (index) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + *data = pmu->fixed_ctr_ctrl; + return 0; + case MSR_CORE_PERF_GLOBAL_STATUS: + *data = pmu->global_status; + return 0; + case MSR_CORE_PERF_GLOBAL_CTRL: + *data = pmu->global_ctrl; + return 0; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + *data = pmu->global_ovf_ctrl; + return 0; + default: + if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || + (pmc = get_fixed_pmc(pmu, index))) { + *data = read_pmc(pmc); + return 0; + } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { + *data = pmc->eventsel; + return 0; + } + } + return 1; +} + +int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_pmc *pmc; + u32 index = msr_info->index; + u64 data = msr_info->data; + + switch (index) { + case MSR_CORE_PERF_FIXED_CTR_CTRL: + if (pmu->fixed_ctr_ctrl == data) + return 0; + if (!(data & 0xfffffffffffff444)) { + reprogram_fixed_counters(pmu, data); + return 0; + } + break; + case MSR_CORE_PERF_GLOBAL_STATUS: + if (msr_info->host_initiated) { + pmu->global_status = data; + return 0; + } + break; /* RO MSR */ + case MSR_CORE_PERF_GLOBAL_CTRL: + if (pmu->global_ctrl == data) + return 0; + if (!(data & pmu->global_ctrl_mask)) { + global_ctrl_changed(pmu, data); + return 0; + } + break; + case MSR_CORE_PERF_GLOBAL_OVF_CTRL: + if (!(data & (pmu->global_ctrl_mask & ~(3ull<<62)))) { + if (!msr_info->host_initiated) + pmu->global_status &= ~data; + pmu->global_ovf_ctrl = data; + return 0; + } + break; + default: + if ((pmc = get_gp_pmc(pmu, index, MSR_IA32_PERFCTR0)) || + (pmc = get_fixed_pmc(pmu, index))) { + if (!msr_info->host_initiated) + data = (s64)(s32)data; + pmc->counter += data - read_pmc(pmc); + return 0; + } else if ((pmc = get_gp_pmc(pmu, index, MSR_P6_EVNTSEL0))) { + if (data == pmc->eventsel) + return 0; + if (!(data & 0xffffffff00200000ull)) { + reprogram_gp_counter(pmc, data); + return 0; + } + } + } + return 1; +} + +int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + bool fast_mode = pmc & (1u << 31); + bool fixed = pmc & (1u << 30); + struct kvm_pmc *counters; + u64 ctr; + + pmc &= ~(3u << 30); + if (!fixed && pmc >= pmu->nr_arch_gp_counters) + return 1; + if (fixed && pmc >= pmu->nr_arch_fixed_counters) + return 1; + counters = fixed ? pmu->fixed_counters : pmu->gp_counters; + ctr = read_pmc(&counters[pmc]); + if (fast_mode) + ctr = (u32)ctr; + *data = ctr; + + return 0; +} + +void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_cpuid_entry2 *entry; + unsigned bitmap_len; + + pmu->nr_arch_gp_counters = 0; + pmu->nr_arch_fixed_counters = 0; + pmu->counter_bitmask[KVM_PMC_GP] = 0; + pmu->counter_bitmask[KVM_PMC_FIXED] = 0; + pmu->version = 0; + + entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + if (!entry) + return; + + pmu->version = entry->eax & 0xff; + if (!pmu->version) + return; + + pmu->nr_arch_gp_counters = min((int)(entry->eax >> 8) & 0xff, + INTEL_PMC_MAX_GENERIC); + pmu->counter_bitmask[KVM_PMC_GP] = + ((u64)1 << ((entry->eax >> 16) & 0xff)) - 1; + bitmap_len = (entry->eax >> 24) & 0xff; + pmu->available_event_types = ~entry->ebx & ((1ull << bitmap_len) - 1); + + if (pmu->version == 1) { + pmu->global_ctrl = (1 << pmu->nr_arch_gp_counters) - 1; + return; + } + + pmu->nr_arch_fixed_counters = min((int)(entry->edx & 0x1f), + INTEL_PMC_MAX_FIXED); + pmu->counter_bitmask[KVM_PMC_FIXED] = + ((u64)1 << ((entry->edx >> 5) & 0xff)) - 1; + pmu->global_ctrl_mask = ~(((1 << pmu->nr_arch_gp_counters) - 1) + | (((1ull << pmu->nr_arch_fixed_counters) - 1) + << INTEL_PMC_IDX_FIXED)); +} + +void kvm_pmu_init(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_pmu *pmu = &vcpu->arch.pmu; + + memset(pmu, 0, sizeof(*pmu)); + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { + pmu->gp_counters[i].type = KVM_PMC_GP; + pmu->gp_counters[i].vcpu = vcpu; + pmu->gp_counters[i].idx = i; + } + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { + pmu->fixed_counters[i].type = KVM_PMC_FIXED; + pmu->fixed_counters[i].vcpu = vcpu; + pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED; + } + init_irq_work(&pmu->irq_work, trigger_pmi); + kvm_pmu_cpuid_update(vcpu); +} + +void kvm_pmu_reset(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + int i; + + irq_work_sync(&pmu->irq_work); + for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { + struct kvm_pmc *pmc = &pmu->gp_counters[i]; + stop_counter(pmc); + pmc->counter = pmc->eventsel = 0; + } + + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) + stop_counter(&pmu->fixed_counters[i]); + + pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = + pmu->global_ovf_ctrl = 0; +} + +void kvm_pmu_destroy(struct kvm_vcpu *vcpu) +{ + kvm_pmu_reset(vcpu); +} + +void kvm_handle_pmu_event(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + u64 bitmask; + int bit; + + bitmask = pmu->reprogram_pmi; + + for_each_bit(bit, (unsigned long *)&bitmask, X86_PMC_IDX_MAX) { + struct kvm_pmc *pmc = global_idx_to_pmc(pmu, bit); + + if (unlikely(!pmc || !pmc->perf_event)) { + clear_bit(bit, (unsigned long *)&pmu->reprogram_pmi); + continue; + } + + reprogram_idx(pmu, bit); + } +} diff -uprN linux-2.6.32/arch/x86/kvm/svm.c linux-2.6.32.ovz/arch/x86/kvm/svm.c --- linux-2.6.32/arch/x86/kvm/svm.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/svm.c 2015-03-10 13:24:14.000000000 -0700 @@ -27,6 +27,8 @@ #include #include +#include +#include #include #include @@ -43,9 +45,14 @@ MODULE_LICENSE("GPL"); #define SEG_TYPE_LDT 2 #define SEG_TYPE_BUSY_TSS16 3 -#define SVM_FEATURE_NPT (1 << 0) -#define SVM_FEATURE_LBRV (1 << 1) -#define SVM_FEATURE_SVML (1 << 2) +#define SVM_FEATURE_NPT (1 << 0) +#define SVM_FEATURE_LBRV (1 << 1) +#define SVM_FEATURE_SVML (1 << 2) +#define SVM_FEATURE_NRIP (1 << 3) +#define SVM_FEATURE_TSC_RATE (1 << 4) +#define SVM_FEATURE_FLUSH_ASID (1 << 6) +#define SVM_FEATURE_DECODE_ASSIST (1 << 7) +#define SVM_FEATURE_PAUSE_FILTER (1 << 10) #define NESTED_EXIT_HOST 0 /* Exit handled on host level */ #define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ @@ -62,6 +69,12 @@ MODULE_LICENSE("GPL"); #define nsvm_printk(fmt, args...) do {} while(0) #endif +#define TSC_RATIO_RSVD 0xffffff0000000000ULL +#define TSC_RATIO_MIN 0x0000000000000001ULL +#define TSC_RATIO_MAX 0x000000ffffffffffULL + +static bool erratum_383_found __read_mostly; + static const u32 host_save_user_msrs[] = { #ifdef CONFIG_X86_64 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, @@ -85,6 +98,9 @@ struct nested_state { /* gpa pointers to the real vectors */ u64 vmcb_msrpm; + /* A VMEXIT is required but not yet emulated */ + bool exit_required; + /* cache for intercepts of the guest */ u16 intercept_cr_read; u16 intercept_cr_write; @@ -111,9 +127,16 @@ struct vcpu_svm { u32 *msrpm; + ulong nmi_iret_rip; + struct nested_state nested; + + u64 tsc_ratio; }; +static DEFINE_PER_CPU(u64, current_tsc_ratio); +#define TSC_RATIO_DEFAULT 0x100000000ULL + /* enable NPT for AMD64 and X86 with PAE */ #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) static bool npt_enabled = true; @@ -134,6 +157,42 @@ static int nested_svm_exit_handled(struc static int nested_svm_vmexit(struct vcpu_svm *svm); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); +static u64 __scale_tsc(u64 ratio, u64 tsc); + +enum { + VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, + pause filter count */ + VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */ + VMCB_ASID, /* ASID */ + VMCB_INTR, /* int_ctl, int_vector */ + VMCB_NPT, /* npt_en, nCR3, gPAT */ + VMCB_CR, /* CR0, CR3, CR4, EFER */ + VMCB_DR, /* DR6, DR7 */ + VMCB_DT, /* GDT, IDT */ + VMCB_SEG, /* CS, DS, SS, ES, CPL */ + VMCB_CR2, /* CR2 only */ + VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */ + VMCB_DIRTY_MAX, +}; + +/* TPR and CR2 are always written before VMRUN */ +#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) + +static inline void mark_all_dirty(struct vmcb *vmcb) +{ + vmcb->control.clean = 0; +} + +static inline void mark_all_clean(struct vmcb *vmcb) +{ + vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1) + & ~VMCB_ALWAYS_DIRTY_MASK; +} + +static inline void mark_dirty(struct vmcb *vmcb, int bit) +{ + vmcb->control.clean &= ~(1 << bit); +} static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { @@ -218,16 +277,6 @@ static inline void invlpga(unsigned long asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); } -static inline void force_new_asid(struct kvm_vcpu *vcpu) -{ - to_svm(vcpu)->asid_generation--; -} - -static inline void flush_guest_tlb(struct kvm_vcpu *vcpu) -{ - force_new_asid(vcpu); -} - static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) { if (!npt_enabled && !(efer & EFER_LMA)) @@ -235,6 +284,7 @@ static void svm_set_efer(struct kvm_vcpu to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; vcpu->arch.shadow_efer = efer; + mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); } static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, @@ -285,8 +335,11 @@ static void skip_emulated_instruction(st { struct vcpu_svm *svm = to_svm(vcpu); + if (svm->vmcb->control.next_rip != 0) + svm->next_rip = svm->vmcb->control.next_rip; + if (!svm->next_rip) { - if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) != + if (emulate_instruction(vcpu, EMULTYPE_SKIP) != EMULATE_DONE) printk(KERN_DEBUG "%s: NOP\n", __func__); return; @@ -299,6 +352,31 @@ static void skip_emulated_instruction(st svm_set_interrupt_shadow(vcpu, 0); } +static void svm_init_erratum_383(void) +{ + u32 low, high; + int err; + u64 val; + + /* Only Fam10h is affected */ + if (boot_cpu_data.x86 != 0x10) + return; + + /* Use _safe variants to not break nested virtualization */ + val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err); + if (err) + return; + + val |= (1ULL << 47); + + low = lower_32_bits(val); + high = upper_32_bits(val); + + native_write_msr_safe(MSR_AMD64_DC_CFG, low, high); + + erratum_383_found = true; +} + static int has_svm(void) { const char *msg; @@ -313,10 +391,16 @@ static int has_svm(void) static void svm_hardware_disable(void *garbage) { + /* Make sure we clean up behind us */ + if (svm_has(SVM_FEATURE_TSC_RATE)) + wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); + cpu_svm_disable(); + + amd_pmu_disable_virt(); } -static void svm_hardware_enable(void *garbage) +static int svm_hardware_enable(void *garbage) { struct svm_cpu_data *svm_data; @@ -325,16 +409,20 @@ static void svm_hardware_enable(void *ga struct desc_struct *gdt; int me = raw_smp_processor_id(); + rdmsrl(MSR_EFER, efer); + if (efer & EFER_SVME) + return -EBUSY; + if (!has_svm()) { printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); - return; + return -EINVAL; } svm_data = per_cpu(svm_data, me); if (!svm_data) { printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", me); - return; + return -EINVAL; } svm_data->asid_generation = 1; @@ -345,11 +433,21 @@ static void svm_hardware_enable(void *ga gdt = (struct desc_struct *)gdt_descr.base; svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); - rdmsrl(MSR_EFER, efer); wrmsrl(MSR_EFER, efer | EFER_SVME); wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(svm_data->save_area) << PAGE_SHIFT); + + if (svm_has(SVM_FEATURE_TSC_RATE)) { + wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); + __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT; + } + + svm_init_erratum_383(); + + amd_pmu_enable_virt(); + + return 0; } static void svm_cpu_uninit(int cpu) @@ -476,7 +574,7 @@ static __init int svm_hardware_setup(voi kvm_enable_efer_bits(EFER_SVME); } - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { r = svm_cpu_init(cpu); if (r) goto err; @@ -484,6 +582,23 @@ static __init int svm_hardware_setup(voi svm_features = cpuid_edx(SVM_CPUID_FUNC); + if (svm_has(SVM_FEATURE_TSC_RATE)) { + u64 max; + + kvm_has_tsc_control = true; + + /* + * Make sure the user can only configure tsc_khz values that + * fit into a signed integer. + * A min value is not calculated needed because it will always + * be 1 on all machines and a value of 0 is used to disable + * tsc-scaling for the vcpu. + */ + max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX)); + + kvm_max_guest_tsc_khz = max; + } + if (!svm_has(SVM_FEATURE_NPT)) npt_enabled = false; @@ -510,7 +625,7 @@ static __exit void svm_hardware_unsetup( { int cpu; - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); @@ -534,11 +649,124 @@ static void init_sys_seg(struct vmcb_seg seg->base = 0; } +static u64 __scale_tsc(u64 ratio, u64 tsc) +{ + u64 mult, frac, _tsc; + + mult = ratio >> 32; + frac = ratio & ((1ULL << 32) - 1); + + _tsc = tsc; + _tsc *= mult; + _tsc += (tsc >> 32) * frac; + _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32; + + return _tsc; +} + +static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 _tsc = tsc; + + if (svm->tsc_ratio != TSC_RATIO_DEFAULT) + _tsc = __scale_tsc(svm->tsc_ratio, tsc); + + return _tsc; +} + +static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 ratio; + u64 khz; + + /* TSC scaling supported? */ + if (!svm_has(SVM_FEATURE_TSC_RATE)) + return; + + /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */ + if (user_tsc_khz == 0) { + vcpu->arch.virtual_tsc_khz = 0; + svm->tsc_ratio = TSC_RATIO_DEFAULT; + return; + } + + khz = user_tsc_khz; + + /* TSC scaling required - calculate ratio */ + ratio = khz << 32; + do_div(ratio, tsc_khz); + + if (ratio == 0 || ratio & TSC_RATIO_RSVD) { + WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n", + user_tsc_khz); + return; + } + vcpu->arch.virtual_tsc_khz = user_tsc_khz; + svm->tsc_ratio = ratio; +} + +static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 g_tsc_offset = 0; + + if (is_nested(svm)) { + g_tsc_offset = svm->vmcb->control.tsc_offset - + svm->nested.hsave->control.tsc_offset; + svm->nested.hsave->control.tsc_offset = offset; + } else + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + svm->vmcb->control.tsc_offset, + offset); + + svm->vmcb->control.tsc_offset = offset + g_tsc_offset; + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); +} + +static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) +{ + struct vcpu_svm *svm = to_svm(vcpu); + bool negative = false; + + if (adjustment < 0) { + adjustment = -adjustment; + negative = true; + } + if (host) + adjustment = svm_scale_tsc(vcpu, adjustment); + + if (negative) { + adjustment = -adjustment; + } + + svm->vmcb->control.tsc_offset += adjustment; + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + if (is_nested(svm)) + svm->nested.hsave->control.tsc_offset += adjustment; + else + trace_kvm_write_tsc_offset(vcpu->vcpu_id, + svm->vmcb->control.tsc_offset - adjustment, + svm->vmcb->control.tsc_offset); +} + +static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +{ + u64 tsc; + + tsc = svm_scale_tsc(vcpu, native_read_tsc()); + + return target_tsc - tsc; +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; + svm->vcpu.fpu_active = 1; + control->intercept_cr_read = INTERCEPT_CR0_MASK | INTERCEPT_CR3_MASK | INTERCEPT_CR4_MASK; @@ -568,6 +796,7 @@ static void init_vmcb(struct vcpu_svm *s control->intercept = (1ULL << INTERCEPT_INTR) | (1ULL << INTERCEPT_NMI) | (1ULL << INTERCEPT_SMI) | + (1ULL << INTERCEPT_RDPMC) | (1ULL << INTERCEPT_CPUID) | (1ULL << INTERCEPT_INVD) | (1ULL << INTERCEPT_HLT) | @@ -586,11 +815,11 @@ static void init_vmcb(struct vcpu_svm *s (1ULL << INTERCEPT_SKINIT) | (1ULL << INTERCEPT_WBINVD) | (1ULL << INTERCEPT_MONITOR) | - (1ULL << INTERCEPT_MWAIT); + (1ULL << INTERCEPT_MWAIT) | + (1ULL << INTERCEPT_XSETBV); control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __pa(svm->msrpm); - control->tsc_offset = 0; control->int_ctl = V_INTR_MASKING_MASK; init_seg(&save->es); @@ -618,18 +847,19 @@ static void init_vmcb(struct vcpu_svm *s init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - save->efer = EFER_SVME; + svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; save->dr7 = 0x400; save->rflags = 2; save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; - /* - * cr0 val on cpu init should be 0x60000010, we enable cpu - * cache by default. the orderly way is to enable cache in bios. + /* This is the guest-visible cr0 value. + * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. */ - save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; + svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; + (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); + save->cr4 = X86_CR4_PAE; /* rdx = ?? */ @@ -644,16 +874,21 @@ static void init_vmcb(struct vcpu_svm *s control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK| INTERCEPT_CR3_MASK); save->g_pat = 0x0007040600070406ULL; - /* enable caching because the QEMU Bios doesn't enable it */ - save->cr0 = X86_CR0_ET; save->cr3 = 0; save->cr4 = 0; } - force_new_asid(&svm->vcpu); + svm->asid_generation = 0; svm->nested.vmcb = 0; svm->vcpu.arch.hflags = 0; + if (svm_has(SVM_FEATURE_PAUSE_FILTER)) { + control->pause_filter_count = 3000; + control->intercept |= (1ULL << INTERCEPT_PAUSE); + } + + mark_all_dirty(svm->vmcb); + enable_gif(svm); } @@ -667,6 +902,7 @@ static int svm_vcpu_reset(struct kvm_vcp kvm_rip_write(vcpu, 0); svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; + mark_dirty(svm->vmcb, VMCB_SEG); } vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; @@ -689,33 +925,34 @@ static struct kvm_vcpu *svm_create_vcpu( goto out; } + svm->tsc_ratio = TSC_RATIO_DEFAULT; + err = kvm_vcpu_init(&svm->vcpu, kvm, id); if (err) goto free_svm; + err = -ENOMEM; page = alloc_page(GFP_KERNEL); - if (!page) { - err = -ENOMEM; + if (!page) goto uninit; - } - err = -ENOMEM; msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!msrpm_pages) - goto uninit; + goto free_page1; nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); if (!nested_msrpm_pages) - goto uninit; - - svm->msrpm = page_address(msrpm_pages); - svm_vcpu_init_msrpm(svm->msrpm); + goto free_page2; hsave_page = alloc_page(GFP_KERNEL); if (!hsave_page) - goto uninit; + goto free_page3; + svm->nested.hsave = page_address(hsave_page); + svm->msrpm = page_address(msrpm_pages); + svm_vcpu_init_msrpm(svm->msrpm); + svm->nested.msrpm = page_address(nested_msrpm_pages); svm->vmcb = page_address(page); @@ -725,13 +962,18 @@ static struct kvm_vcpu *svm_create_vcpu( init_vmcb(svm); fx_init(&svm->vcpu); - svm->vcpu.fpu_active = 1; svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; return &svm->vcpu; +free_page3: + __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); +free_page2: + __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER); +free_page1: + __free_page(page); uninit: kvm_vcpu_uninit(&svm->vcpu); free_svm: @@ -758,24 +1000,18 @@ static void svm_vcpu_load(struct kvm_vcp int i; if (unlikely(cpu != vcpu->cpu)) { - u64 tsc_this, delta; - - /* - * Make sure that the guest sees a monotonically - * increasing TSC. - */ - rdtscll(tsc_this); - delta = vcpu->arch.host_tsc - tsc_this; - svm->vmcb->control.tsc_offset += delta; - if (is_nested(svm)) - svm->nested.hsave->control.tsc_offset += delta; - vcpu->cpu = cpu; - kvm_migrate_timers(vcpu); svm->asid_generation = 0; + mark_all_dirty(svm->vmcb); } for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); + + if (svm_has(SVM_FEATURE_TSC_RATE) && + svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) { + __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio; + wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio); + } } static void svm_vcpu_put(struct kvm_vcpu *vcpu) @@ -786,8 +1022,6 @@ static void svm_vcpu_put(struct kvm_vcpu ++vcpu->stat.host_state_reload; for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - - rdtscll(vcpu->arch.host_tsc); } static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) @@ -815,11 +1049,13 @@ static void svm_cache_reg(struct kvm_vcp static void svm_set_vintr(struct vcpu_svm *svm) { svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } static void svm_clear_vintr(struct vcpu_svm *svm) { svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) @@ -932,6 +1168,7 @@ static void svm_set_idt(struct kvm_vcpu svm->vmcb->save.idtr.limit = dt->limit; svm->vmcb->save.idtr.base = dt->base ; + mark_dirty(svm->vmcb, VMCB_DT); } static void svm_get_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) @@ -948,6 +1185,11 @@ static void svm_set_gdt(struct kvm_vcpu svm->vmcb->save.gdtr.limit = dt->limit; svm->vmcb->save.gdtr.base = dt->base ; + mark_dirty(svm->vmcb, VMCB_DT); +} + +static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ } static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) @@ -974,17 +1216,11 @@ static void svm_set_cr0(struct kvm_vcpu if (npt_enabled) goto set; - if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { - svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - vcpu->fpu_active = 1; - } - vcpu->arch.cr0 = cr0; cr0 |= X86_CR0_PG | X86_CR0_WP; - if (!vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); + + if (!vcpu->fpu_active) cr0 |= X86_CR0_TS; - } set: /* * re-enable caching here because the QEMU bios @@ -993,6 +1229,7 @@ set: */ cr0 &= ~(X86_CR0_CD | X86_CR0_NW); svm->vmcb->save.cr0 = cr0; + mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); } static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1001,13 +1238,14 @@ static void svm_set_cr4(struct kvm_vcpu unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - force_new_asid(vcpu); + svm_flush_tlb(vcpu); vcpu->arch.cr4 = cr4; if (!npt_enabled) cr4 |= X86_CR4_PAE; cr4 |= host_cr4_mce; to_svm(vcpu)->vmcb->save.cr4 = cr4; + mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); } static void svm_set_segment(struct kvm_vcpu *vcpu, @@ -1036,6 +1274,7 @@ static void svm_set_segment(struct kvm_v = (svm->vmcb->save.cs.attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; + mark_dirty(svm->vmcb, VMCB_SEG); } static void update_db_intercept(struct kvm_vcpu *vcpu) @@ -1058,6 +1297,8 @@ static void update_db_intercept(struct k 1 << BP_VECTOR; } else vcpu->guest_debug = 0; + + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) @@ -1073,6 +1314,7 @@ static int svm_guest_debug(struct kvm_vc svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; else svm->vmcb->save.dr7 = vcpu->arch.dr7; + mark_dirty(svm->vmcb, VMCB_DR); if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF; @@ -1106,6 +1348,8 @@ static void new_asid(struct vcpu_svm *sv svm->asid_generation = svm_data->asid_generation; svm->vmcb->control.asid = svm_data->next_asid++; + + mark_dirty(svm->vmcb, VMCB_ASID); } static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) @@ -1170,6 +1414,7 @@ static void svm_set_dr(struct kvm_vcpu * svm->vmcb->save.dr7 = vcpu->arch.dr7; vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); } + mark_dirty(svm->vmcb, VMCB_DR); return; default: /* FIXME: Possible case? */ @@ -1180,7 +1425,7 @@ static void svm_set_dr(struct kvm_vcpu * } } -static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int pf_interception(struct vcpu_svm *svm) { u64 fault_address; u32 error_code; @@ -1191,11 +1436,15 @@ static int pf_interception(struct vcpu_s trace_kvm_page_fault(fault_address, error_code); if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); - return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); + return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, + svm->vmcb->control.insn_bytes, + svm->vmcb->control.insn_len); } -static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int db_interception(struct vcpu_svm *svm) { + struct kvm_run *kvm_run = svm->vcpu.run; + if (!(svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && !svm->vcpu.arch.singlestep) { @@ -1223,36 +1472,93 @@ static int db_interception(struct vcpu_s return 1; } -static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int bp_interception(struct vcpu_svm *svm) { + struct kvm_run *kvm_run = svm->vcpu.run; + kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; kvm_run->debug.arch.exception = BP_VECTOR; return 0; } -static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int ud_interception(struct vcpu_svm *svm) { int er; - er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); + er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD); if (er != EMULATE_DONE) kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } -static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int nm_interception(struct vcpu_svm *svm) { svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); - if (!(svm->vcpu.arch.cr0 & X86_CR0_TS)) + if (!kvm_read_cr0_bits(&svm->vcpu, X86_CR0_TS)) svm->vmcb->save.cr0 &= ~X86_CR0_TS; + else + svm->vmcb->save.cr0 |= X86_CR0_TS; svm->vcpu.fpu_active = 1; + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + mark_dirty(svm->vmcb, VMCB_CR); return 1; } -static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static bool is_erratum_383(void) +{ + int err, i; + u64 value; + + if (!erratum_383_found) + return false; + + value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err); + if (err) + return false; + + /* Bit 62 may or may not be set for this mce */ + value &= ~(1ULL << 62); + + if (value != 0xb600000000010015) + return false; + + /* Clear MCi_STATUS registers */ + for (i = 0; i < 6; ++i) + native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0); + + value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err); + if (!err) { + u32 low, high; + + value &= ~(1ULL << 2); + low = lower_32_bits(value); + high = upper_32_bits(value); + + native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); + } + + /* Flush tlb to evict multi-match entries */ + __flush_tlb_all(); + + return true; +} + +static void svm_handle_mce(struct vcpu_svm *svm) { + if (is_erratum_383()) { + /* + * Erratum 383 triggered. Guest state is corrupt so kill the + * guest. + */ + pr_err("KVM: Guest triggered AMD Erratum 383\n"); + + set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests); + + return; + } + /* * On an #MC intercept the MCE handler is not called automatically in * the host. So do it by hand here. @@ -1261,11 +1567,18 @@ static int mc_interception(struct vcpu_s "int $0x12\n"); /* not sure if we ever come back to this point */ + return; +} + +static int mc_interception(struct vcpu_svm *svm) +{ return 1; } -static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int shutdown_interception(struct vcpu_svm *svm) { + struct kvm_run *kvm_run = svm->vcpu.run; + /* * VMCB is undefined after a SHUTDOWN intercept * so reinitialize it. @@ -1277,7 +1590,7 @@ static int shutdown_interception(struct return 0; } -static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int io_interception(struct vcpu_svm *svm) { u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ int size, in, string; @@ -1290,8 +1603,7 @@ static int io_interception(struct vcpu_s string = (io_info & SVM_IOIO_STR_MASK) != 0; if (string) { - if (emulate_instruction(&svm->vcpu, - kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) + if (emulate_instruction(&svm->vcpu, 0) == EMULATE_DO_MMIO) return 0; return 1; } @@ -1301,33 +1613,33 @@ static int io_interception(struct vcpu_s size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; skip_emulated_instruction(&svm->vcpu); - return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); + return kvm_emulate_pio(&svm->vcpu, in, size, port); } -static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int nmi_interception(struct vcpu_svm *svm) { return 1; } -static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int intr_interception(struct vcpu_svm *svm) { ++svm->vcpu.stat.irq_exits; return 1; } -static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int nop_on_interception(struct vcpu_svm *svm) { return 1; } -static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int halt_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; skip_emulated_instruction(&svm->vcpu); return kvm_emulate_halt(&svm->vcpu); } -static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int vmmcall_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); @@ -1378,7 +1690,14 @@ static inline int nested_svm_intr(struct svm->vmcb->control.exit_code = SVM_EXIT_INTR; - if (nested_svm_exit_handled(svm)) { + if (svm->nested.intercept & 1ULL) { + /* + * The #vmexit can't be emulated here directly because this + * code path runs with irqs and preemtion disabled. A + * #vmexit emulation might sleep. Only signal request for + * the #vmexit here. + */ + svm->nested.exit_required = true; nsvm_printk("VMexit -> INTR\n"); return 1; } @@ -1390,10 +1709,7 @@ static void *nested_svm_map(struct vcpu_ { struct page *page; - down_read(¤t->mm->mmap_sem); page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); - if (is_error_page(page)) goto error; @@ -1617,6 +1933,22 @@ static int nested_svm_vmexit(struct vcpu nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; + + /* + * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have + * to make sure that we do not lose injected events. So check event_inj + * here and copy it to exit_int_info if it is valid. + * Exit_int_info and event_inj can't be both valid because the case + * below only happens on a VMRUN instruction intercept which has + * no valid exit_int_info set. + */ + if (vmcb->control.event_inj & SVM_EVTINJ_VALID) { + struct vmcb_control_area *nc = &nested_vmcb->control; + + nc->exit_int_info = vmcb->control.event_inj; + nc->exit_int_info_err = vmcb->control.event_inj_err; + } + nested_vmcb->control.tlb_ctl = 0; nested_vmcb->control.event_inj = 0; nested_vmcb->control.event_inj_err = 0; @@ -1650,8 +1982,9 @@ static int nested_svm_vmexit(struct vcpu svm->vmcb->save.cr3 = hsave->save.cr3; svm->vcpu.arch.cr3 = hsave->save.cr3; } else { - kvm_set_cr3(&svm->vcpu, hsave->save.cr3); + (void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3); } + mark_dirty(svm->vmcb, VMCB_CR); kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax); kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp); kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip); @@ -1662,6 +1995,8 @@ static int nested_svm_vmexit(struct vcpu /* Exit nested SVM mode */ svm->nested.vmcb = 0; + mark_all_dirty(svm->vmcb); + nested_svm_unmap(nested_vmcb, KM_USER0); kvm_mmu_reset_context(&svm->vcpu); @@ -1715,7 +2050,7 @@ static bool nested_svm_vmrun(struct vcpu hsave->save.gdtr = vmcb->save.gdtr; hsave->save.idtr = vmcb->save.idtr; hsave->save.efer = svm->vcpu.arch.shadow_efer; - hsave->save.cr0 = svm->vcpu.arch.cr0; + hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; hsave->save.rflags = vmcb->save.rflags; hsave->save.rip = svm->next_rip; @@ -1748,9 +2083,10 @@ static bool nested_svm_vmrun(struct vcpu svm->vmcb->save.cr3 = nested_vmcb->save.cr3; svm->vcpu.arch.cr3 = nested_vmcb->save.cr3; } else { - kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); + (void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3); kvm_mmu_reset_context(&svm->vcpu); } + mark_dirty(svm->vmcb, VMCB_CR); svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2; kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax); kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp); @@ -1788,7 +2124,7 @@ static bool nested_svm_vmrun(struct vcpu svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; svm->nested.intercept = nested_vmcb->control.intercept; - force_new_asid(&svm->vcpu); + svm_flush_tlb(&svm->vcpu); svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info; svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err; svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; @@ -1818,6 +2154,8 @@ static bool nested_svm_vmrun(struct vcpu enable_gif(svm); + mark_all_dirty(svm->vmcb); + return true; } @@ -1837,7 +2175,7 @@ static void nested_svm_vmloadsave(struct to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; } -static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int vmload_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; @@ -1857,7 +2195,7 @@ static int vmload_interception(struct vc return 1; } -static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int vmsave_interception(struct vcpu_svm *svm) { struct vmcb *nested_vmcb; @@ -1877,7 +2215,7 @@ static int vmsave_interception(struct vc return 1; } -static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int vmrun_interception(struct vcpu_svm *svm) { nsvm_printk("VMrun\n"); @@ -1907,7 +2245,7 @@ failed: return 1; } -static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int stgi_interception(struct vcpu_svm *svm) { if (nested_svm_check_permissions(svm)) return 1; @@ -1920,7 +2258,7 @@ static int stgi_interception(struct vcpu return 1; } -static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int clgi_interception(struct vcpu_svm *svm) { if (nested_svm_check_permissions(svm)) return 1; @@ -1933,11 +2271,12 @@ static int clgi_interception(struct vcpu /* After a CLGI no interrupts should come */ svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; + mark_dirty(svm->vmcb, VMCB_INTR); return 1; } -static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int invlpga_interception(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; nsvm_printk("INVLPGA\n"); @@ -1950,15 +2289,26 @@ static int invlpga_interception(struct v return 1; } -static int invalid_op_interception(struct vcpu_svm *svm, - struct kvm_run *kvm_run) +static int xsetbv_interception(struct vcpu_svm *svm) +{ + u64 new_bv = kvm_read_edx_eax(&svm->vcpu); + u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX); + + if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) { + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + } + + return 1; +} + +static int invalid_op_interception(struct vcpu_svm *svm) { kvm_queue_exception(&svm->vcpu, UD_VECTOR); return 1; } -static int task_switch_interception(struct vcpu_svm *svm, - struct kvm_run *kvm_run) +static int task_switch_interception(struct vcpu_svm *svm) { u16 tss_selector; int reason; @@ -1969,6 +2319,8 @@ static int task_switch_interception(stru svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; uint32_t idt_v = svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; + bool has_error_code = false; + u32 error_code = 0; tss_selector = (u16)svm->vmcb->control.exit_info_1; @@ -1989,6 +2341,12 @@ static int task_switch_interception(stru svm->vcpu.arch.nmi_injected = false; break; case SVM_EXITINTINFO_TYPE_EXEPT: + if (svm->vmcb->control.exit_info_2 & + (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) { + has_error_code = true; + error_code = + (u32)svm->vmcb->control.exit_info_2; + } kvm_clear_exception_queue(&svm->vcpu); break; case SVM_EXITINTINFO_TYPE_INTR: @@ -2005,50 +2363,175 @@ static int task_switch_interception(stru (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) skip_emulated_instruction(&svm->vcpu); - return kvm_task_switch(&svm->vcpu, tss_selector, reason); + if (int_type != SVM_EXITINTINFO_TYPE_SOFT) + int_vec = -1; + + return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, + has_error_code, error_code); } -static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int cpuid_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; kvm_emulate_cpuid(&svm->vcpu); return 1; } -static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int iret_interception(struct vcpu_svm *svm) { ++svm->vcpu.stat.nmi_window_exits; svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); svm->vcpu.arch.hflags |= HF_IRET_MASK; + svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu); return 1; } -static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int invlpg_interception(struct vcpu_svm *svm) { - if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) - pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); + if (svm_has(SVM_FEATURE_DECODE_ASSIST)) { + kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1); + skip_emulated_instruction(&svm->vcpu); + } else { + if (emulate_instruction(&svm->vcpu, 0) != EMULATE_DONE) + pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); + } return 1; } -static int emulate_on_interception(struct vcpu_svm *svm, - struct kvm_run *kvm_run) +static int emulate_on_interception(struct vcpu_svm *svm) { - if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) + if (emulate_instruction(&svm->vcpu, 0) != EMULATE_DONE) pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); return 1; } -static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int rdpmc_interception(struct vcpu_svm *svm) +{ + int err; + + if (!static_cpu_has(X86_FEATURE_NRIPS)) + return emulate_on_interception(svm); + + err = kvm_rdpmc(&svm->vcpu); + if (err) + kvm_inject_gp(&svm->vcpu, 0); + else + skip_emulated_instruction(&svm->vcpu); + + return 1; +} + +#define CR_VALID (1ULL << 63) + +static int cr_interception(struct vcpu_svm *svm) +{ + int reg, cr; + unsigned long val; + int err; + + if (!svm_has(SVM_FEATURE_DECODE_ASSIST)) + return emulate_on_interception(svm); + + if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) + return emulate_on_interception(svm); + + reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; + cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; + + err = 0; + if (cr >= 16) { /* mov to cr */ + cr -= 16; + val = kvm_register_read(&svm->vcpu, reg); + switch (cr) { + case 0: + err = kvm_set_cr0(&svm->vcpu, val); + break; + case 3: + err = kvm_set_cr3(&svm->vcpu, val); + break; + case 4: + err = kvm_set_cr4(&svm->vcpu, val); + break; + case 8: + err = kvm_set_cr8(&svm->vcpu, val); + break; + default: + WARN(1, "unhandled write to CR%d", cr); + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; + } + } else { /* mov from cr */ + switch (cr) { + case 0: + val = svm->vcpu.arch.cr0; + break; + case 2: + val = svm->vcpu.arch.cr2; + break; + case 3: + val = svm->vcpu.arch.cr3; + break; + case 4: + val = svm->vcpu.arch.cr4; + break; + case 8: + val = kvm_get_cr8(&svm->vcpu); + break; + default: + WARN(1, "unhandled read from CR%d", cr); + kvm_queue_exception(&svm->vcpu, UD_VECTOR); + return 1; + } + kvm_register_write(&svm->vcpu, reg, val); + } + if (err) + kvm_inject_gp(&svm->vcpu, 0); + else + skip_emulated_instruction(&svm->vcpu); + + return 1; +} + +static int dr_interception(struct vcpu_svm *svm) +{ + int reg, dr; + unsigned long val; + int err = 0; + + if (!svm_has(SVM_FEATURE_DECODE_ASSIST)) + return emulate_on_interception(svm); + + reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; + dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; + + if (dr >= 16) { /* mov to DRn */ + val = kvm_register_read(&svm->vcpu, reg); + svm_set_dr(&svm->vcpu, dr - 16, val, &err); + } else { + val = svm_get_dr(&svm->vcpu, dr); + kvm_register_write(&svm->vcpu, reg, val); + } + skip_emulated_instruction(&svm->vcpu); + + return 1; +} + +static int cr8_write_interception(struct vcpu_svm *svm) { + struct kvm_run *kvm_run = svm->vcpu.run; + int r; + u8 cr8_prev = kvm_get_cr8(&svm->vcpu); /* instruction emulation calls kvm_set_cr8() */ - emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); + r = cr_interception(svm); if (irqchip_in_kernel(svm->vcpu.kvm)) { svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; - return 1; + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + return r; } if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) - return 1; + return r; kvm_run->exit_reason = KVM_EXIT_SET_TPR; return 0; } @@ -2066,7 +2549,7 @@ static int svm_get_msr(struct kvm_vcpu * else tsc_offset = svm->vmcb->control.tsc_offset; - *data = tsc_offset + native_read_tsc(); + *data = tsc_offset + svm_scale_tsc(vcpu, native_read_tsc()); break; } case MSR_K6_STAR: @@ -2128,7 +2611,7 @@ static int svm_get_msr(struct kvm_vcpu * return 0; } -static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int rdmsr_interception(struct vcpu_svm *svm) { u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data; @@ -2146,25 +2629,16 @@ static int rdmsr_interception(struct vcp return 1; } -static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) +static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct vcpu_svm *svm = to_svm(vcpu); + u32 ecx = msr->index; + u64 data = msr->data; switch (ecx) { - case MSR_IA32_TSC: { - u64 tsc_offset = data - native_read_tsc(); - u64 g_tsc_offset = 0; - - if (is_nested(svm)) { - g_tsc_offset = svm->vmcb->control.tsc_offset - - svm->nested.hsave->control.tsc_offset; - svm->nested.hsave->control.tsc_offset = tsc_offset; - } - - svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset; - + case MSR_IA32_TSC: + kvm_write_tsc(vcpu, msr); break; - } case MSR_K6_STAR: svm->vmcb->save.star = data; break; @@ -2203,6 +2677,7 @@ static int svm_set_msr(struct kvm_vcpu * return 1; svm->vmcb->save.dbgctl = data; + mark_dirty(svm->vmcb, VMCB_LBR); if (data & (1ULL<<0)) svm_enable_lbrv(svm); else @@ -2216,40 +2691,47 @@ static int svm_set_msr(struct kvm_vcpu * pr_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; default: - return kvm_set_msr_common(vcpu, ecx, data); + return kvm_set_msr_common(vcpu, msr); } return 0; } -static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int wrmsr_interception(struct vcpu_svm *svm) { + struct msr_data msr; u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); trace_kvm_msr_write(ecx, data); + msr.data = data; + msr.index = ecx; + msr.host_initiated = false; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - if (svm_set_msr(&svm->vcpu, ecx, data)) + if (svm_set_msr(&svm->vcpu, &msr)) kvm_inject_gp(&svm->vcpu, 0); else skip_emulated_instruction(&svm->vcpu); return 1; } -static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +static int msr_interception(struct vcpu_svm *svm) { if (svm->vmcb->control.exit_info_1) - return wrmsr_interception(svm, kvm_run); + return wrmsr_interception(svm); else - return rdmsr_interception(svm, kvm_run); + return rdmsr_interception(svm); } -static int interrupt_window_interception(struct vcpu_svm *svm, - struct kvm_run *kvm_run) +static int interrupt_window_interception(struct vcpu_svm *svm) { + struct kvm_run *kvm_run = svm->vcpu.run; + svm_clear_vintr(svm); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; + mark_dirty(svm->vmcb, VMCB_INTR); /* * If the user space waits to inject interrupts, exit as soon as * possible @@ -2265,27 +2747,38 @@ static int interrupt_window_interception return 1; } -static int (*svm_exit_handlers[])(struct vcpu_svm *svm, - struct kvm_run *kvm_run) = { - [SVM_EXIT_READ_CR0] = emulate_on_interception, - [SVM_EXIT_READ_CR3] = emulate_on_interception, - [SVM_EXIT_READ_CR4] = emulate_on_interception, - [SVM_EXIT_READ_CR8] = emulate_on_interception, +static int pause_interception(struct vcpu_svm *svm) +{ + kvm_vcpu_on_spin(&(svm->vcpu)); + return 1; +} + +static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { + [SVM_EXIT_READ_CR0] = cr_interception, + [SVM_EXIT_READ_CR3] = cr_interception, + [SVM_EXIT_READ_CR4] = cr_interception, + [SVM_EXIT_READ_CR8] = cr_interception, /* for now: */ - [SVM_EXIT_WRITE_CR0] = emulate_on_interception, - [SVM_EXIT_WRITE_CR3] = emulate_on_interception, - [SVM_EXIT_WRITE_CR4] = emulate_on_interception, + [SVM_EXIT_WRITE_CR0] = cr_interception, + [SVM_EXIT_WRITE_CR3] = cr_interception, + [SVM_EXIT_WRITE_CR4] = cr_interception, [SVM_EXIT_WRITE_CR8] = cr8_write_interception, - [SVM_EXIT_READ_DR0] = emulate_on_interception, - [SVM_EXIT_READ_DR1] = emulate_on_interception, - [SVM_EXIT_READ_DR2] = emulate_on_interception, - [SVM_EXIT_READ_DR3] = emulate_on_interception, - [SVM_EXIT_WRITE_DR0] = emulate_on_interception, - [SVM_EXIT_WRITE_DR1] = emulate_on_interception, - [SVM_EXIT_WRITE_DR2] = emulate_on_interception, - [SVM_EXIT_WRITE_DR3] = emulate_on_interception, - [SVM_EXIT_WRITE_DR5] = emulate_on_interception, - [SVM_EXIT_WRITE_DR7] = emulate_on_interception, + [SVM_EXIT_READ_DR0] = dr_interception, + [SVM_EXIT_READ_DR1] = dr_interception, + [SVM_EXIT_READ_DR2] = dr_interception, + [SVM_EXIT_READ_DR3] = dr_interception, + [SVM_EXIT_READ_DR4] = dr_interception, + [SVM_EXIT_READ_DR5] = dr_interception, + [SVM_EXIT_READ_DR6] = dr_interception, + [SVM_EXIT_READ_DR7] = dr_interception, + [SVM_EXIT_WRITE_DR0] = dr_interception, + [SVM_EXIT_WRITE_DR1] = dr_interception, + [SVM_EXIT_WRITE_DR2] = dr_interception, + [SVM_EXIT_WRITE_DR3] = dr_interception, + [SVM_EXIT_WRITE_DR4] = dr_interception, + [SVM_EXIT_WRITE_DR5] = dr_interception, + [SVM_EXIT_WRITE_DR6] = dr_interception, + [SVM_EXIT_WRITE_DR7] = dr_interception, [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, @@ -2297,10 +2790,12 @@ static int (*svm_exit_handlers[])(struct [SVM_EXIT_SMI] = nop_on_interception, [SVM_EXIT_INIT] = nop_on_interception, [SVM_EXIT_VINTR] = interrupt_window_interception, + [SVM_EXIT_RDPMC] = rdpmc_interception, /* [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, */ [SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_IRET] = iret_interception, [SVM_EXIT_INVD] = emulate_on_interception, + [SVM_EXIT_PAUSE] = pause_interception, [SVM_EXIT_HLT] = halt_interception, [SVM_EXIT_INVLPG] = invlpg_interception, [SVM_EXIT_INVLPGA] = invlpga_interception, @@ -2318,15 +2813,24 @@ static int (*svm_exit_handlers[])(struct [SVM_EXIT_WBINVD] = emulate_on_interception, [SVM_EXIT_MONITOR] = invalid_op_interception, [SVM_EXIT_MWAIT] = invalid_op_interception, + [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_NPF] = pf_interception, }; -static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +static int handle_exit(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; - trace_kvm_exit(exit_code, svm->vmcb->save.rip); + trace_kvm_exit(exit_code, svm->vmcb->save.rip, KVM_ISA_SVM); + + if (unlikely(svm->nested.exit_required)) { + nested_svm_vmexit(svm); + svm->nested.exit_required = false; + + return 1; + } if (is_nested(svm)) { int vmexit; @@ -2348,7 +2852,8 @@ static int handle_exit(struct kvm_run *k if (npt_enabled) { int mmu_reload = 0; - if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { + if ((kvm_read_cr0_bits(vcpu, X86_CR0_PG) ^ svm->vmcb->save.cr0) + & X86_CR0_PG) { svm_set_cr0(vcpu, svm->vmcb->save.cr0); mmu_reload = 1; } @@ -2383,7 +2888,7 @@ static int handle_exit(struct kvm_run *k return 0; } - return svm_exit_handlers[exit_code](svm, kvm_run); + return svm_exit_handlers[exit_code](svm); } static void reload_tss(struct kvm_vcpu *vcpu) @@ -2401,7 +2906,6 @@ static void pre_svm_run(struct vcpu_svm struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); - svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; /* FIXME: handle wraparound of asid_generation */ if (svm->asid_generation != svm_data->asid_generation) new_asid(svm, svm_data); @@ -2414,6 +2918,7 @@ static void svm_inject_nmi(struct kvm_vc svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; vcpu->arch.hflags |= HF_NMI_MASK; svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); ++vcpu->stat.nmi_injections; } @@ -2421,14 +2926,12 @@ static inline void svm_inject_irq(struct { struct vmcb_control_area *control; - trace_kvm_inj_virq(irq); - - ++svm->vcpu.stat.irq_injections; control = &svm->vmcb->control; control->int_vector = irq; control->int_ctl &= ~V_INTR_PRIO_MASK; control->int_ctl |= V_IRQ_MASK | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); + mark_dirty(svm->vmcb, VMCB_INTR); } static void svm_set_irq(struct kvm_vcpu *vcpu) @@ -2437,6 +2940,9 @@ static void svm_set_irq(struct kvm_vcpu BUG_ON(!(gif_set(svm))); + trace_kvm_inj_virq(vcpu->arch.interrupt.nr); + ++vcpu->stat.irq_injections; + svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR; } @@ -2448,8 +2954,10 @@ static void update_cr8_intercept(struct if (irr == -1) return; - if (tpr >= irr) + if (tpr >= irr) { svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + } } static int svm_nmi_allowed(struct kvm_vcpu *vcpu) @@ -2460,6 +2968,27 @@ static int svm_nmi_allowed(struct kvm_vc !(svm->vcpu.arch.hflags & HF_NMI_MASK); } +static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + return !!(svm->vcpu.arch.hflags & HF_NMI_MASK); +} + +static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (masked) { + svm->vcpu.arch.hflags |= HF_NMI_MASK; + svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); + } else { + svm->vcpu.arch.hflags &= ~HF_NMI_MASK; + svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); + } + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); +} + static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -2510,11 +3039,18 @@ static int svm_set_tss_addr(struct kvm * static void svm_flush_tlb(struct kvm_vcpu *vcpu) { - force_new_asid(vcpu); + struct vcpu_svm *svm = to_svm(vcpu); + + if (svm_has(SVM_FEATURE_FLUSH_ASID)) + svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; + else + svm->asid_generation--; } static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) { + if (npt_enabled) + vcpu->fpu_active = 1; } static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) @@ -2535,6 +3071,7 @@ static inline void sync_lapic_to_cr8(str cr8 = kvm_get_cr8(vcpu); svm->vmcb->control.int_ctl &= ~V_TPR_MASK; svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; + mark_dirty(svm->vmcb, VMCB_INTR); } static void svm_complete_interrupts(struct vcpu_svm *svm) @@ -2543,7 +3080,12 @@ static void svm_complete_interrupts(stru int type; u32 exitintinfo = svm->vmcb->control.exit_int_info; - if (svm->vcpu.arch.hflags & HF_IRET_MASK) + /* + * If we've made progress since setting HF_IRET_MASK, we've + * executed an IRET and can allow NMI injection. + */ + if ((svm->vcpu.arch.hflags & HF_IRET_MASK) + && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); svm->vcpu.arch.nmi_injected = false; @@ -2588,13 +3130,20 @@ static void svm_complete_interrupts(stru #define R "e" #endif -static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void svm_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u16 fs_selector; u16 gs_selector; u16 ldt_selector; + /* + * A vmexit emulation is required before the vcpu can be executed + * again. + */ + if (unlikely(svm->nested.exit_required)) + return; + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; @@ -2604,13 +3153,15 @@ static void svm_vcpu_run(struct kvm_vcpu sync_lapic_to_cr8(vcpu); save_host_msrs(vcpu); - fs_selector = kvm_read_fs(); - gs_selector = kvm_read_gs(); + savesegment(fs, fs_selector); + savesegment(gs, gs_selector); ldt_selector = kvm_read_ldt(); svm->vmcb->save.cr2 = vcpu->arch.cr2; /* required for live migration with NPT */ - if (npt_enabled) + if (npt_enabled) { svm->vmcb->save.cr3 = vcpu->arch.cr3; + mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR); + } clgi(); @@ -2692,25 +3243,50 @@ static void svm_vcpu_run(struct kvm_vcpu vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; - kvm_load_fs(fs_selector); - kvm_load_gs(gs_selector); - kvm_load_ldt(ldt_selector); load_host_msrs(vcpu); + loadsegment(fs, fs_selector); +#ifdef CONFIG_X86_64 + load_gs_index(gs_selector); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); +#else + loadsegment(gs, gs_selector); +#endif + kvm_load_ldt(ldt_selector); reload_tss(vcpu); local_irq_disable(); + if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) + kvm_before_handle_nmi(&svm->vcpu); + stgi(); + /* Any pending NMI will happen here */ + + if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) + kvm_after_handle_nmi(&svm->vcpu); + sync_cr8_to_lapic(vcpu); svm->next_rip = 0; + svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; + if (npt_enabled) { vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); } + + /* + * We need to handle MC intercepts here before the vcpu has a chance to + * change the physical cpu + */ + if (unlikely(svm->vmcb->control.exit_code == + SVM_EXIT_EXCP_BASE + MC_VECTOR)) + svm_handle_mce(svm); + + mark_all_clean(svm->vmcb); } #undef R @@ -2721,18 +3297,14 @@ static void svm_set_cr3(struct kvm_vcpu if (npt_enabled) { svm->vmcb->control.nested_cr3 = root; - force_new_asid(vcpu); + mark_dirty(svm->vmcb, VMCB_NPT); + svm_flush_tlb(vcpu); return; } svm->vmcb->save.cr3 = root; - force_new_asid(vcpu); - - if (vcpu->fpu_active) { - svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR); - svm->vmcb->save.cr0 |= X86_CR0_TS; - vcpu->fpu_active = 0; - } + mark_dirty(svm->vmcb, VMCB_CR); + svm_flush_tlb(vcpu); } static int is_disabled(void) @@ -2781,62 +3353,38 @@ static u64 svm_get_mt_mask(struct kvm_vc return 0; } -static const struct trace_print_flags svm_exit_reasons_str[] = { - { SVM_EXIT_READ_CR0, "read_cr0" }, - { SVM_EXIT_READ_CR3, "read_cr3" }, - { SVM_EXIT_READ_CR4, "read_cr4" }, - { SVM_EXIT_READ_CR8, "read_cr8" }, - { SVM_EXIT_WRITE_CR0, "write_cr0" }, - { SVM_EXIT_WRITE_CR3, "write_cr3" }, - { SVM_EXIT_WRITE_CR4, "write_cr4" }, - { SVM_EXIT_WRITE_CR8, "write_cr8" }, - { SVM_EXIT_READ_DR0, "read_dr0" }, - { SVM_EXIT_READ_DR1, "read_dr1" }, - { SVM_EXIT_READ_DR2, "read_dr2" }, - { SVM_EXIT_READ_DR3, "read_dr3" }, - { SVM_EXIT_WRITE_DR0, "write_dr0" }, - { SVM_EXIT_WRITE_DR1, "write_dr1" }, - { SVM_EXIT_WRITE_DR2, "write_dr2" }, - { SVM_EXIT_WRITE_DR3, "write_dr3" }, - { SVM_EXIT_WRITE_DR5, "write_dr5" }, - { SVM_EXIT_WRITE_DR7, "write_dr7" }, - { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, - { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, - { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, - { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, - { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, - { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, - { SVM_EXIT_INTR, "interrupt" }, - { SVM_EXIT_NMI, "nmi" }, - { SVM_EXIT_SMI, "smi" }, - { SVM_EXIT_INIT, "init" }, - { SVM_EXIT_VINTR, "vintr" }, - { SVM_EXIT_CPUID, "cpuid" }, - { SVM_EXIT_INVD, "invd" }, - { SVM_EXIT_HLT, "hlt" }, - { SVM_EXIT_INVLPG, "invlpg" }, - { SVM_EXIT_INVLPGA, "invlpga" }, - { SVM_EXIT_IOIO, "io" }, - { SVM_EXIT_MSR, "msr" }, - { SVM_EXIT_TASK_SWITCH, "task_switch" }, - { SVM_EXIT_SHUTDOWN, "shutdown" }, - { SVM_EXIT_VMRUN, "vmrun" }, - { SVM_EXIT_VMMCALL, "hypercall" }, - { SVM_EXIT_VMLOAD, "vmload" }, - { SVM_EXIT_VMSAVE, "vmsave" }, - { SVM_EXIT_STGI, "stgi" }, - { SVM_EXIT_CLGI, "clgi" }, - { SVM_EXIT_SKINIT, "skinit" }, - { SVM_EXIT_WBINVD, "wbinvd" }, - { SVM_EXIT_MONITOR, "monitor" }, - { SVM_EXIT_MWAIT, "mwait" }, - { SVM_EXIT_NPF, "npf" }, - { -1, NULL } -}; +static void svm_cpuid_update(struct kvm_vcpu *vcpu) +{ +} -static bool svm_gb_page_enable(void) +static bool svm_invpcid_supported(void) +{ + return false; +} + +static int svm_get_lpage_level(void) +{ + return PT_PDPE_LEVEL; +} + +static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + + if (npt_enabled) { + /* hack: npt requires active fpu at this time */ + vcpu->fpu_active = 1; + return; + } + + svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; + svm->vmcb->save.cr0 |= X86_CR0_TS; + mark_dirty(svm->vmcb, VMCB_INTERCEPTS); + mark_dirty(svm->vmcb, VMCB_CR); +} + +static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) { - return true; } static struct kvm_x86_ops svm_x86_ops = { @@ -2865,6 +3413,7 @@ static struct kvm_x86_ops svm_x86_ops = .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, + .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr3 = svm_set_cr3, @@ -2879,6 +3428,7 @@ static struct kvm_x86_ops svm_x86_ops = .cache_reg = svm_cache_reg, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, + .fpu_deactivate = svm_fpu_deactivate, .tlb_flush = svm_flush_tlb, @@ -2893,6 +3443,8 @@ static struct kvm_x86_ops svm_x86_ops = .queue_exception = svm_queue_exception, .interrupt_allowed = svm_interrupt_allowed, .nmi_allowed = svm_nmi_allowed, + .get_nmi_mask = svm_get_nmi_mask, + .set_nmi_mask = svm_set_nmi_mask, .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, @@ -2901,14 +3453,23 @@ static struct kvm_x86_ops svm_x86_ops = .get_tdp_level = get_npt_level, .get_mt_mask = svm_get_mt_mask, - .exit_reasons_str = svm_exit_reasons_str, - .gb_page_enable = svm_gb_page_enable, + .get_lpage_level = svm_get_lpage_level, + + .cpuid_update = svm_cpuid_update, + .invpcid_supported = svm_invpcid_supported, + + .set_tsc_khz = svm_set_tsc_khz, + .write_tsc_offset = svm_write_tsc_offset, + .adjust_tsc_offset = svm_adjust_tsc_offset, + .compute_tsc_offset = svm_compute_tsc_offset, + + .sched_in = svm_sched_in, }; static int __init svm_init(void) { return kvm_init(&svm_x86_ops, sizeof(struct vcpu_svm), - THIS_MODULE); + __alignof__(struct vcpu_svm), THIS_MODULE); } static void __exit svm_exit(void) diff -uprN linux-2.6.32/arch/x86/kvm/trace.h linux-2.6.32.ovz/arch/x86/kvm/trace.h --- linux-2.6.32/arch/x86/kvm/trace.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/trace.h 2015-03-10 13:24:14.000000000 -0700 @@ -2,6 +2,8 @@ #define _TRACE_KVM_H #include +#include +#include #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm @@ -148,26 +150,32 @@ TRACE_EVENT(kvm_apic, #define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) #define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) +#define KVM_ISA_VMX 1 +#define KVM_ISA_SVM 2 + /* * Tracepoint for kvm guest exit: */ TRACE_EVENT(kvm_exit, - TP_PROTO(unsigned int exit_reason, unsigned long guest_rip), - TP_ARGS(exit_reason, guest_rip), + TP_PROTO(unsigned int exit_reason, unsigned long guest_rip, u32 isa), + TP_ARGS(exit_reason, guest_rip, isa), TP_STRUCT__entry( __field( unsigned int, exit_reason ) __field( unsigned long, guest_rip ) + __field( u32, isa ) ), TP_fast_assign( __entry->exit_reason = exit_reason; __entry->guest_rip = guest_rip; + __entry->isa = isa; ), TP_printk("reason %s rip 0x%lx", - ftrace_print_symbols_seq(p, __entry->exit_reason, - kvm_x86_ops->exit_reasons_str), + (__entry->isa == KVM_ISA_VMX) ? + __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) : + __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS), __entry->guest_rip) ); @@ -189,6 +197,38 @@ TRACE_EVENT(kvm_inj_virq, TP_printk("irq %u", __entry->irq) ); +#define EXS(x) { x##_VECTOR, "#" #x } + +#define kvm_trace_sym_exc \ + EXS(DE), EXS(DB), EXS(BP), EXS(OF), EXS(BR), EXS(UD), EXS(NM), \ + EXS(DF), EXS(TS), EXS(NP), EXS(SS), EXS(GP), EXS(PF), \ + EXS(MF), EXS(MC) + +/* + * Tracepoint for kvm interrupt injection: + */ +TRACE_EVENT(kvm_inj_exception, + TP_PROTO(unsigned exception, bool has_error, unsigned error_code), + TP_ARGS(exception, has_error, error_code), + + TP_STRUCT__entry( + __field( u8, exception ) + __field( u8, has_error ) + __field( u32, error_code ) + ), + + TP_fast_assign( + __entry->exception = exception; + __entry->has_error = has_error; + __entry->error_code = error_code; + ), + + TP_printk("%s (0x%x)", + __print_symbolic(__entry->exception, kvm_trace_sym_exc), + /* FIXME: don't print error_code if not present */ + __entry->has_error ? __entry->error_code : 0) +); + /* * Tracepoint for page fault. */ @@ -349,6 +389,91 @@ TRACE_EVENT(kvm_apic_accept_irq, __entry->coalesced ? " (coalesced)" : "") ); +TRACE_EVENT(kvm_eoi, + TP_PROTO(struct kvm_lapic *apic, int vector), + TP_ARGS(apic, vector), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( int, vector ) + ), + + TP_fast_assign( + __entry->apicid = apic->vcpu->vcpu_id; + __entry->vector = vector; + ), + + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) +); + +TRACE_EVENT(kvm_pv_eoi, + TP_PROTO(struct kvm_lapic *apic, int vector), + TP_ARGS(apic, vector), + + TP_STRUCT__entry( + __field( __u32, apicid ) + __field( int, vector ) + ), + + TP_fast_assign( + __entry->apicid = apic->vcpu->vcpu_id; + __entry->vector = vector; + ), + + TP_printk("apicid %x vector %d", __entry->apicid, __entry->vector) +); + +TRACE_EVENT(kvm_write_tsc_offset, + TP_PROTO(unsigned int vcpu_id, __u64 previous_tsc_offset, + __u64 next_tsc_offset), + TP_ARGS(vcpu_id, previous_tsc_offset, next_tsc_offset), + + TP_STRUCT__entry( + __field( unsigned int, vcpu_id ) + __field( __u64, previous_tsc_offset ) + __field( __u64, next_tsc_offset ) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->previous_tsc_offset = previous_tsc_offset; + __entry->next_tsc_offset = next_tsc_offset; + ), + + TP_printk("vcpu=%u prev=%llu next=%llu", __entry->vcpu_id, + __entry->previous_tsc_offset, __entry->next_tsc_offset) +); + +TRACE_EVENT(kvm_ple_window, + TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old), + TP_ARGS(grow, vcpu_id, new, old), + + TP_STRUCT__entry( + __field( bool, grow ) + __field( unsigned int, vcpu_id ) + __field( int, new ) + __field( int, old ) + ), + + TP_fast_assign( + __entry->grow = grow; + __entry->vcpu_id = vcpu_id; + __entry->new = new; + __entry->old = old; + ), + + TP_printk("vcpu %u: ple_window %d (%s %d)", + __entry->vcpu_id, + __entry->new, + __entry->grow ? "grow" : "shrink", + __entry->old) +); + +#define trace_kvm_ple_window_grow(vcpu_id, new, old) \ + trace_kvm_ple_window(true, vcpu_id, new, old) +#define trace_kvm_ple_window_shrink(vcpu_id, new, old) \ + trace_kvm_ple_window(false, vcpu_id, new, old) + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff -uprN linux-2.6.32/arch/x86/kvm/vmx.c linux-2.6.32.ovz/arch/x86/kvm/vmx.c --- linux-2.6.32/arch/x86/kvm/vmx.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/vmx.c 2015-03-10 13:24:15.000000000 -0700 @@ -26,6 +26,7 @@ #include #include #include +#include #include "kvm_cache_regs.h" #include "x86.h" @@ -34,6 +35,10 @@ #include #include #include +#include +#include +#include +#include #include "trace.h" @@ -58,15 +63,68 @@ static int __read_mostly enable_unrestri module_param_named(unrestricted_guest, enable_unrestricted_guest, bool, S_IRUGO); +static bool __read_mostly enable_ept_ad_bits = 1; +module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); + static int __read_mostly emulate_invalid_guest_state = 0; module_param(emulate_invalid_guest_state, bool, S_IRUGO); +static int __read_mostly yield_on_hlt = 1; +module_param(yield_on_hlt, bool, S_IRUGO); + +static int __read_mostly vmm_exclusive = 1; +module_param(vmm_exclusive, bool, S_IRUGO); + +/* + * These 2 parameters are used to config the controls for Pause-Loop Exiting: + * ple_gap: upper bound on the amount of time between two successive + * executions of PAUSE in a loop. Also indicate if ple enabled. + * According to test, this time is usually smaller than 128 cycles. + * ple_window: upper bound on the amount of time a guest is allowed to execute + * in a PAUSE loop. Tests indicate that most spinlocks are held for + * less than 2^12 cycles + * Time is measured based on a counter that runs at the same rate as the TSC, + * refer SDM volume 3b section 21.6.13 & 22.1.3. + */ +#define KVM_VMX_DEFAULT_PLE_GAP 128 +#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 +#define KVM_VMX_DEFAULT_PLE_WINDOW_GROW 2 +#define KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK 0 +#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \ + INT_MAX / KVM_VMX_DEFAULT_PLE_WINDOW_GROW +static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; +module_param(ple_gap, int, S_IRUGO); + +static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW; +module_param(ple_window, int, S_IRUGO); + +/* Default doubles per-vcpu window every exit. */ +static int ple_window_grow = KVM_VMX_DEFAULT_PLE_WINDOW_GROW; +module_param(ple_window_grow, int, S_IRUGO); + +/* Default resets per-vcpu window every exit to ple_window. */ +static int ple_window_shrink = KVM_VMX_DEFAULT_PLE_WINDOW_SHRINK; +module_param(ple_window_shrink, int, S_IRUGO); + +/* Default is to compute the maximum so we can never overflow. */ +static int ple_window_actual_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; +static int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; +module_param(ple_window_max, int, S_IRUGO); + +#define NR_AUTOLOAD_MSRS 8 + struct vmcs { u32 revision_id; u32 abort; char data[0]; }; +struct shared_msr_entry { + unsigned index; + u64 data; + u64 mask; +}; + struct vcpu_vmx { struct kvm_vcpu vcpu; struct list_head local_vcpus_link; @@ -74,25 +132,30 @@ struct vcpu_vmx { int launched; u8 fail; u32 idt_vectoring_info; - struct kvm_msr_entry *guest_msrs; - struct kvm_msr_entry *host_msrs; + struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; int msr_offset_efer; #ifdef CONFIG_X86_64 - int msr_offset_kernel_gs_base; + u64 msr_host_kernel_gs_base; + u64 msr_guest_kernel_gs_base; #endif struct vmcs *vmcs; + struct msr_autoload { + unsigned nr; + struct vmx_msr_entry guest[NR_AUTOLOAD_MSRS]; + struct vmx_msr_entry host[NR_AUTOLOAD_MSRS]; + } msr_autoload; struct { int loaded; u16 fs_sel, gs_sel, ldt_sel; int gs_ldt_reload_needed; int fs_reload_needed; - int guest_efer_loaded; + unsigned long vmcs_host_cr4; /* May not match real cr4 */ } host_state; struct { int vm86_active; - u8 save_iopl; + ulong save_rflags; struct kvm_save_segment { u16 selector; unsigned long base; @@ -114,6 +177,10 @@ struct vcpu_vmx { ktime_t entry_time; s64 vnmi_blocked_time; u32 exit_reason; + + /* Dynamic PLE window. */ + int ple_window; + bool ple_window_dirty; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) @@ -121,8 +188,14 @@ static inline struct vcpu_vmx *to_vmx(st return container_of(vcpu, struct vcpu_vmx, vcpu); } -static int init_rmode(struct kvm *kvm); static u64 construct_eptp(unsigned long root_hpa); +static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); +static void kvm_cpu_vmxon(u64 addr); +static void kvm_cpu_vmxoff(void); +static void vmx_set_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); +static void vmx_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -133,6 +206,8 @@ static unsigned long *vmx_io_bitmap_b; static unsigned long *vmx_msr_bitmap_legacy; static unsigned long *vmx_msr_bitmap_longmode; +static bool cpu_has_load_perf_global_ctrl; + static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); static DEFINE_SPINLOCK(vmx_vpid_lock); @@ -176,36 +251,24 @@ static struct kvm_vmx_segment_field { VMX_SEGMENT_FIELD(LDTR), }; +static u64 host_efer; + static void ept_save_pdptrs(struct kvm_vcpu *vcpu); +#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM)) + /* * Keep MSR_K6_STAR at the end, as setup_msrs() will try to optimize it * away by decrementing the array size. */ static const u32 vmx_msr_index[] = { #ifdef CONFIG_X86_64 - MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, + MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, #endif MSR_EFER, MSR_K6_STAR, }; #define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) -static void load_msrs(struct kvm_msr_entry *e, int n) -{ - int i; - - for (i = 0; i < n; ++i) - wrmsrl(e[i].index, e[i].data); -} - -static void save_msrs(struct kvm_msr_entry *e, int n) -{ - int i; - - for (i = 0; i < n; ++i) - rdmsrl(e[i].index, e[i].data); -} - static inline int is_page_fault(u32 intr_info) { return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | @@ -293,6 +356,16 @@ static inline bool cpu_has_vmx_ept_2m_pa return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); } +static inline bool cpu_has_vmx_ept_ad_bits(void) +{ + return vmx_capability.ept & VMX_EPT_AD_BIT; +} + +static inline bool cpu_has_vmx_ept_1g_page(void) +{ + return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT); +} + static inline int cpu_has_vmx_invept_individual_addr(void) { return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); @@ -320,6 +393,12 @@ static inline int cpu_has_vmx_unrestrict SECONDARY_EXEC_UNRESTRICTED_GUEST; } +static inline int cpu_has_vmx_ple(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_PAUSE_LOOP_EXITING; +} + static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) { return flexpriority_enabled && @@ -333,6 +412,17 @@ static inline int cpu_has_vmx_vpid(void) SECONDARY_EXEC_ENABLE_VPID; } +static inline bool cpu_has_vmx_invpcid(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_ENABLE_INVPCID; +} + +static bool vmx_invpcid_supported(void) +{ + return cpu_has_vmx_invpcid() && enable_ept; +} + static inline int cpu_has_virtual_nmis(void) { return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; @@ -348,7 +438,7 @@ static int __find_msr_index(struct vcpu_ int i; for (i = 0; i < vmx->nmsrs; ++i) - if (vmx->guest_msrs[i].index == msr) + if (vmx_msr_index[vmx->guest_msrs[i].index] == msr) return i; return -1; } @@ -379,7 +469,7 @@ static inline void __invept(int ext, u64 : : "a" (&operand), "c" (ext) : "cc", "memory"); } -static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) +static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -402,6 +492,59 @@ static void vmcs_clear(struct vmcs *vmcs vmcs, phys_addr); } +static void vmcs_load(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(vmcs); + u8 error; + + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" + : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) + : "cc", "memory"); + if (error) + printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", + vmcs, phys_addr); +} + +#ifdef CONFIG_KEXEC +/* + * This bitmap is used to indicate whether the vmclear + * operation is enabled on all cpus. All disabled by + * default. + */ +static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE; + +static inline void crash_enable_local_vmclear(int cpu) +{ + cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static inline void crash_disable_local_vmclear(int cpu) +{ + cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static inline int crash_local_vmclear_enabled(int cpu) +{ + return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap); +} + +static void crash_vmclear_local_loaded_vmcss(void) +{ + int cpu = raw_smp_processor_id(); + struct vcpu_vmx *vmx; + + if (!crash_local_vmclear_enabled(cpu)) + return; + + list_for_each_entry(vmx, &per_cpu(vcpus_on_cpu, cpu), + local_vcpus_link) + vmcs_clear(vmx->vmcs); +} +#else +static inline void crash_enable_local_vmclear(int cpu) { } +static inline void crash_disable_local_vmclear(int cpu) { } +#endif /* CONFIG_KEXEC */ + static void __vcpu_clear(void *arg) { struct vcpu_vmx *vmx = arg; @@ -411,10 +554,11 @@ static void __vcpu_clear(void *arg) vmcs_clear(vmx->vmcs); if (per_cpu(current_vmcs, cpu) == vmx->vmcs) per_cpu(current_vmcs, cpu) = NULL; - rdtscll(vmx->vcpu.arch.host_tsc); + crash_disable_local_vmclear(cpu); list_del(&vmx->local_vcpus_link); vmx->vcpu.cpu = -1; vmx->launched = 0; + crash_enable_local_vmclear(cpu); } static void vcpu_clear(struct vcpu_vmx *vmx) @@ -537,9 +681,8 @@ static void update_exception_bitmap(stru { u32 eb; - eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); - if (!vcpu->fpu_active) - eb |= 1u << NM_VECTOR; + eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) + | (1u << NM_VECTOR); /* * Unconditionally intercept #DB so we can maintain dr6 without * reading it every exit. @@ -553,9 +696,97 @@ static void update_exception_bitmap(stru eb = ~0; if (enable_ept) eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ + if (vcpu->fpu_active) + eb &= ~(1u << NM_VECTOR); vmcs_write32(EXCEPTION_BITMAP, eb); } +static void clear_atomic_switch_msr_special(unsigned long entry, + unsigned long exit) +{ + vmcs_clear_bits(VM_ENTRY_CONTROLS, entry); + vmcs_clear_bits(VM_EXIT_CONTROLS, exit); +} + +static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr) +{ + unsigned i; + struct msr_autoload *m = &vmx->msr_autoload; + + switch (msr) { + case MSR_CORE_PERF_GLOBAL_CTRL: + if (cpu_has_load_perf_global_ctrl) { + clear_atomic_switch_msr_special( + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); + return; + } + break; + } + + for (i = 0; i < m->nr; ++i) + if (m->guest[i].index == msr) + break; + + if (i == m->nr) + return; + --m->nr; + m->guest[i] = m->guest[m->nr]; + m->host[i] = m->host[m->nr]; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); +} + +static void add_atomic_switch_msr_special(unsigned long entry, + unsigned long exit, unsigned long guest_val_vmcs, + unsigned long host_val_vmcs, u64 guest_val, u64 host_val) +{ + vmcs_write64(guest_val_vmcs, guest_val); + vmcs_write64(host_val_vmcs, host_val); + vmcs_set_bits(VM_ENTRY_CONTROLS, entry); + vmcs_set_bits(VM_EXIT_CONTROLS, exit); +} + +static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr, + u64 guest_val, u64 host_val) +{ + unsigned i; + struct msr_autoload *m = &vmx->msr_autoload; + + switch (msr) { + case MSR_CORE_PERF_GLOBAL_CTRL: + if (cpu_has_load_perf_global_ctrl) { + add_atomic_switch_msr_special( + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL, + GUEST_IA32_PERF_GLOBAL_CTRL, + HOST_IA32_PERF_GLOBAL_CTRL, + guest_val, host_val); + return; + } + break; + } + + for (i = 0; i < m->nr; ++i) + if (m->guest[i].index == msr) + break; + + if (i == NR_AUTOLOAD_MSRS) { + printk_once(KERN_WARNING"Not enough mst switch entries. " + "Can't add msr %x\n", msr); + return; + } else if (i == m->nr) { + ++m->nr; + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->nr); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->nr); + } + + m->guest[i].index = msr; + m->guest[i].value = guest_val; + m->host[i].index = msr; + m->host[i].value = host_val; +} + static void reload_tss(void) { /* @@ -570,17 +801,15 @@ static void reload_tss(void) load_TR_desc(); } -static void load_transition_efer(struct vcpu_vmx *vmx) +static bool update_transition_efer(struct vcpu_vmx *vmx) { int efer_offset = vmx->msr_offset_efer; - u64 host_efer; u64 guest_efer; u64 ignore_bits; if (efer_offset < 0) - return; - host_efer = vmx->host_msrs[efer_offset].data; - guest_efer = vmx->guest_msrs[efer_offset].data; + return false; + guest_efer = vmx->vcpu.arch.shadow_efer; /* * NX is emulated; LMA and LME handled by hardware; SCE meaninless @@ -593,27 +822,17 @@ static void load_transition_efer(struct if (guest_efer & EFER_LMA) ignore_bits &= ~(u64)EFER_SCE; #endif - if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits)) - return; - - vmx->host_state.guest_efer_loaded = 1; guest_efer &= ~ignore_bits; guest_efer |= host_efer & ignore_bits; - wrmsrl(MSR_EFER, guest_efer); - vmx->vcpu.stat.efer_reload++; -} - -static void reload_host_efer(struct vcpu_vmx *vmx) -{ - if (vmx->host_state.guest_efer_loaded) { - vmx->host_state.guest_efer_loaded = 0; - load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1); - } + vmx->guest_msrs[efer_offset].data = guest_efer; + vmx->guest_msrs[efer_offset].mask = ~ignore_bits; + return true; } static void vmx_save_host_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + int i; if (vmx->host_state.loaded) return; @@ -625,7 +844,7 @@ static void vmx_save_host_state(struct k */ vmx->host_state.ldt_sel = kvm_read_ldt(); vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; - vmx->host_state.fs_sel = kvm_read_fs(); + savesegment(fs, vmx->host_state.fs_sel); if (!(vmx->host_state.fs_sel & 7)) { vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); vmx->host_state.fs_reload_needed = 0; @@ -633,7 +852,7 @@ static void vmx_save_host_state(struct k vmcs_write16(HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } - vmx->host_state.gs_sel = kvm_read_gs(); + savesegment(gs, vmx->host_state.gs_sel); if (!(vmx->host_state.gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { @@ -650,43 +869,42 @@ static void vmx_save_host_state(struct k #endif #ifdef CONFIG_X86_64 - if (is_long_mode(&vmx->vcpu)) - save_msrs(vmx->host_msrs + - vmx->msr_offset_kernel_gs_base, 1); - + if (is_long_mode(&vmx->vcpu)) { + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + } #endif - load_msrs(vmx->guest_msrs, vmx->save_nmsrs); - load_transition_efer(vmx); + for (i = 0; i < vmx->save_nmsrs; ++i) + kvm_set_shared_msr(vmx->guest_msrs[i].index, + vmx->guest_msrs[i].data, + vmx->guest_msrs[i].mask); } static void __vmx_load_host_state(struct vcpu_vmx *vmx) { - unsigned long flags; - if (!vmx->host_state.loaded) return; ++vmx->vcpu.stat.host_state_reload; vmx->host_state.loaded = 0; if (vmx->host_state.fs_reload_needed) - kvm_load_fs(vmx->host_state.fs_sel); + loadsegment(fs, vmx->host_state.fs_sel); if (vmx->host_state.gs_ldt_reload_needed) { kvm_load_ldt(vmx->host_state.ldt_sel); - /* - * If we have to reload gs, we must take care to - * preserve our gs base. - */ - local_irq_save(flags); - kvm_load_gs(vmx->host_state.gs_sel); #ifdef CONFIG_X86_64 - wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); + load_gs_index(vmx->host_state.gs_sel); + wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs); +#else + loadsegment(gs, vmx->host_state.gs_sel); #endif - local_irq_restore(flags); } reload_tss(); - save_msrs(vmx->guest_msrs, vmx->save_nmsrs); - load_msrs(vmx->host_msrs, vmx->save_nmsrs); - reload_host_efer(vmx); +#ifdef CONFIG_X86_64 + if (is_long_mode(&vmx->vcpu)) { + rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); + wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); + } +#endif } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -703,36 +921,30 @@ static void vmx_load_host_state(struct v static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 phys_addr = __pa(vmx->vmcs); - u64 tsc_this, delta, new_offset; + u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - if (vcpu->cpu != cpu) { + if (!vmm_exclusive) + kvm_cpu_vmxon(phys_addr); + else if (vcpu->cpu != cpu) vcpu_clear(vmx); - kvm_migrate_timers(vcpu); - set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); - local_irq_disable(); - list_add(&vmx->local_vcpus_link, - &per_cpu(vcpus_on_cpu, cpu)); - local_irq_enable(); - } if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { - u8 error; - per_cpu(current_vmcs, cpu) = vmx->vmcs; - asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" - : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) - : "cc"); - if (error) - printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", - vmx->vmcs, phys_addr); + vmcs_load(vmx->vmcs); } if (vcpu->cpu != cpu) { struct descriptor_table dt; unsigned long sysenter_esp; - vcpu->cpu = cpu; + set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests); + local_irq_disable(); + crash_disable_local_vmclear(cpu); + list_add(&vmx->local_vcpus_link, + &per_cpu(vcpus_on_cpu, cpu)); + crash_enable_local_vmclear(cpu); + local_irq_enable(); + /* * Linux uses per-cpu TSS and GDT, so set these when switching * processors. @@ -743,22 +955,16 @@ static void vmx_vcpu_load(struct kvm_vcp rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ - - /* - * Make sure the time stamp counter is monotonous. - */ - rdtscll(tsc_this); - if (tsc_this < vcpu->arch.host_tsc) { - delta = vcpu->arch.host_tsc - tsc_this; - new_offset = vmcs_read64(TSC_OFFSET) + delta; - vmcs_write64(TSC_OFFSET, new_offset); - } } } static void vmx_vcpu_put(struct kvm_vcpu *vcpu) { __vmx_load_host_state(to_vmx(vcpu)); + if (!vmm_exclusive) { + __vcpu_clear(to_vmx(vcpu)); + kvm_cpu_vmxoff(); + } } static void vmx_fpu_activate(struct kvm_vcpu *vcpu) @@ -767,34 +973,44 @@ static void vmx_fpu_activate(struct kvm_ return; vcpu->fpu_active = 1; vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); - if (vcpu->arch.cr0 & X86_CR0_TS) + if (kvm_read_cr0_bits(vcpu, X86_CR0_TS)) vmcs_set_bits(GUEST_CR0, X86_CR0_TS); update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); + static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) { - if (!vcpu->fpu_active) - return; - vcpu->fpu_active = 0; + vmx_decache_cr0_guest_bits(vcpu); vmcs_set_bits(GUEST_CR0, X86_CR0_TS); update_exception_bitmap(vcpu); + vcpu->arch.cr0_guest_owned_bits = 0; + vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits); + vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); } static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { - unsigned long rflags; + unsigned long rflags, save_rflags; rflags = vmcs_readl(GUEST_RFLAGS); - if (to_vmx(vcpu)->rmode.vm86_active) - rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); + if (to_vmx(vcpu)->rmode.vm86_active) { + rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + save_rflags = to_vmx(vcpu)->rmode.save_rflags; + rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + } return rflags; } static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { - if (to_vmx(vcpu)->rmode.vm86_active) + if (to_vmx(vcpu)->rmode.vm86_active) { + to_vmx(vcpu)->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; + } vmcs_writel(GUEST_RFLAGS, rflags); } @@ -839,6 +1055,17 @@ static void skip_emulated_instruction(st vmx_set_interrupt_shadow(vcpu, 0); } +static void vmx_clear_hlt(struct kvm_vcpu *vcpu) +{ + /* Ensure that we clear the HLT state in the VMCS. We don't need to + * explicitly skip the instruction because if the HLT state is set, then + * the instruction is already executing and RIP has already been + * advanced. */ + if (!yield_on_hlt && + vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) + vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); +} + static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code) { @@ -872,24 +1099,20 @@ static void vmx_queue_exception(struct k intr_info |= INTR_TYPE_HARD_EXCEPTION; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); + vmx_clear_hlt(vcpu); } /* * Swap MSR entry in host/guest MSR entry array. */ -#ifdef CONFIG_X86_64 static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) { - struct kvm_msr_entry tmp; + struct shared_msr_entry tmp; tmp = vmx->guest_msrs[to]; vmx->guest_msrs[to] = vmx->guest_msrs[from]; vmx->guest_msrs[from] = tmp; - tmp = vmx->host_msrs[to]; - vmx->host_msrs[to] = vmx->host_msrs[from]; - vmx->host_msrs[from] = tmp; } -#endif /* * Set up the vmcs to automatically save and restore system @@ -898,15 +1121,13 @@ static void move_msr_up(struct vcpu_vmx */ static void setup_msrs(struct vcpu_vmx *vmx) { - int save_nmsrs; + int save_nmsrs, index; unsigned long *msr_bitmap; vmx_load_host_state(vmx); save_nmsrs = 0; #ifdef CONFIG_X86_64 if (is_long_mode(&vmx->vcpu)) { - int index; - index = __find_msr_index(vmx, MSR_SYSCALL_MASK); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); @@ -916,9 +1137,6 @@ static void setup_msrs(struct vcpu_vmx * index = __find_msr_index(vmx, MSR_CSTAR); if (index >= 0) move_msr_up(vmx, index, save_nmsrs++); - index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE); - if (index >= 0) - move_msr_up(vmx, index, save_nmsrs++); /* * MSR_K6_STAR is only needed on long mode guests, and only * if efer.sce is enabled. @@ -928,13 +1146,11 @@ static void setup_msrs(struct vcpu_vmx * move_msr_up(vmx, index, save_nmsrs++); } #endif - vmx->save_nmsrs = save_nmsrs; + vmx->msr_offset_efer = index = __find_msr_index(vmx, MSR_EFER); + if (index >= 0 && update_transition_efer(vmx)) + move_msr_up(vmx, index, save_nmsrs++); -#ifdef CONFIG_X86_64 - vmx->msr_offset_kernel_gs_base = - __find_msr_index(vmx, MSR_KERNEL_GS_BASE); -#endif - vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER); + vmx->save_nmsrs = save_nmsrs; if (cpu_has_vmx_msr_bitmap()) { if (is_long_mode(&vmx->vcpu)) @@ -960,12 +1176,37 @@ static u64 guest_read_tsc(void) } /* - * writes 'guest_tsc' into guest's timestamp counter "register" - * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc + * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ + * ioctl. In this case the call-back should update internal vmx state to make + * the changes effective. + */ +static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +{ + /* Nothing to do here */ +} + +/* + * writes 'offset' into guest's timestamp counter offset register */ -static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) +static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { - vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); + trace_kvm_write_tsc_offset(vcpu->vcpu_id, vmcs_read64(TSC_OFFSET), + offset); + vmcs_write64(TSC_OFFSET, offset); +} + +static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) +{ + u64 offset = vmcs_read64(TSC_OFFSET); + vmcs_write64(TSC_OFFSET, offset + adjustment); + + trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset, + offset + adjustment); +} + +static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +{ + return target_tsc - native_read_tsc(); } /* @@ -976,7 +1217,7 @@ static void guest_write_tsc(u64 guest_ts static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) { u64 data; - struct kvm_msr_entry *msr; + struct shared_msr_entry *msr; if (!pdata) { printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); @@ -991,9 +1232,13 @@ static int vmx_get_msr(struct kvm_vcpu * case MSR_GS_BASE: data = vmcs_readl(GUEST_GS_BASE); break; + case MSR_KERNEL_GS_BASE: + vmx_load_host_state(to_vmx(vcpu)); + data = to_vmx(vcpu)->msr_guest_kernel_gs_base; + break; +#endif case MSR_EFER: return kvm_get_msr_common(vcpu, msr_index, pdata); -#endif case MSR_IA32_TSC: data = guest_read_tsc(); break; @@ -1007,6 +1252,7 @@ static int vmx_get_msr(struct kvm_vcpu * data = vmcs_readl(GUEST_SYSENTER_ESP); break; default: + vmx_load_host_state(to_vmx(vcpu)); msr = find_msr_entry(to_vmx(vcpu), msr_index); if (msr) { vmx_load_host_state(to_vmx(vcpu)); @@ -1025,17 +1271,18 @@ static int vmx_get_msr(struct kvm_vcpu * * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_msr_entry *msr; - u64 host_tsc; + struct shared_msr_entry *msr; int ret = 0; + u32 msr_index = msr_info->index; + u64 data = msr_info->data; switch (msr_index) { case MSR_EFER: vmx_load_host_state(vmx); - ret = kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_info); break; #ifdef CONFIG_X86_64 case MSR_FS_BASE: @@ -1044,6 +1291,10 @@ static int vmx_set_msr(struct kvm_vcpu * case MSR_GS_BASE: vmcs_writel(GUEST_GS_BASE, data); break; + case MSR_KERNEL_GS_BASE: + vmx_load_host_state(vmx); + vmx->msr_guest_kernel_gs_base = data; + break; #endif case MSR_IA32_SYSENTER_CS: vmcs_write32(GUEST_SYSENTER_CS, data); @@ -1055,8 +1306,7 @@ static int vmx_set_msr(struct kvm_vcpu * vmcs_writel(GUEST_SYSENTER_ESP, data); break; case MSR_IA32_TSC: - rdtscll(host_tsc); - guest_write_tsc(data, host_tsc); + kvm_write_tsc(vcpu, msr_info); break; case MSR_IA32_CR_PAT: if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { @@ -1072,7 +1322,7 @@ static int vmx_set_msr(struct kvm_vcpu * msr->data = data; break; } - ret = kvm_set_msr_common(vcpu, msr_index, data); + ret = kvm_set_msr_common(vcpu, msr_info); } return ret; @@ -1133,32 +1383,67 @@ static __init int vmx_disabled_by_bios(v u64 msr; rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - return (msr & (FEATURE_CONTROL_LOCKED | - FEATURE_CONTROL_VMXON_ENABLED)) - == FEATURE_CONTROL_LOCKED; + if (msr & FEATURE_CONTROL_LOCKED) { + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) + && tboot_enabled()) + return 1; + if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) + && !tboot_enabled()) + return 1; + } + + return 0; /* locked but not enabled */ } -static void hardware_enable(void *garbage) +static void kvm_cpu_vmxon(u64 addr) +{ + asm volatile (ASM_VMX_VMXON_RAX + : : "a"(&addr), "m"(addr) + : "memory", "cc"); +} + +static int hardware_enable(void *garbage) { int cpu = raw_smp_processor_id(); u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); - u64 old; + u64 old, test_bits; + + if (read_cr4() & X86_CR4_VMXE) + return -EBUSY; INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); + + /* + * Now we can enable the vmclear operation in kdump + * since the vcpus_on_cpu list on this cpu + * has been initialized. + * + * Though the cpu is not in VMX operation now, there + * is no problem to enable the vmclear operation + * for the vcpus_on_cpu list is empty! + */ + crash_enable_local_vmclear(cpu); + rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - if ((old & (FEATURE_CONTROL_LOCKED | - FEATURE_CONTROL_VMXON_ENABLED)) - != (FEATURE_CONTROL_LOCKED | - FEATURE_CONTROL_VMXON_ENABLED)) + + test_bits = FEATURE_CONTROL_LOCKED; + test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + if (tboot_enabled()) + test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX; + + if ((old & test_bits) != test_bits) { /* enable and lock */ - wrmsrl(MSR_IA32_FEATURE_CONTROL, old | - FEATURE_CONTROL_LOCKED | - FEATURE_CONTROL_VMXON_ENABLED); + wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); + } write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ - asm volatile (ASM_VMX_VMXON_RAX - : : "a"(&phys_addr), "m"(phys_addr) - : "memory", "cc"); + + if (vmm_exclusive) { + kvm_cpu_vmxon(phys_addr); + ept_sync_global(); + } + + return 0; } static void vmclear_local_vcpus(void) @@ -1178,13 +1463,15 @@ static void vmclear_local_vcpus(void) static void kvm_cpu_vmxoff(void) { asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); - write_cr4(read_cr4() & ~X86_CR4_VMXE); } static void hardware_disable(void *garbage) { - vmclear_local_vcpus(); - kvm_cpu_vmxoff(); + if (vmm_exclusive) { + vmclear_local_vcpus(); + kvm_cpu_vmxoff(); + } + write_cr4(read_cr4() & ~X86_CR4_VMXE); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, @@ -1206,6 +1493,14 @@ static __init int adjust_vmx_controls(u3 return 0; } +static __init bool allow_1_setting(u32 msr, u32 ctl) +{ + u32 vmx_msr_low, vmx_msr_high; + + rdmsr(msr, vmx_msr_low, vmx_msr_high); + return vmx_msr_high & ctl; +} + static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) { u32 vmx_msr_low, vmx_msr_high; @@ -1222,7 +1517,7 @@ static __init int setup_vmcs_config(stru &_pin_based_exec_control) < 0) return -EIO; - min = CPU_BASED_HLT_EXITING | + min = #ifdef CONFIG_X86_64 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | @@ -1232,7 +1527,12 @@ static __init int setup_vmcs_config(stru CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | - CPU_BASED_INVLPG_EXITING; + CPU_BASED_INVLPG_EXITING | + CPU_BASED_RDPMC_EXITING; + + if (yield_on_hlt) + min |= CPU_BASED_HLT_EXITING; + opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -1250,7 +1550,9 @@ static __init int setup_vmcs_config(stru SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_EPT | - SECONDARY_EXEC_UNRESTRICTED_GUEST; + SECONDARY_EXEC_UNRESTRICTED_GUEST | + SECONDARY_EXEC_PAUSE_LOOP_EXITING | + SECONDARY_EXEC_ENABLE_INVPCID; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -1312,6 +1614,42 @@ static __init int setup_vmcs_config(stru vmcs_conf->vmexit_ctrl = _vmexit_control; vmcs_conf->vmentry_ctrl = _vmentry_control; + cpu_has_load_perf_global_ctrl = + allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS, + VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) + && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS, + VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL); + + /* + * Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL + * but due to arrata below it can't be used. Workaround is to use + * msr load mechanism to switch IA32_PERF_GLOBAL_CTRL. + * + * VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32] + * + * AAK155 (model 26) + * AAP115 (model 30) + * AAT100 (model 37) + * BC86,AAY89,BD102 (model 44) + * BA97 (model 46) + * + */ + if (cpu_has_load_perf_global_ctrl && boot_cpu_data.x86 == 0x6) { + switch (boot_cpu_data.x86_model) { + case 26: + case 30: + case 37: + case 44: + case 46: + cpu_has_load_perf_global_ctrl = false; + printk_once(KERN_WARNING"kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL " + "does not work properly. Using workaround\n"); + break; + default: + break; + } + } + return 0; } @@ -1344,15 +1682,17 @@ static void free_kvm_area(void) { int cpu; - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) { free_vmcs(per_cpu(vmxarea, cpu)); + per_cpu(vmxarea, cpu) = NULL; + } } static __init int alloc_kvm_area(void) { int cpu; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct vmcs *vmcs; vmcs = alloc_vmcs_cpu(cpu); @@ -1380,8 +1720,12 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_ept()) { enable_ept = 0; enable_unrestricted_guest = 0; + enable_ept_ad_bits = 0; } + if (!cpu_has_vmx_ept_ad_bits()) + enable_ept_ad_bits = 0; + if (!cpu_has_vmx_unrestricted_guest()) enable_unrestricted_guest = 0; @@ -1394,6 +1738,9 @@ static __init int hardware_setup(void) if (enable_ept && !cpu_has_vmx_ept_2m_page()) kvm_disable_largepages(); + if (!cpu_has_vmx_ple()) + ple_gap = 0; + return alloc_kvm_area(); } @@ -1426,13 +1773,14 @@ static void enter_pmode(struct kvm_vcpu vmx->emulation_required = 1; vmx->rmode.vm86_active = 0; + vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); flags = vmcs_readl(GUEST_RFLAGS); - flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); - flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); + flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; vmcs_writel(GUEST_RFLAGS, flags); vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | @@ -1459,8 +1807,12 @@ static void enter_pmode(struct kvm_vcpu static gva_t rmode_tss_base(struct kvm *kvm) { if (!kvm->arch.tss_addr) { - gfn_t base_gfn = kvm->memslots[0].base_gfn + - kvm->memslots[0].npages - 3; + struct kvm_memslots *slots; + gfn_t base_gfn; + + slots = rcu_dereference(kvm->memslots); + base_gfn = kvm->memslots->memslots[0].base_gfn + + kvm->memslots->memslots[0].npages - 3; return base_gfn << PAGE_SHIFT; } return kvm->arch.tss_addr; @@ -1475,15 +1827,20 @@ static void fix_rmode_seg(int seg, struc save->limit = vmcs_read32(sf->limit); save->ar = vmcs_read32(sf->ar_bytes); vmcs_write16(sf->selector, save->base >> 4); - vmcs_write32(sf->base, save->base & 0xfffff); + vmcs_write32(sf->base, save->base & 0xffff0); vmcs_write32(sf->limit, 0xffff); vmcs_write32(sf->ar_bytes, 0xf3); + if (save->base & 0xf) + printk_once(KERN_WARNING "kvm: segment base is not paragraph" + " aligned when entering protected mode (seg=%d)", + seg); } static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_segment var; if (enable_unrestricted_guest) return; @@ -1491,6 +1848,19 @@ static void enter_rmode(struct kvm_vcpu vmx->emulation_required = 1; vmx->rmode.vm86_active = 1; + /* + * Very old userspace does not call KVM_SET_TSS_ADDR before entering + * vcpu. Call it here with phys address pointing 16M below 4G. + */ + if (!vcpu->kvm->arch.tss_addr) { + printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be " + "called before entering vcpu\n"); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + vmx_set_tss_addr(vcpu->kvm, 0xfeffd000); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + } + + vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR); vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); @@ -1501,8 +1871,7 @@ static void enter_rmode(struct kvm_vcpu vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); flags = vmcs_readl(GUEST_RFLAGS); - vmx->rmode.save_iopl - = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; + vmx->rmode.save_rflags = flags; flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; @@ -1513,31 +1882,41 @@ static void enter_rmode(struct kvm_vcpu if (emulate_invalid_guest_state) goto continue_rmode; - vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); - vmcs_write32(GUEST_SS_LIMIT, 0xffff); - vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); - - vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) - vmcs_writel(GUEST_CS_BASE, 0xf0000); - vmcs_write16(GUEST_CS_SELECTOR, vmcs_readl(GUEST_CS_BASE) >> 4); - - fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); - fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); - fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); - fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); + vmx_get_segment(vcpu, &var, VCPU_SREG_SS); + vmx_set_segment(vcpu, &var, VCPU_SREG_SS); + + vmx_get_segment(vcpu, &var, VCPU_SREG_CS); + vmx_set_segment(vcpu, &var, VCPU_SREG_CS); + + vmx_get_segment(vcpu, &var, VCPU_SREG_ES); + vmx_set_segment(vcpu, &var, VCPU_SREG_ES); + + vmx_get_segment(vcpu, &var, VCPU_SREG_DS); + vmx_set_segment(vcpu, &var, VCPU_SREG_DS); + + vmx_get_segment(vcpu, &var, VCPU_SREG_GS); + vmx_set_segment(vcpu, &var, VCPU_SREG_GS); + + vmx_get_segment(vcpu, &var, VCPU_SREG_FS); + vmx_set_segment(vcpu, &var, VCPU_SREG_FS); continue_rmode: kvm_mmu_reset_context(vcpu); - init_rmode(vcpu->kvm); } static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); + if (!msr) + return; + + /* + * Force kernel_gs_base reloading before EFER changes, as control + * of this msr depends on is_long_mode(). + */ + vmx_load_host_state(to_vmx(vcpu)); vcpu->arch.shadow_efer = efer; if (!msr) return; @@ -1592,10 +1971,20 @@ static void vmx_flush_tlb(struct kvm_vcp ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); } +static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) +{ + ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; + + vcpu->arch.cr0 &= ~cr0_guest_owned_bits; + vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; +} + static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) { - vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; + ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + + vcpu->arch.cr4 &= ~cr4_guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; } static void ept_load_pdptrs(struct kvm_vcpu *vcpu) @@ -1640,7 +2029,7 @@ static void ept_update_paging_mode_cr0(u (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, @@ -1648,7 +2037,7 @@ static void ept_update_paging_mode_cr0(u ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, vcpu->arch.cr4); + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); } if (!(cr0 & X86_CR0_WP)) @@ -1676,8 +2065,6 @@ static void vmx_set_cr0(struct kvm_vcpu else hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; - vmx_fpu_deactivate(vcpu); - if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -1696,12 +2083,12 @@ static void vmx_set_cr0(struct kvm_vcpu if (enable_ept) ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); + if (!vcpu->fpu_active) + hw_cr0 |= X86_CR0_TS; + vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; - - if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE)) - vmx_fpu_activate(vcpu); } static u64 construct_eptp(unsigned long root_hpa) @@ -1711,6 +2098,8 @@ static u64 construct_eptp(unsigned long /* TODO write the value reading from MSR */ eptp = VMX_EPT_DEFAULT_MT | VMX_EPT_DEFAULT_GAW << VMX_EPT_GAW_EPTP_SHIFT; + if (enable_ept_ad_bits) + eptp |= VMX_EPT_AD_ENABLE_BIT; eptp |= (root_hpa & PAGE_MASK); return eptp; @@ -1731,8 +2120,6 @@ static void vmx_set_cr3(struct kvm_vcpu vmx_flush_tlb(vcpu); vmcs_writel(GUEST_CR3, guest_cr3); - if (vcpu->arch.cr0 & X86_CR0_PE) - vmx_fpu_deactivate(vcpu); } static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -1780,7 +2167,7 @@ static void vmx_get_segment(struct kvm_v static int vmx_get_cpl(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ + if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) /* if real mode */ return 0; if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ @@ -1819,6 +2206,7 @@ static void vmx_set_segment(struct kvm_v u32 ar; if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { + vmcs_write16(sf->selector, var->selector); vmx->rmode.tr.selector = var->selector; vmx->rmode.tr.base = var->base; vmx->rmode.tr.limit = var->limit; @@ -1853,6 +2241,44 @@ static void vmx_set_segment(struct kvm_v ar |= 0x1; /* Accessed */ vmcs_write32(sf->ar_bytes, ar); + + /* + * Fix segments for real mode guest in hosts that don't have + * "unrestricted_mode" or it was disabled. + * This is done to allow migration of the guests from hosts with + * unrestricted guest like Westmere to older host that don't have + * unrestricted guest like Nehelem. + */ + if (!enable_unrestricted_guest && vmx->rmode.vm86_active) { + switch (seg) { + case VCPU_SREG_CS: + vmcs_write32(GUEST_CS_AR_BYTES, 0xf3); + vmcs_write32(GUEST_CS_LIMIT, 0xffff); + if (vmcs_readl(GUEST_CS_BASE) == 0xffff0000) + vmcs_writel(GUEST_CS_BASE, 0xf0000); + vmcs_write16(GUEST_CS_SELECTOR, + vmcs_readl(GUEST_CS_BASE) >> 4); + break; + case VCPU_SREG_ES: + fix_rmode_seg(VCPU_SREG_ES, &vmx->rmode.es); + break; + case VCPU_SREG_DS: + fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.ds); + break; + case VCPU_SREG_GS: + fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.gs); + break; + case VCPU_SREG_FS: + fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.fs); + break; + case VCPU_SREG_SS: + vmcs_write16(GUEST_SS_SELECTOR, + vmcs_readl(GUEST_SS_BASE) >> 4); + vmcs_write32(GUEST_SS_LIMIT, 0xffff); + vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); + break; + } + } } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -2035,7 +2461,7 @@ static bool cs_ss_rpl_check(struct kvm_v static bool guest_state_valid(struct kvm_vcpu *vcpu) { /* real mode guest state checks */ - if (!(vcpu->arch.cr0 & X86_CR0_PE)) { + if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) return false; if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) @@ -2079,11 +2505,12 @@ static bool guest_state_valid(struct kvm static int init_rmode_tss(struct kvm *kvm) { - gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; + gfn_t fn; u16 data = 0; - int ret = 0; - int r; + int r, idx, ret = 0; + idx = srcu_read_lock(&kvm->srcu); + fn = rmode_tss_base(kvm) >> PAGE_SHIFT; r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -2107,12 +2534,13 @@ static int init_rmode_tss(struct kvm *kv ret = 1; out: + srcu_read_unlock(&kvm->srcu, idx); return ret; } static int init_rmode_identity_map(struct kvm *kvm) { - int i, r, ret; + int i, idx, r, ret; pfn_t identity_map_pfn; u32 tmp; @@ -2127,6 +2555,7 @@ static int init_rmode_identity_map(struc return 1; ret = 0; identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; + idx = srcu_read_lock(&kvm->srcu); r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); if (r < 0) goto out; @@ -2142,6 +2571,7 @@ static int init_rmode_identity_map(struc kvm->arch.ept_identity_pagetable_done = true; ret = 1; out: + srcu_read_unlock(&kvm->srcu, idx); return ret; } @@ -2168,7 +2598,7 @@ static int alloc_apic_access_page(struct struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (kvm->arch.apic_access_page) goto out; kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; @@ -2181,7 +2611,7 @@ static int alloc_apic_access_page(struct kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2190,7 +2620,7 @@ static int alloc_identity_pagetable(stru struct kvm_userspace_memory_region kvm_userspace_mem; int r = 0; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); if (kvm->arch.ept_identity_pagetable) goto out; kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; @@ -2205,7 +2635,7 @@ static int alloc_identity_pagetable(stru kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2225,6 +2655,16 @@ static void allocate_vpid(struct vcpu_vm spin_unlock(&vmx_vpid_lock); } +static void free_vpid(struct vcpu_vmx *vmx) +{ + if (!enable_vpid) + return; + spin_lock(&vmx_vpid_lock); + if (vmx->vpid != 0) + __clear_bit(vmx->vpid, vmx_vpid_bitmap); + spin_unlock(&vmx_vpid_lock); +} + static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr) { int f = sizeof(unsigned long); @@ -2261,12 +2701,13 @@ static int vmx_vcpu_setup(struct vcpu_vm { u32 host_sysenter_cs, msr_low, msr_high; u32 junk; - u64 host_pat, tsc_this, tsc_base; + u64 host_pat; unsigned long a; struct descriptor_table dt; int i; unsigned long kvm_vmx_return; u32 exec_control; + unsigned long cr4; /* I/O */ vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); @@ -2302,26 +2743,42 @@ static int vmx_vcpu_setup(struct vcpu_vm ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; - if (!enable_ept) + if (!enable_ept) { exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; + enable_unrestricted_guest = 0; + /* Enable INVPCID for non-ept guests may cause performance regression. */ + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + } if (!enable_unrestricted_guest) exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; + if (!ple_gap) + exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } + if (ple_gap) { + vmcs_write32(PLE_GAP, ple_gap); + vmx->ple_window = ple_window; + vmx->ple_window_dirty = true; + } + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ vmcs_writel(HOST_CR0, read_cr0()); /* 22.2.3 */ - vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ + + /* Save the most likely value for this task's CR4 in the VMCS. */ + cr4 = read_cr4(); + vmcs_writel(HOST_CR4, read_cr4()); /* 22.2.3, 22.2.5 */ + vmx->host_state.vmcs_host_cr4 = cr4; vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, 0); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, 0); /* 22.2.4 */ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); @@ -2342,7 +2799,9 @@ static int vmx_vcpu_setup(struct vcpu_vm vmcs_writel(HOST_RIP, kvm_vmx_return); /* 22.2.5 */ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, 0); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, 0); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); rdmsr(MSR_IA32_SYSENTER_CS, host_sysenter_cs, junk); vmcs_write32(HOST_IA32_SYSENTER_CS, host_sysenter_cs); @@ -2376,10 +2835,9 @@ static int vmx_vcpu_setup(struct vcpu_vm if (wrmsr_safe(index, data_low, data_high) < 0) continue; data = data_low | ((u64)data_high << 32); - vmx->host_msrs[j].index = index; - vmx->host_msrs[j].reserved = 0; - vmx->host_msrs[j].data = data; - vmx->guest_msrs[j] = vmx->host_msrs[j]; + vmx->guest_msrs[j].index = i; + vmx->guest_msrs[j].data = 0; + vmx->guest_msrs[j].mask = -1ull; ++vmx->nmsrs; } @@ -2390,26 +2848,11 @@ static int vmx_vcpu_setup(struct vcpu_vm vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); - - tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; - rdtscll(tsc_this); - if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc) - tsc_base = tsc_this; - - guest_write_tsc(0, tsc_base); + vmx->vcpu.arch.cr4_guest_owned_bits = ~KVM_GUEST_CR4_MASK; return 0; } -static int init_rmode(struct kvm *kvm) -{ - if (!init_rmode_tss(kvm)) - return 0; - if (!init_rmode_identity_map(kvm)) - return 0; - return 1; -} - static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2417,11 +2860,6 @@ static int vmx_vcpu_reset(struct kvm_vcp int ret; vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); - down_read(&vcpu->kvm->slots_lock); - if (!init_rmode(vmx->vcpu.kvm)) { - ret = -ENOMEM; - goto out; - } vmx->rmode.vm86_active = 0; @@ -2484,7 +2922,7 @@ static int vmx_vcpu_reset(struct kvm_vcp vmcs_writel(GUEST_IDTR_BASE, 0); vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); - vmcs_write32(GUEST_ACTIVITY_STATE, 0); + vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); @@ -2510,8 +2948,10 @@ static int vmx_vcpu_reset(struct kvm_vcp if (vmx->vpid != 0) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx->vcpu.arch.cr0 = 0x60000010; - vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ + vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */ + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); vmx_set_cr4(&vmx->vcpu, 0); vmx_set_efer(&vmx->vcpu, 0); vmx_fpu_activate(&vmx->vcpu); @@ -2524,8 +2964,6 @@ static int vmx_vcpu_reset(struct kvm_vcp /* HACK: Don't enable emulation on guest boot/reset */ vmx->emulation_required = 0; -out: - up_read(&vcpu->kvm->slots_lock); return ret; } @@ -2547,6 +2985,10 @@ static void enable_nmi_window(struct kvm return; } + if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + enable_irq_window(vcpu); + return; + } cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); @@ -2582,6 +3024,7 @@ static void vmx_inject_irq(struct kvm_vc } else intr |= INTR_TYPE_EXT_INTR; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); + vmx_clear_hlt(vcpu); } static void vmx_inject_nmi(struct kvm_vcpu *vcpu) @@ -2615,6 +3058,7 @@ static void vmx_inject_nmi(struct kvm_vc } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); + vmx_clear_hlt(vcpu); } static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) @@ -2623,8 +3067,36 @@ static int vmx_nmi_allowed(struct kvm_vc return 0; return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS | - GUEST_INTR_STATE_NMI)); + (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI + | GUEST_INTR_STATE_NMI)); +} + +static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) +{ + if (!cpu_has_virtual_nmis()) + return to_vmx(vcpu)->soft_vnmi_blocked; + else + return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + GUEST_INTR_STATE_NMI); +} + +static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!cpu_has_virtual_nmis()) { + if (vmx->soft_vnmi_blocked != masked) { + vmx->soft_vnmi_blocked = masked; + vmx->vnmi_blocked_time = 0; + } + } else { + if (masked) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } } static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) @@ -2648,6 +3120,9 @@ static int vmx_set_tss_addr(struct kvm * if (ret) return ret; kvm->arch.tss_addr = addr; + if (!init_rmode_tss(kvm)) + return -ENOMEM; + return 0; } @@ -2659,7 +3134,7 @@ static int handle_rmode_exception(struct * Cause the #SS fault with 0 error code in VM86 mode. */ if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) - if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) + if (emulate_instruction(vcpu, 0) == EMULATE_DONE) return 1; /* * Forward all other exceptions that are valid in real mode. @@ -2674,6 +3149,12 @@ static int handle_rmode_exception(struct kvm_queue_exception(vcpu, vec); return 1; case BP_VECTOR: + /* + * Update instruction length as we may reinject the exception + * from user space while in guest debugging mode. + */ + to_vmx(vcpu)->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) return 0; /* fall through */ @@ -2710,15 +3191,16 @@ static void kvm_machine_check(void) #endif } -static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_machine_check(struct kvm_vcpu *vcpu) { /* already handled by vcpu_run */ return 1; } -static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_exception(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_run *kvm_run = vcpu->run; u32 intr_info, ex_no, error_code; unsigned long cr2, rip, dr6; u32 vect_info; @@ -2728,12 +3210,17 @@ static int handle_exception(struct kvm_v intr_info = vmcs_read32(VM_EXIT_INTR_INFO); if (is_machine_check(intr_info)) - return handle_machine_check(vcpu, kvm_run); + return handle_machine_check(vcpu); if ((vect_info & VECTORING_INFO_VALID_MASK) && - !is_page_fault(intr_info)) - printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " - "intr info 0x%x\n", __func__, vect_info, intr_info); + !is_page_fault(intr_info)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = vect_info; + vcpu->run->internal.data[1] = intr_info; + return 0; + } if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) return 1; /* already handled by vmx_vcpu_run() */ @@ -2744,7 +3231,7 @@ static int handle_exception(struct kvm_v } if (is_invalid_opcode(intr_info)) { - er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); + er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); if (er != EMULATE_DONE) kvm_queue_exception(vcpu, UD_VECTOR); return 1; @@ -2763,7 +3250,7 @@ static int handle_exception(struct kvm_v if (kvm_event_needs_reinjection(vcpu)) kvm_mmu_unprotect_page_virt(vcpu, cr2); - return kvm_mmu_page_fault(vcpu, cr2, error_code); + return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0); } if (vmx->rmode.vm86_active && @@ -2790,6 +3277,13 @@ static int handle_exception(struct kvm_v kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); /* fall through */ case BP_VECTOR: + /* + * Update instruction length as we may reinject #BP from + * user space while in guest debugging mode. Reading it for + * #DB as well causes no harm, it is not used in that case. + */ + vmx->vcpu.arch.event_exit_inst_len = + vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; @@ -2803,20 +3297,19 @@ static int handle_exception(struct kvm_v return 0; } -static int handle_external_interrupt(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int handle_external_interrupt(struct kvm_vcpu *vcpu) { ++vcpu->stat.irq_exits; return 1; } -static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_triple_fault(struct kvm_vcpu *vcpu) { - kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; } -static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_io(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; int size, in, string; @@ -2827,8 +3320,7 @@ static int handle_io(struct kvm_vcpu *vc string = (exit_qualification & 16) != 0; if (string) { - if (emulate_instruction(vcpu, - kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) + if (emulate_instruction(vcpu, 0) == EMULATE_DO_MMIO) return 0; return 1; } @@ -2838,7 +3330,7 @@ static int handle_io(struct kvm_vcpu *vc port = exit_qualification >> 16; skip_emulated_instruction(vcpu); - return kvm_emulate_pio(vcpu, kvm_run, in, size, port); + return kvm_emulate_pio(vcpu, in, size, port); } static void @@ -2852,11 +3344,20 @@ vmx_patch_hypercall(struct kvm_vcpu *vcp hypercall[2] = 0xc1; } -static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void complete_insn_gp(struct kvm_vcpu *vcpu, int err) +{ + if (err) + kvm_inject_gp(vcpu, 0); + else + skip_emulated_instruction(vcpu); +} + +static int handle_cr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification, val; int cr; int reg; + int err; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); cr = exit_qualification & 15; @@ -2867,36 +3368,36 @@ static int handle_cr(struct kvm_vcpu *vc trace_kvm_cr_write(cr, val); switch (cr) { case 0: - kvm_set_cr0(vcpu, val); - skip_emulated_instruction(vcpu); + err = kvm_set_cr0(vcpu, val); + complete_insn_gp(vcpu, err); return 1; case 3: - kvm_set_cr3(vcpu, val); - skip_emulated_instruction(vcpu); + err = kvm_set_cr3(vcpu, val); + complete_insn_gp(vcpu, err); return 1; case 4: - kvm_set_cr4(vcpu, val); - skip_emulated_instruction(vcpu); + err = kvm_set_cr4(vcpu, val); + complete_insn_gp(vcpu, err); return 1; case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); u8 cr8 = kvm_register_read(vcpu, reg); - kvm_set_cr8(vcpu, cr8); - skip_emulated_instruction(vcpu); + if (kvm_set_cr8(vcpu, cr8)) + kvm_inject_gp(vcpu, 0); + else + skip_emulated_instruction(vcpu); if (irqchip_in_kernel(vcpu->kvm)) return 1; if (cr8_prev <= cr8) return 1; - kvm_run->exit_reason = KVM_EXIT_SET_TPR; + vcpu->run->exit_reason = KVM_EXIT_SET_TPR; return 0; } }; break; case 2: /* clts */ - vmx_fpu_deactivate(vcpu); - vcpu->arch.cr0 &= ~X86_CR0_TS; - vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); - vmx_fpu_activate(vcpu); + vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); + trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); skip_emulated_instruction(vcpu); return 1; case 1: /*mov from cr*/ @@ -2915,20 +3416,22 @@ static int handle_cr(struct kvm_vcpu *vc } break; case 3: /* lmsw */ - kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); + val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f; + trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); + kvm_lmsw(vcpu, val); skip_emulated_instruction(vcpu); return 1; default: break; } - kvm_run->exit_reason = 0; + vcpu->run->exit_reason = 0; pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", (int)(exit_qualification >> 4) & 3, cr); return 0; } -static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_dr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; unsigned long val; @@ -2944,13 +3447,13 @@ static int handle_dr(struct kvm_vcpu *vc * guest debugging itself. */ if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { - kvm_run->debug.arch.dr6 = vcpu->arch.dr6; - kvm_run->debug.arch.dr7 = dr; - kvm_run->debug.arch.pc = + vcpu->run->debug.arch.dr6 = vcpu->arch.dr6; + vcpu->run->debug.arch.dr7 = dr; + vcpu->run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + vmcs_readl(GUEST_RIP); - kvm_run->debug.arch.exception = DB_VECTOR; - kvm_run->exit_reason = KVM_EXIT_DEBUG; + vcpu->run->debug.arch.exception = DB_VECTOR; + vcpu->run->exit_reason = KVM_EXIT_DEBUG; return 0; } else { vcpu->arch.dr7 &= ~DR7_GD; @@ -2988,7 +3491,7 @@ static int handle_dr(struct kvm_vcpu *vc vcpu->arch.eff_db[dr] = val; break; case 4 ... 5: - if (vcpu->arch.cr4 & X86_CR4_DE) + if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) kvm_queue_exception(vcpu, UD_VECTOR); break; case 6: @@ -3016,13 +3519,13 @@ static int handle_dr(struct kvm_vcpu *vc return 1; } -static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_cpuid(struct kvm_vcpu *vcpu) { kvm_emulate_cpuid(vcpu); return 1; } -static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_rdmsr(struct kvm_vcpu *vcpu) { u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; u64 data; @@ -3041,15 +3544,19 @@ static int handle_rdmsr(struct kvm_vcpu return 1; } -static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_wrmsr(struct kvm_vcpu *vcpu) { + struct msr_data msr; u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); trace_kvm_msr_write(ecx, data); - if (vmx_set_msr(vcpu, ecx, data) != 0) { + msr.data = data; + msr.index = ecx; + msr.host_initiated = false; + if (vmx_set_msr(vcpu, &msr) != 0) { kvm_inject_gp(vcpu, 0); return 1; } @@ -3058,14 +3565,12 @@ static int handle_wrmsr(struct kvm_vcpu return 1; } -static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) { return 1; } -static int handle_interrupt_window(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int handle_interrupt_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; @@ -3081,34 +3586,34 @@ static int handle_interrupt_window(struc * possible */ if (!irqchip_in_kernel(vcpu->kvm) && - kvm_run->request_interrupt_window && + vcpu->run->request_interrupt_window && !kvm_cpu_has_interrupt(vcpu)) { - kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; + vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; return 0; } return 1; } -static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_halt(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); return kvm_emulate_halt(vcpu); } -static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_vmcall(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); kvm_emulate_hypercall(vcpu); return 1; } -static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_vmx_insn(struct kvm_vcpu *vcpu) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } -static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_invlpg(struct kvm_vcpu *vcpu) { unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -3117,14 +3622,34 @@ static int handle_invlpg(struct kvm_vcpu return 1; } -static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_rdpmc(struct kvm_vcpu *vcpu) +{ + int err; + + err = kvm_rdpmc(vcpu); + complete_insn_gp(vcpu, err); + + return 1; +} + +static int handle_wbinvd(struct kvm_vcpu *vcpu) { skip_emulated_instruction(vcpu); /* TODO: Add support for VT-d/pass-through device */ return 1; } -static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_xsetbv(struct kvm_vcpu *vcpu) +{ + u64 new_bv = kvm_read_edx_eax(vcpu); + u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); + + if (kvm_set_xcr(vcpu, index, new_bv) == 0) + skip_emulated_instruction(vcpu); + return 1; +} + +static int handle_apic_access(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; enum emulation_result er; @@ -3133,7 +3658,7 @@ static int handle_apic_access(struct kvm exit_qualification = vmcs_readl(EXIT_QUALIFICATION); offset = exit_qualification & 0xffful; - er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + er = emulate_instruction(vcpu, 0); if (er != EMULATE_DONE) { printk(KERN_ERR @@ -3144,14 +3669,17 @@ static int handle_apic_access(struct kvm return 1; } -static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_task_switch(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification; + bool has_error_code = false; + u32 error_code = 0; u16 tss_selector; - int reason, type, idt_v; + int reason, type, idt_v, idt_index; idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -3170,6 +3698,13 @@ static int handle_task_switch(struct kvm kvm_clear_interrupt_queue(vcpu); break; case INTR_TYPE_HARD_EXCEPTION: + if (vmx->idt_vectoring_info & + VECTORING_INFO_DELIVER_CODE_MASK) { + has_error_code = true; + error_code = + vmcs_read32(IDT_VECTORING_ERROR_CODE); + } + /* fall through */ case INTR_TYPE_SOFT_EXCEPTION: kvm_clear_exception_queue(vcpu); break; @@ -3184,7 +3719,9 @@ static int handle_task_switch(struct kvm type != INTR_TYPE_NMI_INTR)) skip_emulated_instruction(vcpu); - if (!kvm_task_switch(vcpu, tss_selector, reason)) + if (!kvm_task_switch(vcpu, tss_selector, + type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, + has_error_code, error_code)) return 0; /* clear all local breakpoint enable flags */ @@ -3198,7 +3735,7 @@ static int handle_task_switch(struct kvm return 1; } -static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_ept_violation(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; gpa_t gpa; @@ -3219,14 +3756,25 @@ static int handle_ept_violation(struct k vmcs_readl(GUEST_LINEAR_ADDRESS)); printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", (long unsigned int)exit_qualification); - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; return 0; } + /* + * EPT violation happened while executing iret from NMI, + * "blocked by NMI" bit has to be set before next VM entry. + * There are errata that may cause this bit to not be set: + * AAK134, BY25. + */ + if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && + cpu_has_virtual_nmis() && + (exit_qualification & INTR_INFO_UNBLOCK_NMI)) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); + gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); trace_kvm_page_fault(gpa, exit_qualification); - return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); + return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0, NULL, 0); } static u64 ept_rsvd_mask(u64 spte, int level) @@ -3290,7 +3838,7 @@ static void ept_misconfig_inspect_spte(s } } -static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_ept_misconfig(struct kvm_vcpu *vcpu) { u64 sptes[4]; int nr_sptes, i; @@ -3306,13 +3854,13 @@ static int handle_ept_misconfig(struct k for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; return 0; } -static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int handle_nmi_window(struct kvm_vcpu *vcpu) { u32 cpu_based_vm_exec_control; @@ -3325,8 +3873,7 @@ static int handle_nmi_window(struct kvm_ return 1; } -static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static void handle_invalid_guest_state(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); enum emulation_result err = EMULATE_DONE; @@ -3335,7 +3882,7 @@ static void handle_invalid_guest_state(s preempt_enable(); while (!guest_state_valid(vcpu)) { - err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + err = emulate_instruction(vcpu, 0); if (err == EMULATE_DO_MMIO) break; @@ -3357,13 +3904,97 @@ static void handle_invalid_guest_state(s vmx->invalid_state_emulation_result = err; } +static int __grow_ple_window(int val) +{ + if (ple_window_grow < 1) + return ple_window; + + val = min(val, ple_window_actual_max); + + if (ple_window_grow < ple_window) + val *= ple_window_grow; + else + val += ple_window_grow; + + return val; +} + +static int __shrink_ple_window(int val, int modifier, int minimum) +{ + if (modifier < 1) + return ple_window; + + if (modifier < ple_window) + val /= modifier; + else + val -= modifier; + + return max(val, minimum); +} + +static void grow_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int old = vmx->ple_window; + + vmx->ple_window = __grow_ple_window(old); + + if (vmx->ple_window != old) + vmx->ple_window_dirty = true; + + trace_kvm_ple_window_grow(vcpu->vcpu_id, vmx->ple_window, old); +} + +static void shrink_ple_window(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int old = vmx->ple_window; + + vmx->ple_window = __shrink_ple_window(old, + ple_window_shrink, ple_window); + + if (vmx->ple_window != old) + vmx->ple_window_dirty = true; + + trace_kvm_ple_window_shrink(vcpu->vcpu_id, vmx->ple_window, old); +} + +/* + * ple_window_actual_max is computed to be one grow_ple_window() below + * ple_window_max. (See __grow_ple_window for the reason.) + * This prevents overflows, because ple_window_max is int. + * ple_window_max effectively rounded down to a multiple of ple_window_grow in + * this process. + * ple_window_max is also prevented from setting vmx->ple_window < ple_window. + */ +static void update_ple_window_actual_max(void) +{ + ple_window_actual_max = + __shrink_ple_window(max(ple_window_max, ple_window), + ple_window_grow, INT_MIN); +} + +/* + * Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE + * exiting, so only get here on cpu with PAUSE-Loop-Exiting. + */ +static int handle_pause(struct kvm_vcpu *vcpu) +{ + if (ple_gap) + grow_ple_window(vcpu); + + skip_emulated_instruction(vcpu); + kvm_vcpu_on_spin(vcpu); + + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs * to be done to userspace and return 0. */ -static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) = { +static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_EXCEPTION_NMI] = handle_exception, [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, @@ -3377,6 +4008,7 @@ static int (*kvm_vmx_exit_handlers[])(st [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, [EXIT_REASON_HLT] = handle_halt, [EXIT_REASON_INVLPG] = handle_invlpg, + [EXIT_REASON_RDPMC] = handle_rdpmc, [EXIT_REASON_VMCALL] = handle_vmcall, [EXIT_REASON_VMCLEAR] = handle_vmx_insn, [EXIT_REASON_VMLAUNCH] = handle_vmx_insn, @@ -3389,11 +4021,15 @@ static int (*kvm_vmx_exit_handlers[])(st [EXIT_REASON_VMON] = handle_vmx_insn, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, + [EXIT_REASON_INVVPID] = handle_vmx_insn, + [EXIT_REASON_INVEPT] = handle_vmx_insn, [EXIT_REASON_WBINVD] = handle_wbinvd, + [EXIT_REASON_XSETBV] = handle_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, + [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, }; static const int kvm_vmx_max_exit_handlers = @@ -3403,13 +4039,13 @@ static const int kvm_vmx_max_exit_handle * The guest has exited. See if we can fix it or if we need userspace * assistance. */ -static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +static int vmx_handle_exit(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; - trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); + trace_kvm_exit(exit_reason, kvm_rip_read(vcpu), KVM_ISA_VMX); /* If we need to emulate an MMIO from handle_invalid_guest_state * we just return 0 */ @@ -3424,9 +4060,16 @@ static int vmx_handle_exit(struct kvm_ru if (enable_ept && is_paging(vcpu)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason + = exit_reason; + return 0; + } + if (unlikely(vmx->fail)) { - kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; - kvm_run->fail_entry.hardware_entry_failure_reason + vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; + vcpu->run->fail_entry.hardware_entry_failure_reason = vmcs_read32(VM_INSTRUCTION_ERROR); return 0; } @@ -3459,10 +4102,10 @@ static int vmx_handle_exit(struct kvm_ru if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) - return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); + return kvm_vmx_exit_handlers[exit_reason](vcpu); else { - kvm_run->exit_reason = KVM_EXIT_UNKNOWN; - kvm_run->hw.hardware_exit_reason = exit_reason; + vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; + vcpu->run->hw.hardware_exit_reason = exit_reason; } return 0; } @@ -3498,8 +4141,11 @@ static void vmx_complete_interrupts(stru /* We need to handle NMIs before interrupts are enabled */ if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR && - (exit_intr_info & INTR_INFO_VALID_MASK)) + (exit_intr_info & INTR_INFO_VALID_MASK)) { + kvm_before_handle_nmi(&vmx->vcpu); asm("int $2"); + kvm_after_handle_nmi(&vmx->vcpu); + } idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; @@ -3592,6 +4238,24 @@ static void fixup_rmode_irq(struct vcpu_ | vmx->rmode.irq.vector; } +static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) +{ + int i, nr_msrs; + struct perf_guest_switch_msr *msrs; + + msrs = perf_guest_get_msrs(&nr_msrs); + + if (!msrs) + return; + + for (i = 0; i < nr_msrs; i++) + if (msrs[i].host == msrs[i].guest) + clear_atomic_switch_msr(vmx, msrs[i].msr); + else + add_atomic_switch_msr(vmx, msrs[i].msr, msrs[i].guest, + msrs[i].host); +} + #ifdef CONFIG_X86_64 #define R "r" #define Q "q" @@ -3600,9 +4264,10 @@ static void fixup_rmode_irq(struct vcpu_ #define Q "l" #endif -static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + unsigned long cr4; if (enable_ept && is_paging(vcpu)) { vmcs_writel(GUEST_CR3, vcpu->arch.cr3); @@ -3614,15 +4279,25 @@ static void vmx_vcpu_run(struct kvm_vcpu /* Handle invalid guest state instead of entering VMX */ if (vmx->emulation_required && emulate_invalid_guest_state) { - handle_invalid_guest_state(vcpu, kvm_run); + handle_invalid_guest_state(vcpu); return; } + if (vmx->ple_window_dirty) { + vmx->ple_window_dirty = false; + vmcs_write32(PLE_WINDOW, vmx->ple_window); + } + if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + cr4 = read_cr4(); + if(unlikely(cr4 != vmx->host_state.vmcs_host_cr4)) { + vmcs_writel(HOST_CR4, cr4); + vmx->host_state.vmcs_host_cr4 = cr4; + } /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug @@ -3639,6 +4314,8 @@ static void vmx_vcpu_run(struct kvm_vcpu if (vcpu->arch.switch_db_regs) set_debugreg(vcpu->arch.dr6, 6); + atomic_switch_perf_msrs(vmx); + asm( /* Store host registers */ "push %%"R"dx; push %%"R"bp;" @@ -3770,17 +4447,26 @@ static void vmx_free_vcpu(struct kvm_vcp { struct vcpu_vmx *vmx = to_vmx(vcpu); - spin_lock(&vmx_vpid_lock); - if (vmx->vpid != 0) - __clear_bit(vmx->vpid, vmx_vpid_bitmap); - spin_unlock(&vmx_vpid_lock); + free_vpid(vmx); vmx_free_vmcs(vcpu); - kfree(vmx->host_msrs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vmx); } +static inline void vmcs_init(struct vmcs *vmcs) +{ + u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id())); + + if (!vmm_exclusive) + kvm_cpu_vmxon(phys_addr); + + vmcs_clear(vmcs); + + if (!vmm_exclusive) + kvm_cpu_vmxoff(); +} + static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) { int err; @@ -3802,18 +4488,15 @@ static struct kvm_vcpu *vmx_create_vcpu( goto uninit_vcpu; } - vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!vmx->host_msrs) - goto free_guest_msrs; - vmx->vmcs = alloc_vmcs(); if (!vmx->vmcs) goto free_msrs; - vmcs_clear(vmx->vmcs); + vmcs_init(vmx->vmcs); cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); + vmx->vcpu.cpu = cpu; err = vmx_vcpu_setup(vmx); vmx_vcpu_put(&vmx->vcpu); put_cpu(); @@ -3827,8 +4510,11 @@ static struct kvm_vcpu *vmx_create_vcpu( if (!kvm->arch.ept_identity_map_addr) kvm->arch.ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR; + err = -ENOMEM; if (alloc_identity_pagetable(kvm) != 0) goto free_vmcs; + if (!init_rmode_identity_map(kvm)) + goto free_vmcs; } return &vmx->vcpu; @@ -3836,12 +4522,11 @@ static struct kvm_vcpu *vmx_create_vcpu( free_vmcs: free_vmcs(vmx->vmcs); free_msrs: - kfree(vmx->host_msrs); -free_guest_msrs: kfree(vmx->guest_msrs); uninit_vcpu: kvm_vcpu_uninit(&vmx->vcpu); free_vcpu: + free_vpid(vmx); kmem_cache_free(kvm_vcpu_cache, vmx); return ERR_PTR(err); } @@ -3893,32 +4578,45 @@ static u64 vmx_get_mt_mask(struct kvm_vc return ret; } -static const struct trace_print_flags vmx_exit_reasons_str[] = { - { EXIT_REASON_EXCEPTION_NMI, "exception" }, - { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, - { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, - { EXIT_REASON_NMI_WINDOW, "nmi_window" }, - { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, - { EXIT_REASON_CR_ACCESS, "cr_access" }, - { EXIT_REASON_DR_ACCESS, "dr_access" }, - { EXIT_REASON_CPUID, "cpuid" }, - { EXIT_REASON_MSR_READ, "rdmsr" }, - { EXIT_REASON_MSR_WRITE, "wrmsr" }, - { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, - { EXIT_REASON_HLT, "halt" }, - { EXIT_REASON_INVLPG, "invlpg" }, - { EXIT_REASON_VMCALL, "hypercall" }, - { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, - { EXIT_REASON_APIC_ACCESS, "apic_access" }, - { EXIT_REASON_WBINVD, "wbinvd" }, - { EXIT_REASON_TASK_SWITCH, "task_switch" }, - { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, - { -1, NULL } -}; +static void vmx_cpuid_update(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + u32 exec_control; + + /* Exposing INVPCID only when PCID is exposed */ + best = kvm_find_cpuid_entry(vcpu, 0x7, 0); + if (vmx_invpcid_supported() && + best && (best->ebx & bit(X86_FEATURE_INVPCID)) && + guest_cpuid_has_pcid(vcpu)) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control |= SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } else { + if (cpu_has_secondary_exec_ctrls()) { + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, + exec_control); + } + if (best) + best->ebx &= ~bit(X86_FEATURE_INVPCID); + } +} -static bool vmx_gb_page_enable(void) +static int vmx_get_lpage_level(void) { - return false; + if (enable_ept && !cpu_has_vmx_ept_1g_page()) + return PT_DIRECTORY_LEVEL; + else + /* For shadow and EPT supported 1GB page */ + return PT_PDPE_LEVEL; +} + +static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu) +{ + if (ple_gap) + shrink_ple_window(vcpu); } static struct kvm_x86_ops vmx_x86_ops = { @@ -3947,6 +4645,7 @@ static struct kvm_x86_ops vmx_x86_ops = .set_segment = vmx_set_segment, .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, + .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr3 = vmx_set_cr3, @@ -3959,6 +4658,7 @@ static struct kvm_x86_ops vmx_x86_ops = .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, + .fpu_deactivate = vmx_fpu_deactivate, .tlb_flush = vmx_flush_tlb, @@ -3973,6 +4673,8 @@ static struct kvm_x86_ops vmx_x86_ops = .queue_exception = vmx_queue_exception, .interrupt_allowed = vmx_interrupt_allowed, .nmi_allowed = vmx_nmi_allowed, + .get_nmi_mask = vmx_get_nmi_mask, + .set_nmi_mask = vmx_set_nmi_mask, .enable_nmi_window = enable_nmi_window, .enable_irq_window = enable_irq_window, .update_cr8_intercept = update_cr8_intercept, @@ -3981,13 +4683,27 @@ static struct kvm_x86_ops vmx_x86_ops = .get_tdp_level = get_ept_level, .get_mt_mask = vmx_get_mt_mask, - .exit_reasons_str = vmx_exit_reasons_str, - .gb_page_enable = vmx_gb_page_enable, + .get_lpage_level = vmx_get_lpage_level, + + .cpuid_update = vmx_cpuid_update, + .invpcid_supported = vmx_invpcid_supported, + + .set_tsc_khz = vmx_set_tsc_khz, + .write_tsc_offset = vmx_write_tsc_offset, + .adjust_tsc_offset = vmx_adjust_tsc_offset, + .compute_tsc_offset = vmx_compute_tsc_offset, + + .sched_in = vmx_sched_in, }; static int __init vmx_init(void) { - int r; + int r, i; + + rdmsrl_safe(MSR_EFER, &host_efer); + + for (i = 0; i < NR_VMX_MSR; ++i) + kvm_define_shared_msr(i, vmx_msr_index[i]); vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx_io_bitmap_a) @@ -4025,10 +4741,16 @@ static int __init vmx_init(void) set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ - r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE); + r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), + __alignof__(struct vcpu_vmx), THIS_MODULE); if (r) goto out3; +#ifdef CONFIG_KEXEC + rcu_assign_pointer(crash_vmclear_loaded_vmcss, + crash_vmclear_local_loaded_vmcss); +#endif + vmx_disable_intercept_for_msr(MSR_FS_BASE, false); vmx_disable_intercept_for_msr(MSR_GS_BASE, false); vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true); @@ -4040,8 +4762,10 @@ static int __init vmx_init(void) bypass_guest_pf = 0; kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK); - kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, - VMX_EPT_EXECUTABLE_MASK); + kvm_mmu_set_mask_ptes(0ull, + (enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull, + (enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull, + 0ull, VMX_EPT_EXECUTABLE_MASK); kvm_enable_tdp(); } else kvm_disable_tdp(); @@ -4049,7 +4773,7 @@ static int __init vmx_init(void) if (bypass_guest_pf) kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); - ept_sync_global(); + update_ple_window_actual_max(); return 0; @@ -4071,6 +4795,11 @@ static void __exit vmx_exit(void) free_page((unsigned long)vmx_io_bitmap_b); free_page((unsigned long)vmx_io_bitmap_a); +#ifdef CONFIG_KEXEC + rcu_assign_pointer(crash_vmclear_loaded_vmcss, NULL); + synchronize_rcu(); +#endif + kvm_exit(); } diff -uprN linux-2.6.32/arch/x86/kvm/x86.c linux-2.6.32.ovz/arch/x86/kvm/x86.c --- linux-2.6.32/arch/x86/kvm/x86.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/x86.c 2015-03-10 13:24:15.000000000 -0700 @@ -37,6 +37,11 @@ #include #include #include +#include +#include +#include +#include +#include #include #undef TRACE_INCLUDE_FILE #define CREATE_TRACE_POINTS @@ -47,6 +52,9 @@ #include #include #include +#include +#include +#include #define MAX_IO_MSRS 256 #define CR0_RESERVED_BITS \ @@ -56,13 +64,15 @@ #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ - | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ + | X86_CR4_OSXSAVE \ + | X86_CR4_SMEP | X86_CR4_RDWRGSFS \ | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) #define KVM_MAX_MCE_BANKS 32 -#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P +#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) /* EFER defaults: * - enable syscall per default because its emulated by KVM @@ -87,6 +97,30 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops); int ignore_msrs = 0; module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); +bool kvm_has_tsc_control; +EXPORT_SYMBOL_GPL(kvm_has_tsc_control); +u32 kvm_max_guest_tsc_khz; +EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); + +#define KVM_NR_SHARED_MSRS 16 + +struct kvm_shared_msrs_global { + int nr; + struct kvm_shared_msr { + u32 msr; + u64 value; + } msrs[KVM_NR_SHARED_MSRS]; +}; + +struct kvm_shared_msrs { + struct user_return_notifier urn; + bool registered; + u64 current_value[KVM_NR_SHARED_MSRS]; +}; + +static struct kvm_shared_msrs_global __read_mostly shared_msrs_global; +static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs); + struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, { "pf_guest", VCPU_STAT(pf_guest) }, @@ -123,6 +157,74 @@ struct kvm_stats_debugfs_item debugfs_en { NULL } }; +u64 __read_mostly host_xcr0; + +static void kvm_on_user_return(struct user_return_notifier *urn) +{ + unsigned slot; + struct kvm_shared_msr *global; + struct kvm_shared_msrs *locals + = container_of(urn, struct kvm_shared_msrs, urn); + + for (slot = 0; slot < shared_msrs_global.nr; ++slot) { + global = &shared_msrs_global.msrs[slot]; + if (global->value != locals->current_value[slot]) { + wrmsrl(global->msr, global->value); + locals->current_value[slot] = global->value; + } + } + locals->registered = false; + user_return_notifier_unregister(urn); +} + +void kvm_define_shared_msr(unsigned slot, u32 msr) +{ + int cpu; + u64 value; + + if (slot >= shared_msrs_global.nr) + shared_msrs_global.nr = slot + 1; + shared_msrs_global.msrs[slot].msr = msr; + rdmsrl_safe(msr, &value); + shared_msrs_global.msrs[slot].value = value; + for_each_online_cpu(cpu) + per_cpu(shared_msrs, cpu).current_value[slot] = value; +} +EXPORT_SYMBOL_GPL(kvm_define_shared_msr); + +static void kvm_shared_msr_cpu_online(void) +{ + unsigned i; + struct kvm_shared_msrs *locals = &__get_cpu_var(shared_msrs); + + for (i = 0; i < shared_msrs_global.nr; ++i) + locals->current_value[i] = shared_msrs_global.msrs[i].value; +} + +void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask) +{ + struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); + + if (((value ^ smsr->current_value[slot]) & mask) == 0) + return; + smsr->current_value[slot] = value; + wrmsrl(shared_msrs_global.msrs[slot].msr, value); + if (!smsr->registered) { + smsr->urn.on_user_return = kvm_on_user_return; + user_return_notifier_register(&smsr->urn); + smsr->registered = true; + } +} +EXPORT_SYMBOL_GPL(kvm_set_shared_msr); + +static void drop_user_return_notifiers(void *ignore) +{ + struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs); + + if (smsr->registered) + kvm_on_user_return(&smsr->urn); +} + unsigned long segment_base(u16 selector) { struct descriptor_table gdt; @@ -170,12 +272,68 @@ void kvm_set_apic_base(struct kvm_vcpu * } EXPORT_SYMBOL_GPL(kvm_set_apic_base); +#define EXCPT_BENIGN 0 +#define EXCPT_CONTRIBUTORY 1 +#define EXCPT_PF 2 + +static int exception_class(int vector) +{ + switch (vector) { + case PF_VECTOR: + return EXCPT_PF; + case DE_VECTOR: + case TS_VECTOR: + case NP_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + return EXCPT_CONTRIBUTORY; + default: + break; + } + return EXCPT_BENIGN; +} + +static void kvm_multiple_exception(struct kvm_vcpu *vcpu, + unsigned nr, bool has_error, u32 error_code) +{ + u32 prev_nr; + int class1, class2; + + if (!vcpu->arch.exception.pending) { + queue: + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = has_error; + vcpu->arch.exception.nr = nr; + vcpu->arch.exception.error_code = error_code; + return; + } + + /* to check exception */ + prev_nr = vcpu->arch.exception.nr; + if (prev_nr == DF_VECTOR) { + /* triple fault -> shutdown */ + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); + return; + } + class1 = exception_class(prev_nr); + class2 = exception_class(nr); + if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) + || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) { + /* generate double fault per SDM Table 5-5 */ + vcpu->arch.exception.pending = true; + vcpu->arch.exception.has_error_code = true; + vcpu->arch.exception.nr = DF_VECTOR; + vcpu->arch.exception.error_code = 0; + } else + /* replace previous exception with a new one in a hope + that instruction re-execution will regenerate lost + exception */ + goto queue; +} + void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = false; - vcpu->arch.exception.nr = nr; + kvm_multiple_exception(vcpu, nr, false, 0); } EXPORT_SYMBOL_GPL(kvm_queue_exception); @@ -183,25 +341,6 @@ void kvm_inject_page_fault(struct kvm_vc u32 error_code) { ++vcpu->stat.pf_guest; - - if (vcpu->arch.exception.pending) { - switch(vcpu->arch.exception.nr) { - case DF_VECTOR: - /* triple fault -> shutdown */ - set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); - return; - case PF_VECTOR: - vcpu->arch.exception.nr = DF_VECTOR; - vcpu->arch.exception.error_code = 0; - return; - default: - /* replace previous exception with a new one in a hope - that instruction re-execution will regenerate lost - exception */ - vcpu->arch.exception.pending = false; - break; - } - } vcpu->arch.cr2 = addr; kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } @@ -214,11 +353,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { - WARN_ON(vcpu->arch.exception.pending); - vcpu->arch.exception.pending = true; - vcpu->arch.exception.has_error_code = true; - vcpu->arch.exception.nr = nr; - vcpu->arch.exception.error_code = error_code; + kvm_multiple_exception(vcpu, nr, true, error_code); } EXPORT_SYMBOL_GPL(kvm_queue_exception_e); @@ -294,138 +429,225 @@ out: return changed; } -void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) +int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { - if (cr0 & CR0_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", - cr0, vcpu->arch.cr0); - kvm_inject_gp(vcpu, 0); - return; - } + if (cr0 & CR0_RESERVED_BITS) + return 1; - if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { - printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n"); - kvm_inject_gp(vcpu, 0); - return; - } + cr0 |= X86_CR0_ET; - if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { - printk(KERN_DEBUG "set_cr0: #GP, set PG flag " - "and a clear PE flag\n"); - kvm_inject_gp(vcpu, 0); - return; - } + if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) + return 1; + + if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) + return 1; if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { #ifdef CONFIG_X86_64 if ((vcpu->arch.shadow_efer & EFER_LME)) { int cs_db, cs_l; - if (!is_pae(vcpu)) { - printk(KERN_DEBUG "set_cr0: #GP, start paging " - "in long mode while PAE is disabled\n"); - kvm_inject_gp(vcpu, 0); - return; - } + if (!is_pae(vcpu)) + return 1; kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - if (cs_l) { - printk(KERN_DEBUG "set_cr0: #GP, start paging " - "in long mode while CS.L == 1\n"); - kvm_inject_gp(vcpu, 0); - return; - - } + if (cs_l) + return 1; } else #endif - if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_DEBUG "set_cr0: #GP, pdptrs " - "reserved bits\n"); - kvm_inject_gp(vcpu, 0); - return; - } + if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) + return 1; } + if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) + return 1; + kvm_x86_ops->set_cr0(vcpu, cr0); vcpu->arch.cr0 = cr0; kvm_mmu_reset_context(vcpu); - return; + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr0); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) { - kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); + (void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f)); } EXPORT_SYMBOL_GPL(kvm_lmsw); -void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && + !vcpu->guest_xcr0_loaded) { + /* kvm_set_xcr() also depends on this */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); + vcpu->guest_xcr0_loaded = 1; + } +} + +static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +{ + if (vcpu->guest_xcr0_loaded) { + if (vcpu->arch.xcr0 != host_xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0); + vcpu->guest_xcr0_loaded = 0; + } +} + +int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { - unsigned long old_cr4 = vcpu->arch.cr4; - unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; + u64 xcr0; - if (cr4 & CR4_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n"); + /* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */ + if (index != XCR_XFEATURE_ENABLED_MASK) + return 1; + xcr0 = xcr; + if (kvm_x86_ops->get_cpl(vcpu) != 0) + return 1; + if (!(xcr0 & XSTATE_FP)) + return 1; + if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE)) + return 1; + if (xcr0 & ~host_xcr0) + return 1; + kvm_put_guest_xcr0(vcpu); + vcpu->arch.xcr0 = xcr0; + return 0; +} + +int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) +{ + if (__kvm_set_xcr(vcpu, index, xcr)) { kvm_inject_gp(vcpu, 0); + return 1; + } + return 0; +} +EXPORT_SYMBOL_GPL(kvm_set_xcr); + +static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + if (!static_cpu_has(X86_FEATURE_XSAVE)) + return 0; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->ecx & bit(X86_FEATURE_XSAVE)); +} + +static void update_cpuid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + struct kvm_lapic *apic = vcpu->arch.apic; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + if (!best) return; + + /* Update OSXSAVE bit */ + if (cpu_has_xsave && best->function == 0x1) { + best->ecx &= ~(bit(X86_FEATURE_OSXSAVE)); + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) + best->ecx |= bit(X86_FEATURE_OSXSAVE); } + if (apic) { + if (best->ecx & bit(X86_FEATURE_TSC_DEADLINE_TIMER)) + apic->lapic_timer.timer_mode_mask = 3 << 17; + else + apic->lapic_timer.timer_mode_mask = 1 << 17; + } + + kvm_pmu_cpuid_update(vcpu); +} + +static bool guest_cpuid_has_smep(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_SMEP)); +} + +static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); +} + +int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) +{ + unsigned long old_cr4 = kvm_read_cr4(vcpu); + unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | + X86_CR4_PAE | X86_CR4_SMEP; + if (cr4 & CR4_RESERVED_BITS) + return 1; + + if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE)) + return 1; + + if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP)) + return 1; + + if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS)) + return 1; + if (is_long_mode(vcpu)) { - if (!(cr4 & X86_CR4_PAE)) { - printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while " - "in long mode\n"); - kvm_inject_gp(vcpu, 0); - return; - } + if (!(cr4 & X86_CR4_PAE)) + return 1; } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) && ((cr4 ^ old_cr4) & pdptr_bits) - && !load_pdptrs(vcpu, vcpu->arch.cr3)) { - printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n"); - kvm_inject_gp(vcpu, 0); - return; - } + && !load_pdptrs(vcpu, vcpu->arch.cr3)) + return 1; - if (cr4 & X86_CR4_VMXE) { - printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n"); - kvm_inject_gp(vcpu, 0); - return; + if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) { + if (!guest_cpuid_has_pcid(vcpu)) + return 1; + + /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ + if ((vcpu->arch.cr3 & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) + return 1; } + + if (cr4 & X86_CR4_VMXE) + return 1; kvm_x86_ops->set_cr4(vcpu, cr4); vcpu->arch.cr4 = cr4; vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled; kvm_mmu_reset_context(vcpu); + + if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE) + update_cpuid(vcpu); + + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr4); -void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) +int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { kvm_mmu_sync_roots(vcpu); kvm_mmu_flush_tlb(vcpu); - return; + return 0; } if (is_long_mode(vcpu)) { - if (cr3 & CR3_L_MODE_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n"); - kvm_inject_gp(vcpu, 0); - return; - } + if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) { + if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS) + return 1; + } else + if (cr3 & CR3_L_MODE_RESERVED_BITS) + return 1; } else { if (is_pae(vcpu)) { - if (cr3 & CR3_PAE_RESERVED_BITS) { - printk(KERN_DEBUG - "set_cr3: #GP, reserved bits\n"); - kvm_inject_gp(vcpu, 0); - return; - } - if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { - printk(KERN_DEBUG "set_cr3: #GP, pdptrs " - "reserved bits\n"); - kvm_inject_gp(vcpu, 0); - return; - } + if (cr3 & CR3_PAE_RESERVED_BITS) + return 1; + if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) + return 1; } /* * We don't check reserved bits in nonpae mode, because @@ -443,25 +665,22 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, * to debug) behavior on the guest side. */ if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) - kvm_inject_gp(vcpu, 0); - else { - vcpu->arch.cr3 = cr3; - vcpu->arch.mmu.new_cr3(vcpu); - } + return 1; + vcpu->arch.cr3 = cr3; + vcpu->arch.mmu.new_cr3(vcpu); + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr3); -void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) +int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) { - if (cr8 & CR8_RESERVED_BITS) { - printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8); - kvm_inject_gp(vcpu, 0); - return; - } + if (cr8 & CR8_RESERVED_BITS) + return 1; if (irqchip_in_kernel(vcpu->kvm)) kvm_lapic_set_tpr(vcpu, cr8); else vcpu->arch.cr8 = cr8; + return 0; } EXPORT_SYMBOL_GPL(kvm_set_cr8); @@ -474,81 +693,75 @@ unsigned long kvm_get_cr8(struct kvm_vcp } EXPORT_SYMBOL_GPL(kvm_get_cr8); -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - /* * List of msr numbers which we expose to userspace through KVM_GET_MSRS * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. * * This list is modified at module load time to reflect the - * capabilities of the host cpu. + * capabilities of the host cpu. This capabilities test skips MSRs that are + * kvm-specific. Those are put in the beginning of the list. */ + +#define KVM_SAVE_MSRS_BEGIN 8 static u32 msrs_to_save[] = { + MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, + MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, + HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, + MSR_KVM_STEAL_TIME, + MSR_KVM_PV_EOI_EN, MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, MSR_K6_STAR, #ifdef CONFIG_X86_64 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, #endif - MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, - MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA + MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA }; static unsigned num_msrs_to_save; static u32 emulated_msrs[] = { + MSR_IA32_TSCDEADLINE, MSR_IA32_MISC_ENABLE, + MSR_IA32_MCG_STATUS, + MSR_IA32_MCG_CTL, }; -static void set_efer(struct kvm_vcpu *vcpu, u64 efer) +static int set_efer(struct kvm_vcpu *vcpu, u64 efer) { - if (efer & efer_reserved_bits) { - printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n", - efer); - kvm_inject_gp(vcpu, 0); - return; - } + if (efer & efer_reserved_bits) + return 1; if (is_paging(vcpu) - && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { - printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n"); - kvm_inject_gp(vcpu, 0); - return; - } + && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) + return 1; if (efer & EFER_FFXSR) { struct kvm_cpuid_entry2 *feat; feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { - printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n"); - kvm_inject_gp(vcpu, 0); - return; - } + if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) + return 1; } if (efer & EFER_SVME) { struct kvm_cpuid_entry2 *feat; feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); - if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { - printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n"); - kvm_inject_gp(vcpu, 0); - return; - } + if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) + return 1; } - kvm_x86_ops->set_efer(vcpu, efer); - efer &= ~EFER_LMA; efer |= vcpu->arch.shadow_efer & EFER_LMA; + kvm_x86_ops->set_efer(vcpu, efer); + vcpu->arch.shadow_efer = efer; vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; kvm_mmu_reset_context(vcpu); + + return 0; } void kvm_enable_efer_bits(u64 mask) @@ -563,42 +776,72 @@ EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); * Returns 0 on success, non-0 otherwise. * Assumes vcpu_load() was already called. */ -int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) +int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) +{ + return kvm_x86_ops->set_msr(vcpu, msr); +} + +bool kvm_rdpmc(struct kvm_vcpu *vcpu) { - return kvm_x86_ops->set_msr(vcpu, msr_index, data); + u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + u64 data; + int err; + + err = kvm_pmu_read_pmc(vcpu, ecx, &data); + if (err) + return err; + kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data); + kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32); + return err; } +EXPORT_SYMBOL_GPL(kvm_rdpmc); /* * Adapt set_msr() to msr_io()'s calling convention */ static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) { - return kvm_set_msr(vcpu, index, *data); + struct msr_data msr; + + msr.data = *data; + msr.index = index; + msr.host_initiated = true; + return kvm_set_msr(vcpu, &msr); } static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) { - static int version; + int version; + int r; struct pvclock_wall_clock wc; - struct timespec now, sys, boot; + struct timespec boot; if (!wall_clock) return; - version++; + r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version)); + if (r) + return; + + if (version & 1) + ++version; /* first time write, random junk */ + + ++version; kvm_write_guest(kvm, wall_clock, &version, sizeof(version)); /* * The guest calculates current wall clock time by adding - * system time (updated by kvm_write_guest_time below) to the + * system time (updated by kvm_guest_time_update below) to the * wall clock specified here. guest system time equals host * system time for us, thus we must fill in host boot time here. */ - now = current_kernel_time(); - ktime_get_ts(&sys); - boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys)); + getboottime(&boot); + if (kvm->arch.kvmclock_offset) { + struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset); + boot = timespec_sub(boot, ts); + } wc.sec = boot.tv_sec; wc.nsec = boot.tv_nsec; wc.version = version; @@ -621,63 +864,201 @@ static uint32_t div_frac(uint32_t divide return quotient; } -static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) +static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz, + s8 *pshift, u32 *pmultiplier) { - uint64_t nsecs = 1000000000LL; + uint64_t scaled64; int32_t shift = 0; uint64_t tps64; uint32_t tps32; - tps64 = tsc_khz * 1000LL; - while (tps64 > nsecs*2) { + tps64 = base_khz * 1000LL; + scaled64 = scaled_khz * 1000LL; + while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { tps64 >>= 1; shift--; } tps32 = (uint32_t)tps64; - while (tps32 <= (uint32_t)nsecs) { - tps32 <<= 1; + while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { + if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) + scaled64 >>= 1; + else + tps32 <<= 1; shift++; } - hv_clock->tsc_shift = shift; - hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); + *pshift = shift; + *pmultiplier = div_frac(scaled64, tps32); - pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", - __func__, tsc_khz, hv_clock->tsc_shift, - hv_clock->tsc_to_system_mul); + pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n", + __func__, base_khz, scaled_khz, shift, *pmultiplier); } static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); +unsigned long max_tsc_khz; -static void kvm_write_guest_time(struct kvm_vcpu *v) +static inline u64 get_kernel_ns(void) { struct timespec ts; + + WARN_ON(preemptible()); + ktime_get_ts(&ts); + monotonic_to_bootbased(&ts); + return timespec_to_ns(&ts); +} + +u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.virtual_tsc_khz) + return vcpu->arch.virtual_tsc_khz; + else + return __get_cpu_var(cpu_tsc_khz); +} + +static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) +{ + u64 ret; + + ret = nsec * vcpu_tsc_khz(vcpu); + do_div(ret, USEC_PER_SEC); + return ret; +} + +static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz) +{ + /* Compute a scale to convert nanoseconds in TSC cycles */ + kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, + &vcpu->arch.tsc_catchup_shift, + &vcpu->arch.tsc_catchup_mult); +} + +static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) +{ + u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, + vcpu->arch.tsc_catchup_mult, + vcpu->arch.tsc_catchup_shift); + tsc += vcpu->arch.last_tsc_write; + return tsc; +} + +void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) +{ + struct kvm *kvm = vcpu->kvm; + u64 offset, ns, elapsed; + unsigned long flags; + s64 sdiff; + u64 delta; + u64 data = msr->data; + + spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); + offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); + ns = get_kernel_ns(); + elapsed = ns - kvm->arch.last_tsc_nsec; + sdiff = data - kvm->arch.last_tsc_write; + if (sdiff < 0) + sdiff = -sdiff; + + /* + * Special case: TSC write with a small delta (1 second) of virtual + * cycle time against real time is interpreted as an attempt + * to synchronize the CPU. + * + * For a reliable TSC, we can match TSC offsets, and for an + * unstable TSC, we will write the update, but ignore elapsed time + * in this computation. The reason for this is that unstable TSC + * will be compensated by the catchup code, and guest loops which + * continually write the TSC could end up overshooting the TSC if + * the elapsed time is factored in. + */ + delta = nsec_to_cycles(vcpu, elapsed); + sdiff -= delta; + if (sdiff < 0) + sdiff = -sdiff; + if (sdiff < nsec_to_cycles(vcpu, 1 * NSEC_PER_SEC) ) { + if (!check_tsc_unstable()) { + offset = kvm->arch.last_tsc_offset; + pr_debug("kvm: matched tsc offset for %llu\n", data); + } else { + /* Unstable write; don't add elapsed time */ + pr_debug("kvm: matched write on unstable tsc\n"); + } + } else { + kvm->arch.last_tsc_nsec = ns; + kvm->arch.last_tsc_write = data; + kvm->arch.last_tsc_offset = offset; + } + kvm_x86_ops->write_tsc_offset(vcpu, offset); + spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + + /* Reset of TSC must disable overshoot protection below */ + vcpu->arch.hv_clock.tsc_timestamp = 0; + vcpu->arch.last_tsc_write = data; + vcpu->arch.last_tsc_nsec = ns; +} +EXPORT_SYMBOL_GPL(kvm_write_tsc); + +static int kvm_guest_time_update(struct kvm_vcpu *v) +{ unsigned long flags; struct kvm_vcpu_arch *vcpu = &v->arch; - void *shared_kaddr; unsigned long this_tsc_khz; + s64 kernel_ns; + u64 tsc_timestamp; + struct pvclock_vcpu_time_info guest_hv_clock; + u8 pvclock_flags; + bool use_cached_fc; + gpa_t time; - if ((!vcpu->time_page)) - return; + /* Keep irq disabled to prevent changes to the clock */ + local_irq_save(flags); + kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); + kernel_ns = get_kernel_ns(); + this_tsc_khz = vcpu_tsc_khz(v); + + if (unlikely(this_tsc_khz == 0)) { + local_irq_restore(flags); + set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests); + return 1; + } - this_tsc_khz = get_cpu_var(cpu_tsc_khz); - if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { - kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); - vcpu->hv_clock_tsc_khz = this_tsc_khz; + /* + * We may have to catch up the TSC to match elapsed wall clock + * time for two reasons, even if kvmclock is used. + * 1) CPU could have been running below the maximum TSC rate + * 2) Broken TSC compensation resets the base at each VCPU + * entry to avoid unknown leaps of TSC even when running + * again on the same CPU. This may cause apparent elapsed + * time to disappear, and the guest to stand still or run + * very slowly. + */ + if (vcpu->tsc_catchup) { + u64 tsc = compute_guest_tsc(v, kernel_ns); + if (tsc > tsc_timestamp) { + adjust_tsc_offset_guest(v, tsc - tsc_timestamp); + tsc_timestamp = tsc; + } } - put_cpu_var(cpu_tsc_khz); - /* Keep irq disabled to prevent changes to the clock */ - local_irq_save(flags); - kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); - ktime_get_ts(&ts); local_irq_restore(flags); + if (!vcpu->pv_time_enabled) + return 0; + + if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { + kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, + &vcpu->hv_clock.tsc_shift, + &vcpu->hv_clock.tsc_to_system_mul); + vcpu->hw_tsc_khz = this_tsc_khz; + } + /* With all the info we got, fill in the values */ + vcpu->hv_clock.tsc_timestamp = tsc_timestamp; + vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; + vcpu->last_kernel_ns = kernel_ns; + vcpu->last_guest_tsc = tsc_timestamp; + - vcpu->hv_clock.system_time = ts.tv_nsec + - (NSEC_PER_SEC * (u64)ts.tv_sec); /* * The interface expects us to write an even number signaling that the * update is finished. Since the guest won't see the intermediate @@ -685,24 +1066,91 @@ static void kvm_write_guest_time(struct */ vcpu->hv_clock.version += 2; - shared_kaddr = kmap_atomic(vcpu->time_page, KM_USER0); + time = vcpu->time & ~1ULL; + use_cached_fc = sizeof(guest_hv_clock) <= (PAGE_SIZE - + offset_in_page(time)); + + if (likely(use_cached_fc)) { + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, + &guest_hv_clock, sizeof(guest_hv_clock)))) + return -EFAULT; + } else if (unlikely(kvm_read_guest(v->kvm, time, + &guest_hv_clock, sizeof(guest_hv_clock)))) + return -EFAULT; + + /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ + pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED); + + if (vcpu->pvclock_set_guest_stopped_request) { + pvclock_flags |= PVCLOCK_GUEST_STOPPED; + vcpu->pvclock_set_guest_stopped_request = false; + } + + vcpu->hv_clock.flags = pvclock_flags; + + if (likely(use_cached_fc)) + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); + else + kvm_write_guest(v->kvm, time, &vcpu->hv_clock, + sizeof(guest_hv_clock)); + + return 0; +} + +/* + * kvmclock updates which are isolated to a given vcpu, such as + * vcpu->cpu migration, should not allow system_timestamp from + * the rest of the vcpus to remain static. Otherwise ntp frequency + * correction applies to one vcpu's system_timestamp but not + * the others. + * + * So in those cases, request a kvmclock update for all vcpus. + * We need to rate-limit these requests though, as they can + * considerably slow guests that have a large number of vcpus. + * The time for a remote vcpu to update its kvmclock is bound + * by the delay we use to rate-limit the updates. + */ - memcpy(shared_kaddr + vcpu->time_offset, &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); +#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) - kunmap_atomic(shared_kaddr, KM_USER0); +static void kvmclock_update_fn(struct work_struct *work) +{ + int i; + struct delayed_work *dwork = to_delayed_work(work); + struct kvm_arch *ka = container_of(dwork, struct kvm_arch, + kvmclock_update_work); + struct kvm *kvm = container_of(ka, struct kvm, arch); + struct kvm_vcpu *vcpu; - mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); + kvm_for_each_vcpu(i, vcpu, kvm) { + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + kvm_vcpu_kick(vcpu); + } } -static int kvm_request_guest_time_update(struct kvm_vcpu *v) +static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) { - struct kvm_vcpu_arch *vcpu = &v->arch; + struct kvm *kvm = v->kvm; - if (!vcpu->time_page) - return 0; - set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests); - return 1; + set_bit(KVM_REQ_CLOCK_UPDATE, &v->requests); + schedule_delayed_work(&kvm->arch.kvmclock_update_work, + KVMCLOCK_UPDATE_DELAY); +} + +#define KVMCLOCK_SYNC_PERIOD (300 * HZ) + +static void kvmclock_sync_fn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct kvm_arch *ka = container_of(dwork, struct kvm_arch, + kvmclock_sync_work); + struct kvm *kvm = container_of(ka, struct kvm, arch); + + schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0); + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); } static bool msr_mtrr_valid(unsigned msr) @@ -835,12 +1283,113 @@ static int set_msr_mce(struct kvm_vcpu * return 0; } -int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) +static void accumulate_steal_time(struct kvm_vcpu *vcpu) +{ + u64 delta; + + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) + return; + + delta = current->sched_info.run_delay - vcpu->arch.st.last_steal; + vcpu->arch.st.last_steal = current->sched_info.run_delay; + vcpu->arch.st.accum_steal = delta; +} + +static void record_steal_time(struct kvm_vcpu *vcpu) { + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) + return; + + if (unlikely(kvm_read_guest(vcpu->kvm, vcpu->arch.st.stime, + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) + return; + + vcpu->arch.st.steal.steal += vcpu->arch.st.accum_steal; + vcpu->arch.st.steal.version += 2; + vcpu->arch.st.accum_steal = 0; + + kvm_write_guest(vcpu->kvm, vcpu->arch.st.stime, + &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); +} + +static bool kvm_hv_msr_partition_wide(u32 msr) +{ + bool r = false; + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + case HV_X64_MSR_HYPERCALL: + r = true; + break; + } + + return r; +} + +static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + kvm->arch.hv_guest_os_id = data; + /* setting guest os id to zero disables hypercall page */ + if (!kvm->arch.hv_guest_os_id) + kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE; + break; + case HV_X64_MSR_HYPERCALL: { + u64 gfn; + unsigned long addr; + u8 instructions[11]; + + /* if guest os id is not set hypercall should remain disabled */ + if (!kvm->arch.hv_guest_os_id) + break; + if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) { + kvm->arch.hv_hypercall = data; + break; + } + gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT; + addr = gfn_to_hva(kvm, gfn); + if (kvm_is_error_hva(addr)) + return 1; + ((unsigned char *)instructions)[0] = 0xb8; /* mov eax, 0x02 */ + ((unsigned char *)instructions)[1] = 0x02; + ((unsigned char *)instructions)[2] = 0x00; + ((unsigned char *)instructions)[3] = 0x00; + ((unsigned char *)instructions)[4] = 0x00; + ((unsigned char *)instructions)[5] = 0xba; /* mov edx, 0 */ + ((unsigned char *)instructions)[6] = 0x00; + ((unsigned char *)instructions)[7] = 0x00; + ((unsigned char *)instructions)[8] = 0x00; + ((unsigned char *)instructions)[9] = 0x00; + ((unsigned char *)instructions)[10] = 0xc3; /* ret */ + if (__copy_to_user((void __user *)addr, instructions, 11)) + return 1; + kvm->arch.hv_hypercall = data; + break; + } + default: + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + return 1; + } + return 0; +} + +static void kvmclock_reset(struct kvm_vcpu *vcpu) +{ + vcpu->arch.pv_time_enabled = false; +} + +int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) +{ + bool pr = false; + u32 msr = msr_info->index; + u64 data = msr_info->data; + switch (msr) { case MSR_EFER: - set_efer(vcpu, data); - break; + return set_efer(vcpu, data); case MSR_K7_HWCR: data &= ~(u64)0x40; /* ignore flush filter disable */ if (data != 0) { @@ -882,39 +1431,64 @@ int kvm_set_msr_common(struct kvm_vcpu * break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: return kvm_x2apic_msr_write(vcpu, msr, data); + case MSR_IA32_TSCDEADLINE: + kvm_set_lapic_tscdeadline_msr(vcpu, data); + break; case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; + case MSR_KVM_WALL_CLOCK_NEW: case MSR_KVM_WALL_CLOCK: vcpu->kvm->arch.wall_clock = data; kvm_write_wall_clock(vcpu->kvm, data); break; + case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + kvmclock_reset(vcpu); vcpu->arch.time = data; + set_bit(KVM_REQ_GLOBAL_CLOCK_UPDATE, &vcpu->requests); /* we verify if the enable bit is set... */ if (!(data & 1)) break; - /* ...but clean it before doing the actual write */ - vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, + &vcpu->arch.pv_time, data & ~1ULL)) + vcpu->arch.pv_time_enabled = false; + else + vcpu->arch.pv_time_enabled = true; - vcpu->arch.time_page = - gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); + break; + } + case MSR_KVM_STEAL_TIME: - if (is_error_page(vcpu->arch.time_page)) { - kvm_release_page_clean(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + if (unlikely(!sched_info_on())) + return 1; + + if (data & KVM_STEAL_RESERVED_MASK) + return 1; + + vcpu->arch.st.stime = data & KVM_STEAL_VALID_BITS; + vcpu->arch.st.msr_val = data; + + if (!(data & KVM_MSR_ENABLED)) + break; + + vcpu->arch.st.last_steal = current->sched_info.run_delay; + + preempt_disable(); + accumulate_steal_time(vcpu); + preempt_enable(); + + set_bit(KVM_REQ_STEAL_UPDATE, &vcpu->requests); - kvm_request_guest_time_update(vcpu); break; - } + case MSR_KVM_PV_EOI_EN: + if (kvm_lapic_enable_pv_eoi(vcpu, data)) + return 1; + break; + case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: @@ -927,8 +1501,6 @@ int kvm_set_msr_common(struct kvm_vcpu * * which we perfectly emulate ;-). Any other value should be at least * reported, some guests depend on them. */ - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: case MSR_K7_EVNTSEL0: case MSR_K7_EVNTSEL1: case MSR_K7_EVNTSEL2: @@ -940,8 +1512,6 @@ int kvm_set_msr_common(struct kvm_vcpu * /* at least RHEL 4 unconditionally writes to the perfctr registers, * so we ignore writes to make it happy. */ - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: case MSR_K7_PERFCTR0: case MSR_K7_PERFCTR1: case MSR_K7_PERFCTR2: @@ -949,7 +1519,48 @@ int kvm_set_msr_common(struct kvm_vcpu * pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " "0x%x data 0x%llx\n", msr, data); break; + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + pr = true; + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + if (kvm_pmu_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr_info); + + if (pr || data != 0) + pr_unimpl(vcpu, "disabled perfctr wrmsr: " + "0x%x data 0x%llx\n", msr, data); + break; + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = set_msr_hyperv_pw(vcpu, msr, data); + mutex_unlock(&vcpu->kvm->lock); + return r; + } + pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x " + "data 0x%llx\n", msr, data); + break; + case MSR_K7_CLK_CTL: + /* + * Ignore all writes to this no longer documented MSR. + * Writes are only relevant for old K7 processors, + * all pre-dating SVM, but a recommended workaround from + * AMD for these chips. It is possible to speicify the + * affected processor models on the command line, hence + * the need to ignore the workaround. + */ + break; + case MSR_IA32_BBL_CR_CTL3: + /* Drop writes to this undocumented MSR -- see rdmsr + * counterpart for further detail. + */ + pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); + break; default: + if (kvm_pmu_msr(vcpu, msr)) + return kvm_pmu_set_msr(vcpu, msr_info); if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); @@ -1046,13 +1657,54 @@ static int get_msr_mce(struct kvm_vcpu * return 0; } +static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + struct kvm *kvm = vcpu->kvm; + + switch (msr) { + case HV_X64_MSR_GUEST_OS_ID: + data = kvm->arch.hv_guest_os_id; + break; + case HV_X64_MSR_HYPERCALL: + data = kvm->arch.hv_hypercall; + break; + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + + *pdata = data; + return 0; +} + +static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + u64 data = 0; + + switch (msr) { + case HV_X64_MSR_VP_INDEX: { + int r; + struct kvm_vcpu *v; + kvm_for_each_vcpu(r, v, vcpu->kvm) + if (v == vcpu) + data = r; + break; + } + default: + pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); + return 1; + } + *pdata = data; + return 0; +} + int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; switch (msr) { case MSR_IA32_PLATFORM_ID: - case MSR_IA32_UCODE_REV: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_DEBUGCTLMSR: case MSR_IA32_LASTBRANCHFROMIP: @@ -1062,10 +1714,6 @@ int kvm_get_msr_common(struct kvm_vcpu * case MSR_K8_SYSCFG: case MSR_K7_HWCR: case MSR_VM_HSAVE_PA: - case MSR_P6_PERFCTR0: - case MSR_P6_PERFCTR1: - case MSR_P6_EVNTSEL0: - case MSR_P6_EVNTSEL1: case MSR_K7_EVNTSEL0: case MSR_K7_PERFCTR0: case MSR_K8_INT_PENDING_MSG: @@ -1073,6 +1721,17 @@ int kvm_get_msr_common(struct kvm_vcpu * case MSR_FAM10H_MMIO_CONF_BASE: data = 0; break; + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + if (kvm_pmu_msr(vcpu, msr)) + return kvm_pmu_get_msr(vcpu, msr, pdata); + data = 0; + break; + case MSR_IA32_UCODE_REV: + data = 0x100000000ULL; + break; case MSR_MTRRcap: data = 0x500 | KVM_NR_VAR_MTRR; break; @@ -1081,12 +1740,29 @@ int kvm_get_msr_common(struct kvm_vcpu * case 0xcd: /* fsb frequency */ data = 3; break; + /* + * MSR_EBC_FREQUENCY_ID + * Conservative value valid for even the basic CPU models. + * Models 0,1: 000 in bits 23:21 indicating a bus speed of + * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, + * and 266MHz for model 3, or 4. Set Core Clock + * Frequency to System Bus Frequency Ratio to 1 (bits + * 31:24) even though these are only valid for CPU + * models > 2, however guests may end up dividing or + * multiplying by zero otherwise. + */ + case MSR_EBC_FREQUENCY_ID: + data = 1 << 24; + break; case MSR_IA32_APICBASE: data = kvm_get_apic_base(vcpu); break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: return kvm_x2apic_msr_read(vcpu, msr, pdata); break; + case MSR_IA32_TSCDEADLINE: + data = kvm_get_lapic_tscdeadline_msr(vcpu); + break; case MSR_IA32_MISC_ENABLE: data = vcpu->arch.ia32_misc_enable_msr; break; @@ -1100,11 +1776,19 @@ int kvm_get_msr_common(struct kvm_vcpu * data = vcpu->arch.shadow_efer; break; case MSR_KVM_WALL_CLOCK: + case MSR_KVM_WALL_CLOCK_NEW: data = vcpu->kvm->arch.wall_clock; break; case MSR_KVM_SYSTEM_TIME: + case MSR_KVM_SYSTEM_TIME_NEW: data = vcpu->arch.time; break; + case MSR_KVM_STEAL_TIME: + data = vcpu->arch.st.msr_val; + break; + case MSR_KVM_PV_EOI_EN: + data = vcpu->arch.pv_eoi.msr_val; + break; case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: @@ -1112,7 +1796,44 @@ int kvm_get_msr_common(struct kvm_vcpu * case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: return get_msr_mce(vcpu, msr, pdata); + case MSR_K7_CLK_CTL: + /* + * Provide expected ramp-up count for K7. All other + * are set to zero, indicating minimum divisors for + * every field. + * + * This prevents guest kernels on AMD host with CPU + * type 6, model 8 and higher from exploding due to + * the rdmsr failing. + */ + data = 0x20000000; + break; + case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + if (kvm_hv_msr_partition_wide(msr)) { + int r; + mutex_lock(&vcpu->kvm->lock); + r = get_msr_hyperv_pw(vcpu, msr, pdata); + mutex_unlock(&vcpu->kvm->lock); + return r; + } else + return get_msr_hyperv(vcpu, msr, pdata); + break; + case MSR_IA32_BBL_CR_CTL3: + /* This legacy MSR exists but isn't documented in current + * silicon. It is however accessed by winxp in certain + * scenarios where it sets bit #19, itself documented as + * a "reserved" bit. Best effort attempt to source coherent + * read data here should the balance of the register be + * interpreted by the guest: + * + * L2 cache control register 3: 64GB range, 256KB size, + * enabled, latency 0x1, configured + */ + data = 0xbe702111; + break; default: + if (kvm_pmu_msr(vcpu, msr)) + return kvm_pmu_get_msr(vcpu, msr, pdata); if (!ignore_msrs) { pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); return 1; @@ -1137,15 +1858,15 @@ static int __msr_io(struct kvm_vcpu *vcp int (*do_msr)(struct kvm_vcpu *vcpu, unsigned index, u64 *data)) { - int i; + int i, idx; vcpu_load(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); for (i = 0; i < msrs->nmsrs; ++i) if (do_msr(vcpu, entries[i].index, &entries[i].data)) break; - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); vcpu_put(vcpu); @@ -1224,6 +1945,13 @@ int kvm_dev_ioctl_check_extension(long e case KVM_CAP_PIT2: case KVM_CAP_PIT_STATE2: case KVM_CAP_SET_IDENTITY_MAP_ADDR: + case KVM_CAP_ADJUST_CLOCK: + case KVM_CAP_VCPU_EVENTS: + case KVM_CAP_HYPERV: + case KVM_CAP_PCI_SEGMENT: + case KVM_CAP_XSAVE: + case KVM_CAP_GET_TSC_KHZ: + case KVM_CAP_KVMCLOCK_CTRL: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -1233,13 +1961,13 @@ int kvm_dev_ioctl_check_extension(long e r = !kvm_x86_ops->cpu_has_accelerated_tpr(); break; case KVM_CAP_NR_VCPUS: - r = KVM_MAX_VCPUS; + r = KVM_MAX_VCPU_COUNT; break; case KVM_CAP_NR_MEMSLOTS: r = KVM_MEMORY_SLOTS; break; - case KVM_CAP_PV_MMU: - r = !tdp_enabled; + case KVM_CAP_PV_MMU: /* obsolete */ + r = 0; break; case KVM_CAP_IOMMU: r = iommu_found(); @@ -1247,6 +1975,15 @@ int kvm_dev_ioctl_check_extension(long e case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; break; + case KVM_CAP_XCRS: + r = cpu_has_xsave; + break; + case KVM_CAP_TSC_CONTROL: + r = kvm_has_tsc_control; + break; + case KVM_CAP_TSC_DEADLINE_TIMER: + r = boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER); + break; default: r = 0; break; @@ -1326,13 +2063,39 @@ out: void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { kvm_x86_ops->vcpu_load(vcpu, cpu); - kvm_request_guest_time_update(vcpu); + + /* Apply any externally detected TSC adjustments (due to suspend) */ + if (unlikely(vcpu->arch.tsc_offset_adjustment)) { + adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); + vcpu->arch.tsc_offset_adjustment = 0; + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + } + if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { + /* Make sure TSC doesn't go backwards */ + s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : + native_read_tsc() - vcpu->arch.last_host_tsc; + if (tsc_delta < 0) + WARN_ON(!check_tsc_unstable()); + if (check_tsc_unstable()) { + adjust_tsc_offset_host(vcpu, -tsc_delta); + vcpu->arch.tsc_catchup = 1; + } + set_bit(KVM_REQ_GLOBAL_CLOCK_UPDATE, &vcpu->requests); + if (vcpu->cpu != cpu) + kvm_migrate_timers(vcpu); + vcpu->cpu = cpu; + } + + accumulate_steal_time(vcpu); + set_bit(KVM_REQ_STEAL_UPDATE, &vcpu->requests); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); + vcpu->arch.last_host_tsc = native_read_tsc(); + kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); + kvm_x86_ops->vcpu_put(vcpu); } static int is_efer_nx(void) @@ -1381,6 +2144,7 @@ static int kvm_vcpu_ioctl_set_cpuid(stru if (copy_from_user(cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry))) goto out_free; + vcpu_load(vcpu); for (i = 0; i < cpuid->nent; i++) { vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; @@ -1397,6 +2161,9 @@ static int kvm_vcpu_ioctl_set_cpuid(stru cpuid_fix_nx_cap(vcpu); r = 0; kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); + update_cpuid(vcpu); + vcpu_put(vcpu); out_free: vfree(cpuid_entries); @@ -1417,8 +2184,12 @@ static int kvm_vcpu_ioctl_set_cpuid2(str if (copy_from_user(&vcpu->arch.cpuid_entries, entries, cpuid->nent * sizeof(struct kvm_cpuid_entry2))) goto out; + vcpu_load(vcpu); vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); + kvm_x86_ops->cpuid_update(vcpu); + update_cpuid(vcpu); + vcpu_put(vcpu); return 0; out: @@ -1431,6 +2202,7 @@ static int kvm_vcpu_ioctl_get_cpuid2(str { int r; + vcpu_load(vcpu); r = -E2BIG; if (cpuid->nent < vcpu->arch.cpuid_nent) goto out; @@ -1442,9 +2214,18 @@ static int kvm_vcpu_ioctl_get_cpuid2(str out: cpuid->nent = vcpu->arch.cpuid_nent; + vcpu_put(vcpu); return r; } +static void cpuid_mask(u32 *word, int wordnum) +{ + if (wordnum < NCAPINTS) + *word &= boot_cpu_data.x86_capability[wordnum]; + else + *word &= boot_cpu_data_rh.x86_capability[wordnum - NCAPINTS]; +} + static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index) { @@ -1455,18 +2236,28 @@ static void do_cpuid_1_ent(struct kvm_cp entry->flags = 0; } +static bool supported_xcr0_bit(unsigned bit) +{ + u64 mask = ((u64)1 << bit); + + return mask & (XSTATE_FP | XSTATE_SSE | XSTATE_YMM) & host_xcr0; +} + #define F(x) bit(X86_FEATURE_##x) static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, u32 index, int *nent, int maxnent) { unsigned f_nx = is_efer_nx() ? F(NX) : 0; - unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0; #ifdef CONFIG_X86_64 + unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL) + ? F(GBPAGES) : 0; unsigned f_lm = F(LM); #else + unsigned f_gbpages = 0; unsigned f_lm = 0; #endif + unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; /* cpuid 1.edx */ const u32 kvm_supported_word0_x86_features = @@ -1490,19 +2281,26 @@ static void do_cpuid_ent(struct kvm_cpui 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); /* cpuid 1.ecx */ const u32 kvm_supported_word4_x86_features = - F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ | + F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | - 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | - 0 /* Reserved, DCA */ | F(XMM4_1) | + F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | + F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | - 0 /* Reserved, XSAVE, OSXSAVE */; + 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | + F(F16C) | F(RDRAND); /* cpuid 0x80000001.ecx */ const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | - 0 /* SKINIT */ | 0 /* WDT */; + F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); + + /* cpuid 7.0.ebx */ + const u32 kvm_supported_word9_x86_features = + F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) | + F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | F(RDSEED) | + F(ADX); /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -1511,11 +2309,13 @@ static void do_cpuid_ent(struct kvm_cpui switch (function) { case 0: - entry->eax = min(entry->eax, (u32)0xb); + entry->eax = min(entry->eax, (u32)0xd); break; case 1: entry->edx &= kvm_supported_word0_x86_features; + cpuid_mask(&entry->edx, 0); entry->ecx &= kvm_supported_word4_x86_features; + cpuid_mask(&entry->ecx, 4); /* we support x2apic emulation even if host does not support * it since we emulate x2apic in software */ entry->ecx |= F(X2APIC); @@ -1536,7 +2336,7 @@ static void do_cpuid_ent(struct kvm_cpui } break; } - /* function 4 and 0xb have additional index. */ + /* function 4 has additional index. */ case 4: { int i, cache_type; @@ -1553,6 +2353,51 @@ static void do_cpuid_ent(struct kvm_cpui } break; } + case 7: { + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + /* Mask ebx against host capbability word 9 */ + if (index == 0) { + entry->ebx &= kvm_supported_word9_x86_features; + cpuid_mask(&entry->ebx, 9); + } else + entry->ebx = 0; + entry->eax = 0; + entry->ecx = 0; + entry->edx = 0; + break; + } + case 9: + break; + case 0xa: { /* Architectural Performance Monitoring */ + struct x86_pmu_capability cap; + union cpuid10_eax eax; + union cpuid10_edx edx; + + perf_get_x86_pmu_capability(&cap); + + /* + * Only support guest architectural pmu on a host + * with architectural pmu. + */ + if (!cap.version) + memset(&cap, 0, sizeof(cap)); + + eax.split.version_id = min(cap.version, 2); + eax.split.num_counters = cap.num_counters_gp; + eax.split.bit_width = cap.bit_width_gp; + eax.split.mask_length = cap.events_mask_len; + + edx.split.num_counters_fixed = cap.num_counters_fixed; + edx.split.bit_width_fixed = cap.bit_width_fixed; + edx.split.reserved = 0; + + entry->eax = eax.full; + entry->ebx = cap.events_mask; + entry->ecx = 0; + entry->edx = edx.full; + break; + } + /* function 0xb has additional index. */ case 0xb: { int i, level_type; @@ -1569,12 +2414,83 @@ static void do_cpuid_ent(struct kvm_cpui } break; } + case 0xd: { + int idx, i; + + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + for (idx = 1, i = 1; *nent < maxnent && idx < 64; ++idx) { + do_cpuid_1_ent(&entry[i], function, idx); + if (entry[i].eax == 0 || !supported_xcr0_bit(idx)) + continue; + entry[i].flags |= + KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + ++*nent; + ++i; + } + break; + } + case KVM_CPUID_SIGNATURE: { + char signature[12] = "KVMKVMKVM\0\0"; + u32 *sigptr = (u32 *)signature; + entry->eax = 0; + entry->ebx = sigptr[0]; + entry->ecx = sigptr[1]; + entry->edx = sigptr[2]; + break; + } + case KVM_CPUID_FEATURES: + entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | + (1 << KVM_FEATURE_NOP_IO_DELAY) | + (1 << KVM_FEATURE_CLOCKSOURCE2) | + (1 << KVM_FEATURE_PV_EOI) | + (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); + + if (sched_info_on()) + entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); + + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + break; case 0x80000000: entry->eax = min(entry->eax, 0x8000001a); break; case 0x80000001: entry->edx &= kvm_supported_word1_x86_features; + cpuid_mask(&entry->edx, 1); entry->ecx &= kvm_supported_word6_x86_features; + cpuid_mask(&entry->ecx, 6); + break; + case 0x80000007: /* Advanced power management */ + /* invariant TSC is CPUID.80000007H:EDX[8] */ + entry->edx &= 1 << 8; + /* mask against host */ + entry->edx &= boot_cpu_data.x86_power; + entry->eax = entry->ebx = entry->ecx = 0; + break; + case 0x80000008: { + unsigned g_phys_as = (entry->eax >> 16) & 0xff; + unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U); + unsigned phys_as = entry->eax & 0xff; + + if (!g_phys_as) + g_phys_as = phys_as; + entry->eax = g_phys_as | (virt_as << 8); + entry->ebx = entry->edx = 0; + break; + } + case 0x80000019: + entry->ecx = entry->edx = 0; + break; + case 0x8000001a: + break; + case 0x8000001d: + break; + case 3: /* Processor serial number */ + case 5: /* MONITOR/MWAIT */ + case 6: /* Thermal management */ + default: + entry->eax = entry->ebx = entry->ecx = entry->edx = 0; break; } put_cpu(); @@ -1612,6 +2528,23 @@ static int kvm_dev_ioctl_get_supported_c for (func = 0x80000001; func <= limit && nent < cpuid->nent; ++func) do_cpuid_ent(&cpuid_entries[nent], func, 0, &nent, cpuid->nent); + + + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, + cpuid->nent); + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_FEATURES, 0, &nent, + cpuid->nent); + r = -E2BIG; if (nent >= cpuid->nent) goto out_free; @@ -1691,6 +2624,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce( int r; unsigned bank_num = mcg_cap & 0xff, bank; + vcpu_load(vcpu); r = -EINVAL; if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS) goto out; @@ -1705,6 +2639,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce( for (bank = 0; bank < bank_num; bank++) vcpu->arch.mce_banks[bank*4] = ~(u64)0; out: + vcpu_put(vcpu); return r; } @@ -1733,7 +2668,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(st return 0; if (mce->status & MCI_STATUS_UC) { if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || - !(vcpu->arch.cr4 & X86_CR4_MCE)) { + !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { printk(KERN_DEBUG "kvm: set_mce: " "injects mce exception while " "previous one is in progress!\n"); @@ -1759,6 +2694,156 @@ static int kvm_vcpu_ioctl_x86_set_mce(st return 0; } +static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + vcpu_load(vcpu); + + events->exception.injected = vcpu->arch.exception.pending; + events->exception.nr = vcpu->arch.exception.nr; + events->exception.has_error_code = vcpu->arch.exception.has_error_code; + events->exception.pad = 0; + events->exception.error_code = vcpu->arch.exception.error_code; + + events->interrupt.injected = vcpu->arch.interrupt.pending; + events->interrupt.nr = vcpu->arch.interrupt.nr; + events->interrupt.soft = vcpu->arch.interrupt.soft; + events->interrupt.pad = 0; + + events->nmi.injected = vcpu->arch.nmi_injected; + events->nmi.pending = vcpu->arch.nmi_pending; + events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); + events->nmi.pad = 0; + + events->sipi_vector = vcpu->arch.sipi_vector; + + events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING + | KVM_VCPUEVENT_VALID_SIPI_VECTOR); + + memset(&events->reserved, 0, sizeof(events->reserved)); + vcpu_put(vcpu); +} + +static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, + struct kvm_vcpu_events *events) +{ + if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING + | KVM_VCPUEVENT_VALID_SIPI_VECTOR)) + return -EINVAL; + + vcpu_load(vcpu); + + vcpu->arch.exception.pending = events->exception.injected; + vcpu->arch.exception.nr = events->exception.nr; + vcpu->arch.exception.has_error_code = events->exception.has_error_code; + vcpu->arch.exception.error_code = events->exception.error_code; + + vcpu->arch.interrupt.pending = events->interrupt.injected; + vcpu->arch.interrupt.nr = events->interrupt.nr; + vcpu->arch.interrupt.soft = events->interrupt.soft; + if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm)) + kvm_pic_clear_isr_ack(vcpu->kvm); + + vcpu->arch.nmi_injected = events->nmi.injected; + if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) + vcpu->arch.nmi_pending = events->nmi.pending; + kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked); + + if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) + vcpu->arch.sipi_vector = events->sipi_vector; + + vcpu_put(vcpu); + + return 0; +} + +static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) +{ + if (cpu_has_xsave) + memcpy(guest_xsave->region, + &vcpu->arch.guest_fpu.state->xsave, + xstate_size); + else { + memcpy(guest_xsave->region, + &vcpu->arch.guest_fpu.state->fxsave, + sizeof(struct i387_fxsave_struct)); + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] = + XSTATE_FPSSE; + } +} + +static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, + struct kvm_xsave *guest_xsave) +{ + u64 xstate_bv = + *(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)]; + + if (cpu_has_xsave) + memcpy(&vcpu->arch.guest_fpu.state->xsave, + guest_xsave->region, xstate_size); + else { + if (xstate_bv & ~XSTATE_FPSSE) + return -EINVAL; + memcpy(&vcpu->arch.guest_fpu.state->fxsave, + guest_xsave->region, sizeof(struct i387_fxsave_struct)); + } + return 0; +} + +static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, + struct kvm_xcrs *guest_xcrs) +{ + if (!cpu_has_xsave) { + guest_xcrs->nr_xcrs = 0; + return; + } + + guest_xcrs->nr_xcrs = 1; + guest_xcrs->flags = 0; + guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; + guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; +} + +static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, + struct kvm_xcrs *guest_xcrs) +{ + int i, r = 0; + + if (!cpu_has_xsave) + return -EINVAL; + + if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) + return -EINVAL; + + for (i = 0; i < guest_xcrs->nr_xcrs; i++) + /* Only support XCR0 currently */ + if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) { + r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK, + guest_xcrs->xcrs[0].value); + break; + } + if (r) + r = -EINVAL; + return r; +} + +/* + * kvm_set_guest_paused() indicates to the guest kernel that it has been + * stopped by the hypervisor. This function will be called from the host only. + * EINVAL is returned when the host attempts to set the flag for a guest that + * does not support pv clocks. + */ +static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.pv_time_enabled) + return -EINVAL; + vcpu->arch.pvclock_set_guest_stopped_request = true; + mark_page_dirty(vcpu->kvm, vcpu->arch.time >> PAGE_SHIFT); + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + return 0; +} + long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { @@ -1766,9 +2851,14 @@ long kvm_arch_vcpu_ioctl(struct file *fi void __user *argp = (void __user *)arg; int r; struct kvm_lapic_state *lapic = NULL; + struct kvm_xsave *xsave = NULL; + struct kvm_xcrs *xcrs = NULL; switch (ioctl) { case KVM_GET_LAPIC: { + r = -EINVAL; + if (!vcpu->arch.apic) + goto out; lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; @@ -1784,6 +2874,9 @@ long kvm_arch_vcpu_ioctl(struct file *fi break; } case KVM_SET_LAPIC: { + r = -EINVAL; + if (!vcpu->arch.apic) + goto out; lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); r = -ENOMEM; if (!lapic) @@ -1882,39 +2975,149 @@ long kvm_arch_vcpu_ioctl(struct file *fi case KVM_SET_VAPIC_ADDR: { struct kvm_vapic_addr va; - r = -EINVAL; - if (!irqchip_in_kernel(vcpu->kvm)) - goto out; + r = -EINVAL; + if (!irqchip_in_kernel(vcpu->kvm)) + goto out; + r = -EFAULT; + if (copy_from_user(&va, argp, sizeof va)) + goto out; + r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); + break; + } + case KVM_X86_SETUP_MCE: { + u64 mcg_cap; + + r = -EFAULT; + if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) + goto out; + r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); + break; + } + case KVM_X86_SET_MCE: { + struct kvm_x86_mce mce; + + r = -EFAULT; + if (copy_from_user(&mce, argp, sizeof mce)) + goto out; + vcpu_load(vcpu); + r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); + vcpu_put(vcpu); + break; + } + case KVM_GET_VCPU_EVENTS: { + struct kvm_vcpu_events events; + + kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events); + + r = -EFAULT; + if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events))) + break; + r = 0; + break; + } + case KVM_SET_VCPU_EVENTS: { + struct kvm_vcpu_events events; + + r = -EFAULT; + if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events))) + break; + + r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events); + break; + } + case KVM_GET_XSAVE: { + xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + r = -ENOMEM; + if (!xsave) + break; + + kvm_vcpu_ioctl_x86_get_xsave(vcpu, xsave); + + r = -EFAULT; + if (copy_to_user(argp, xsave, sizeof(struct kvm_xsave))) + break; + r = 0; + break; + } + case KVM_SET_XSAVE: { + xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL); + r = -ENOMEM; + if (!xsave) + break; + + r = -EFAULT; + if (copy_from_user(xsave, argp, sizeof(struct kvm_xsave))) + break; + + r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, xsave); + break; + } + case KVM_GET_XCRS: { + xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + r = -ENOMEM; + if (!xcrs) + break; + + kvm_vcpu_ioctl_x86_get_xcrs(vcpu, xcrs); + r = -EFAULT; - if (copy_from_user(&va, argp, sizeof va)) - goto out; + if (copy_to_user(argp, xcrs, + sizeof(struct kvm_xcrs))) + break; r = 0; - kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr); break; } - case KVM_X86_SETUP_MCE: { - u64 mcg_cap; + case KVM_SET_XCRS: { + xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL); + r = -ENOMEM; + if (!xcrs) + break; r = -EFAULT; - if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) - goto out; - r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); + if (copy_from_user(xcrs, argp, + sizeof(struct kvm_xcrs))) + break; + + r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, xcrs); break; } - case KVM_X86_SET_MCE: { - struct kvm_x86_mce mce; + case KVM_SET_TSC_KHZ: { + u32 user_tsc_khz; - r = -EFAULT; - if (copy_from_user(&mce, argp, sizeof mce)) + r = -EINVAL; + if (!kvm_has_tsc_control) + break; + + user_tsc_khz = (u32)arg; + + if (user_tsc_khz >= kvm_max_guest_tsc_khz) goto out; - r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); - break; + + kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz); + + r = 0; + goto out; + } + case KVM_GET_TSC_KHZ: { + r = -EIO; + if (check_tsc_unstable()) + goto out; + + r = vcpu_tsc_khz(vcpu); + + goto out; + } + case KVM_KVMCLOCK_CTRL: { + r = kvm_set_guest_paused(vcpu); + goto out; } default: r = -EINVAL; } out: kfree(lapic); + kfree(xsave); + kfree(xcrs); return r; } @@ -1941,29 +3144,51 @@ static int kvm_vm_ioctl_set_nr_mmu_pages if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); spin_lock(&kvm->mmu_lock); kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; spin_unlock(&kvm->mmu_lock); - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return 0; } static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) { - return kvm->arch.n_alloc_mmu_pages; + return kvm->arch.n_max_mmu_pages; +} + +gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn) +{ + int i; + struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; + + aliases = rcu_dereference(kvm->arch.aliases); + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; + if (alias->flags & KVM_ALIAS_INVALID) + continue; + if (gfn >= alias->base_gfn + && gfn < alias->base_gfn + alias->npages) + return alias->target_gfn + gfn - alias->base_gfn; + } + return gfn; } gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) { int i; struct kvm_mem_alias *alias; + struct kvm_mem_aliases *aliases; - for (i = 0; i < kvm->arch.naliases; ++i) { - alias = &kvm->arch.aliases[i]; + aliases = rcu_dereference(kvm->arch.aliases); + + for (i = 0; i < aliases->naliases; ++i) { + alias = &aliases->aliases[i]; if (gfn >= alias->base_gfn && gfn < alias->base_gfn + alias->npages) return alias->target_gfn + gfn - alias->base_gfn; @@ -1981,6 +3206,7 @@ static int kvm_vm_ioctl_set_memory_alias { int r, n; struct kvm_mem_alias *p; + struct kvm_mem_aliases *aliases, *old_aliases; r = -EINVAL; /* General sanity checks */ @@ -1997,26 +3223,48 @@ static int kvm_vm_ioctl_set_memory_alias < alias->target_phys_addr) goto out; - down_write(&kvm->slots_lock); - spin_lock(&kvm->mmu_lock); + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out; + + mutex_lock(&kvm->slots_lock); + + /* invalidate any gfn reference in case of deletion/shrinking */ + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); + aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID; + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + synchronize_srcu_expedited(&kvm->srcu); + kvm_mmu_zap_all(kvm); + kfree(old_aliases); + + r = -ENOMEM; + aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!aliases) + goto out_unlock; + + memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases)); - p = &kvm->arch.aliases[alias->slot]; + p = &aliases->aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; p->npages = alias->memory_size >> PAGE_SHIFT; p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; + p->flags &= ~(KVM_ALIAS_INVALID); for (n = KVM_ALIAS_SLOTS; n > 0; --n) - if (kvm->arch.aliases[n - 1].npages) + if (aliases->aliases[n - 1].npages) break; - kvm->arch.naliases = n; + aliases->naliases = n; - spin_unlock(&kvm->mmu_lock); - kvm_mmu_zap_all(kvm); - - up_write(&kvm->slots_lock); - - return 0; + old_aliases = kvm->arch.aliases; + rcu_assign_pointer(kvm->arch.aliases, aliases); + synchronize_srcu_expedited(&kvm->srcu); + kfree(old_aliases); + r = 0; +out_unlock: + mutex_unlock(&kvm->slots_lock); out: return r; } @@ -2038,9 +3286,7 @@ static int kvm_vm_ioctl_get_irqchip(stru sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_IOAPIC: - memcpy(&chip->chip.ioapic, - ioapic_irqchip(kvm), - sizeof(struct kvm_ioapic_state)); + r = kvm_get_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; @@ -2070,11 +3316,7 @@ static int kvm_vm_ioctl_set_irqchip(stru spin_unlock(&pic_irqchip(kvm)->lock); break; case KVM_IRQCHIP_IOAPIC: - mutex_lock(&kvm->irq_lock); - memcpy(ioapic_irqchip(kvm), - &chip->chip.ioapic, - sizeof(struct kvm_ioapic_state)); - mutex_unlock(&kvm->irq_lock); + r = kvm_set_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; @@ -2114,6 +3356,7 @@ static int kvm_vm_ioctl_get_pit2(struct sizeof(ps->channels)); ps->flags = kvm->arch.vpit->pit_state.flags; mutex_unlock(&kvm->arch.vpit->pit_state.lock); + memset(&ps->reserved, 0, sizeof(ps->reserved)); return r; } @@ -2151,29 +3394,64 @@ static int kvm_vm_ioctl_reinject(struct int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) { - int r; - int n; + int r, i; struct kvm_memory_slot *memslot; - int is_dirty = 0; + unsigned long n; + unsigned long is_dirty = 0; + unsigned long *dirty_bitmap = NULL; - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); - r = kvm_get_dirty_log(kvm, log, &is_dirty); - if (r) + r = -EINVAL; + if (log->slot >= KVM_MEMORY_SLOTS) + goto out; + + memslot = &kvm->memslots->memslots[log->slot]; + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + + r = -ENOMEM; + dirty_bitmap = vmalloc(n); + if (!dirty_bitmap) goto out; + memset(dirty_bitmap, 0, n); + + for (i = 0; !is_dirty && i < n/sizeof(long); i++) + is_dirty = memslot->dirty_bitmap[i]; /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { + struct kvm_memslots *slots, *old_slots; + + slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); + if (!slots) + goto out_free; + + memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); + slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; + slots->generation++; + + old_slots = kvm->memslots; + rcu_assign_pointer(kvm->memslots, slots); + synchronize_srcu_expedited(&kvm->srcu); + dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; + kfree(old_slots); + spin_lock(&kvm->mmu_lock); - kvm_mmu_slot_remove_write_access(kvm, log->slot); + kvm_mmu_slot_remove_write_access(kvm, log->slot, false); spin_unlock(&kvm->mmu_lock); - memslot = &kvm->memslots[log->slot]; - n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; - memset(memslot->dirty_bitmap, 0, n); } + r = 0; + if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) + r = -EFAULT; +out_free: + vfree(dirty_bitmap); out: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); return r; } @@ -2244,25 +3522,42 @@ long kvm_arch_vm_ioctl(struct file *filp if (r) goto out; break; - case KVM_CREATE_IRQCHIP: + case KVM_CREATE_IRQCHIP: { + struct kvm_pic *vpic; + + mutex_lock(&kvm->lock); + r = -EEXIST; + if (kvm->arch.vpic) + goto create_irqchip_unlock; + r = -EINVAL; + if (atomic_read(&kvm->online_vcpus)) + goto create_irqchip_unlock; r = -ENOMEM; - kvm->arch.vpic = kvm_create_pic(kvm); - if (kvm->arch.vpic) { + vpic = kvm_create_pic(kvm); + if (vpic) { r = kvm_ioapic_init(kvm); if (r) { - kfree(kvm->arch.vpic); - kvm->arch.vpic = NULL; - goto out; + kfree(vpic); + goto create_irqchip_unlock; } } else - goto out; + goto create_irqchip_unlock; + smp_wmb(); + kvm->arch.vpic = vpic; + smp_wmb(); r = kvm_setup_default_irq_routing(kvm); if (r) { + mutex_lock(&kvm->irq_lock); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); - goto out; + kvm->arch.vpic = NULL; + kvm->arch.vioapic = NULL; + mutex_unlock(&kvm->irq_lock); } + create_irqchip_unlock: + mutex_unlock(&kvm->lock); break; + } case KVM_CREATE_PIT: u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; goto create_pit; @@ -2272,7 +3567,7 @@ long kvm_arch_vm_ioctl(struct file *filp sizeof(struct kvm_pit_config))) goto out; create_pit: - down_write(&kvm->slots_lock); + mutex_lock(&kvm->slots_lock); r = -EEXIST; if (kvm->arch.vpit) goto create_pit_unlock; @@ -2281,7 +3576,7 @@ long kvm_arch_vm_ioctl(struct file *filp if (kvm->arch.vpit) r = 0; create_pit_unlock: - up_write(&kvm->slots_lock); + mutex_unlock(&kvm->slots_lock); break; case KVM_IRQ_LINE_STATUS: case KVM_IRQ_LINE: { @@ -2292,10 +3587,8 @@ long kvm_arch_vm_ioctl(struct file *filp goto out; if (irqchip_in_kernel(kvm)) { __s32 status; - mutex_lock(&kvm->irq_lock); status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, irq_event.irq, irq_event.level); - mutex_unlock(&kvm->irq_lock); if (ioctl == KVM_IRQ_LINE_STATUS) { irq_event.status = status; if (copy_to_user(argp, &irq_event, @@ -2421,6 +3714,46 @@ long kvm_arch_vm_ioctl(struct file *filp r = 0; break; } + case KVM_SET_CLOCK: { + struct kvm_clock_data user_ns; + u64 now_ns; + s64 delta; + + r = -EFAULT; + if (copy_from_user(&user_ns, argp, sizeof(user_ns))) + goto out; + + r = -EINVAL; + if (user_ns.flags) + goto out; + + r = 0; + local_irq_disable(); + now_ns = get_kernel_ns(); + delta = user_ns.clock - now_ns; + local_irq_enable(); + kvm->arch.kvmclock_offset = delta; + break; + } + case KVM_GET_CLOCK: { + struct kvm_clock_data user_ns; + u64 now_ns; + + local_irq_disable(); + now_ns = get_kernel_ns(); + user_ns.clock = kvm->arch.kvmclock_offset + now_ns; + local_irq_enable(); + user_ns.flags = 0; + memset(&user_ns.pad, 0, sizeof(user_ns.pad)); + + r = -EFAULT; + if (copy_to_user(argp, &user_ns, sizeof(user_ns))) + goto out; + + r = 0; + break; + } + default: ; } @@ -2433,7 +3766,8 @@ static void kvm_init_msr_list(void) u32 dummy[2]; unsigned i, j; - for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { + /* skip the first msrs in the list. KVM-specific */ + for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) { if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) continue; if (j < i) @@ -2450,7 +3784,7 @@ static int vcpu_mmio_write(struct kvm_vc !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) return 0; - return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); + return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) @@ -2459,17 +3793,44 @@ static int vcpu_mmio_read(struct kvm_vcp !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) return 0; - return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); + return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); } -static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) +gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + + gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_FETCH_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + access |= PFERR_WRITE_MASK; + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); +} + +/* uses this to access any guest's mapped memory without checking CPL */ +gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) +{ + return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); +} + +static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 access, + u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -2492,14 +3853,41 @@ out: return r; } -static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu) +/* used for instruction fetching */ +static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, + access | PFERR_FETCH_MASK, error); +} + +static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, + error); +} + +static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); +} + +static int kvm_write_guest_virt_helper(gva_t addr, void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu, u32 access, + u32 *error) { void *data = val; int r = X86EMUL_CONTINUE; + access |= PFERR_WRITE_MASK; + while (bytes) { - gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); unsigned offset = addr & (PAGE_SIZE-1); unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); int ret; @@ -2522,6 +3910,19 @@ out: return r; } +static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, access, error); +} + +static int kvm_write_guest_virt_system(gva_t addr, void *val, + unsigned int bytes, + struct kvm_vcpu *vcpu, u32 *error) +{ + return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, 0, error); +} static int emulator_read_emulated(unsigned long addr, void *val, @@ -2529,6 +3930,7 @@ static int emulator_read_emulated(unsign struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -2538,17 +3940,20 @@ static int emulator_read_emulated(unsign return X86EMUL_CONTINUE; } - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code); + + if (gpa == UNMAPPED_GVA) { + kvm_inject_page_fault(vcpu, addr, error_code); + return X86EMUL_PROPAGATE_FAULT; + } /* For APIC access vmexit */ if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (kvm_read_guest_virt(addr, val, bytes, vcpu) + if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; - if (gpa == UNMAPPED_GVA) - return X86EMUL_PROPAGATE_FAULT; mmio: /* @@ -2587,11 +3992,12 @@ static int emulator_write_emulated_onepa struct kvm_vcpu *vcpu) { gpa_t gpa; + u32 error_code; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code); if (gpa == UNMAPPED_GVA) { - kvm_inject_page_fault(vcpu, addr, 2); + kvm_inject_page_fault(vcpu, addr, error_code); return X86EMUL_PROPAGATE_FAULT; } @@ -2655,7 +4061,7 @@ static int emulator_cmpxchg_emulated(uns char *kaddr; u64 val; - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); + gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); if (gpa == UNMAPPED_GVA || (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) @@ -2692,7 +4098,7 @@ int emulate_invlpg(struct kvm_vcpu *vcpu int emulate_clts(struct kvm_vcpu *vcpu) { - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); return X86EMUL_CONTINUE; } @@ -2700,6 +4106,9 @@ int emulator_get_dr(struct x86_emulate_c { struct kvm_vcpu *vcpu = ctxt->vcpu; + if (!kvm_x86_ops->get_dr) + return X86EMUL_UNHANDLEABLE; + switch (dr) { case 0 ... 3: *dest = kvm_x86_ops->get_dr(vcpu, dr); @@ -2715,6 +4124,9 @@ int emulator_set_dr(struct x86_emulate_c unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; int exception; + if (!kvm_x86_ops->set_dr) + return X86EMUL_UNHANDLEABLE; + kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); if (exception) { /* FIXME: better handling */ @@ -2734,18 +4146,41 @@ void kvm_report_emulation_failure(struct rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); + kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL); printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); } EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); +static bool emulator_get_cpuid(struct kvm_vcpu *vcpu, + u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) +{ + struct kvm_cpuid_entry2 *cpuid = NULL; + + if (eax && ecx) + cpuid = kvm_find_cpuid_entry(vcpu, *eax, *ecx); + + if (cpuid) { + *eax = cpuid->eax; + *ecx = cpuid->ecx; + if (ebx) + *ebx = cpuid->ebx; + if (edx) + *edx = cpuid->edx; + return true; + } + + return false; +} + static struct x86_emulate_ops emulate_ops = { - .read_std = kvm_read_guest_virt, + .read_std = kvm_read_guest_virt_system, + .fetch = kvm_fetch_guest_virt, .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, + .get_cpuid = emulator_get_cpuid, }; static void cache_all_regs(struct kvm_vcpu *vcpu) @@ -2756,14 +4191,40 @@ static void cache_all_regs(struct kvm_vc vcpu->arch.regs_dirty = ~0; } -int emulate_instruction(struct kvm_vcpu *vcpu, - struct kvm_run *run, +static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) +{ + gpa_t gpa; + + if (tdp_enabled) + return false; + /* + * if emulation was due to access to shadowed page table + * and it failed try to unshadow page and re-entetr the + * guest to let CPU execute the instruction. + */ + if (kvm_mmu_unprotect_page_virt(vcpu, gva)) + return true; + + gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL); + + if (gpa == UNMAPPED_GVA) + return true; /* let cpu generate fault */ + + if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT))) + return true; + + return false; +} + +int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, - u16 error_code, - int emulation_type) + int emulation_type, + void *insn, + int insn_len) { int r, shadow_mask; struct decode_cache *c; + struct kvm_run *run = vcpu->run; kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; @@ -2785,12 +4246,14 @@ int emulate_instruction(struct kvm_vcpu vcpu->arch.emulate_ctxt.vcpu = vcpu; vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); vcpu->arch.emulate_ctxt.mode = + (!(vcpu->arch.cr0 & X86_CR0_PE)) ? X86EMUL_MODE_REAL : (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) - ? X86EMUL_MODE_REAL : cs_l + ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); + r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops, + insn, insn_len); /* Only allow emulation of specific instructions on #UD * (namely VMMCALL, sysenter, sysexit, syscall)*/ @@ -2823,7 +4286,7 @@ int emulate_instruction(struct kvm_vcpu ++vcpu->stat.insn_emulation; if (r) { ++vcpu->stat.insn_emulation_fail; - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; return EMULATE_FAIL; } @@ -2852,7 +4315,7 @@ int emulate_instruction(struct kvm_vcpu } if (r) { - if (kvm_mmu_unprotect_page_virt(vcpu, cr2)) + if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; if (!vcpu->mmio_needed) { kvm_report_emulation_failure(vcpu, "mmio"); @@ -2870,7 +4333,7 @@ int emulate_instruction(struct kvm_vcpu return EMULATE_DONE; } -EXPORT_SYMBOL_GPL(emulate_instruction); +EXPORT_SYMBOL_GPL(x86_emulate_instruction); static int pio_copy_data(struct kvm_vcpu *vcpu) { @@ -2878,12 +4341,17 @@ static int pio_copy_data(struct kvm_vcpu gva_t q = vcpu->arch.pio.guest_gva; unsigned bytes; int ret; + u32 error_code; bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; if (vcpu->arch.pio.in) - ret = kvm_write_guest_virt(q, p, bytes, vcpu); + ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code); else - ret = kvm_read_guest_virt(q, p, bytes, vcpu); + ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code); + + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, q, error_code); + return ret; } @@ -2904,7 +4372,7 @@ int complete_pio(struct kvm_vcpu *vcpu) if (io->in) { r = pio_copy_data(vcpu); if (r) - return r; + goto out; } delta = 1; @@ -2931,7 +4399,7 @@ int complete_pio(struct kvm_vcpu *vcpu) kvm_register_write(vcpu, VCPU_REGS_RSI, val); } } - +out: io->count -= io->cur_count; io->cur_count = 0; @@ -2944,11 +4412,12 @@ static int kernel_pio(struct kvm_vcpu *v int r; if (vcpu->arch.pio.in) - r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, + r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port, vcpu->arch.pio.size, pd); else - r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, - vcpu->arch.pio.size, pd); + r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, + vcpu->arch.pio.port, vcpu->arch.pio.size, + pd); return r; } @@ -2959,7 +4428,7 @@ static int pio_string_write(struct kvm_v int i, r = 0; for (i = 0; i < io->cur_count; i++) { - if (kvm_io_bus_write(&vcpu->kvm->pio_bus, + if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS, io->port, io->size, pd)) { r = -EOPNOTSUPP; break; @@ -2969,11 +4438,12 @@ static int pio_string_write(struct kvm_v return r; } -int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, - int size, unsigned port) +int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port) { unsigned long val; + trace_kvm_pio(!in, port, size, 1); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -2985,9 +4455,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcp vcpu->arch.pio.down = 0; vcpu->arch.pio.rep = 0; - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, 1); - val = kvm_register_read(vcpu, VCPU_REGS_RAX); memcpy(vcpu->arch.pio_data, &val, 4); @@ -2999,13 +4466,15 @@ int kvm_emulate_pio(struct kvm_vcpu *vcp } EXPORT_SYMBOL_GPL(kvm_emulate_pio); -int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, +int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in, int size, unsigned long count, int down, gva_t address, int rep, unsigned port) { unsigned now, in_page; int ret = 0; + trace_kvm_pio(!in, port, size, count); + vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; vcpu->run->io.size = vcpu->arch.pio.size = size; @@ -3017,9 +4486,6 @@ int kvm_emulate_pio_string(struct kvm_vc vcpu->arch.pio.down = down; vcpu->arch.pio.rep = rep; - trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, - size, count); - if (!count) { kvm_x86_ops->skip_emulated_instruction(vcpu); return 1; @@ -3051,10 +4517,8 @@ int kvm_emulate_pio_string(struct kvm_vc if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); - if (ret == X86EMUL_PROPAGATE_FAULT) { - kvm_inject_gp(vcpu, 0); + if (ret == X86EMUL_PROPAGATE_FAULT) return 1; - } if (ret == 0 && !pio_string_write(vcpu)) { complete_pio(vcpu); if (vcpu->arch.pio.count == 0) @@ -3067,13 +4531,24 @@ int kvm_emulate_pio_string(struct kvm_vc } EXPORT_SYMBOL_GPL(kvm_emulate_pio_string); -static void bounce_off(void *info) +static void tsc_bad(void *info) { - /* nothing */ + __get_cpu_var(cpu_tsc_khz) = 0; } -static unsigned int ref_freq; -static unsigned long tsc_khz_ref; +static void tsc_khz_changed(void *data) +{ + struct cpufreq_freqs *freq = data; + unsigned long khz = 0; + + if (data) + khz = freq->new; + else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + khz = cpufreq_quick_get(raw_smp_processor_id()); + if (!khz) + khz = tsc_khz; + __get_cpu_var(cpu_tsc_khz) = khz; +} static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) @@ -3083,24 +4558,60 @@ static int kvmclock_cpufreq_notifier(str struct kvm_vcpu *vcpu; int i, send_ipi = 0; - if (!ref_freq) - ref_freq = freq->old; + /* + * We allow guests to temporarily run on slowing clocks, + * provided we notify them after, or to run on accelerating + * clocks, provided we notify them before. Thus time never + * goes backwards. + * + * However, we have a problem. We can't atomically update + * the frequency of a given CPU from this function; it is + * merely a notifier, which can be called from any CPU. + * Changing the TSC frequency at arbitrary points in time + * requires a recomputation of local variables related to + * the TSC for each VCPU. We must flag these local variables + * to be updated and be sure the update takes place with the + * new frequency before any guests proceed. + * + * Unfortunately, the combination of hotplug CPU and frequency + * change creates an intractable locking scenario; the order + * of when these callouts happen is undefined with respect to + * CPU hotplug, and they can race with each other. As such, + * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is + * undefined; you can actually have a CPU frequency change take + * place in between the computation of X and the setting of the + * variable. To protect against this problem, all updates of + * the per_cpu tsc_khz variable are done in an interrupt + * protected IPI, and all callers wishing to update the value + * must wait for a synchronous IPI to complete (which is trivial + * if the caller is on the CPU already). This establishes the + * necessary total order on variable updates. + * + * Note that because a guest time update may take place + * anytime after the setting of the VCPU's request bit, the + * correct TSC value must be set before the request. However, + * to ensure the update actually makes it to any guest which + * starts running in hardware virtualization between the set + * and the acquisition of the spinlock, we must also ping the + * CPU after setting the request bit. + * + */ if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) return 0; if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) return 0; - per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); + + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); spin_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (vcpu->cpu != freq->cpu) continue; - if (!kvm_request_guest_time_update(vcpu)) - continue; + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); if (vcpu->cpu != smp_processor_id()) - send_ipi++; + send_ipi = 1; } } spin_unlock(&kvm_lock); @@ -3118,18 +4629,111 @@ static int kvmclock_cpufreq_notifier(str * guest context is entered kvmclock will be updated, * so the guest will not see stale values. */ - smp_call_function_single(freq->cpu, bounce_off, NULL, 1); + smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1); } return 0; } static struct notifier_block kvmclock_cpufreq_notifier_block = { - .notifier_call = kvmclock_cpufreq_notifier + .notifier_call = kvmclock_cpufreq_notifier +}; + +static int kvmclock_cpu_notifier(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); + break; + case CPU_DOWN_PREPARE: + smp_call_function_single(cpu, tsc_bad, NULL, 1); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block kvmclock_cpu_notifier_block = { + .notifier_call = kvmclock_cpu_notifier, + .priority = -INT_MAX +}; + +static void kvm_timer_init(void) +{ + int cpu; + + max_tsc_khz = tsc_khz; +#ifdef CONFIG_HOTPLUG_CPU + register_hotcpu_notifier(&kvmclock_cpu_notifier_block); +#endif + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { +#ifdef CONFIG_CPU_FREQ + struct cpufreq_policy policy; + memset(&policy, 0, sizeof(policy)); + cpu = get_cpu(); + cpufreq_get_policy(&policy, cpu); + if (policy.cpuinfo.max_freq) + max_tsc_khz = policy.cpuinfo.max_freq; + put_cpu(); +#endif + cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + } + pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); + for_each_online_cpu(cpu) + smp_call_function_single(cpu, tsc_khz_changed, NULL, 1); +} + +static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); + +int kvm_is_in_guest(void) +{ + return percpu_read(current_vcpu) != NULL; +} + +static int kvm_is_user_mode(void) +{ + int user_mode = 3; + + if (percpu_read(current_vcpu)) + user_mode = kvm_x86_ops->get_cpl(percpu_read(current_vcpu)); + + return user_mode != 0; +} + +static unsigned long kvm_get_guest_ip(void) +{ + unsigned long ip = 0; + + if (percpu_read(current_vcpu)) + ip = kvm_rip_read(percpu_read(current_vcpu)); + + return ip; +} + +static struct perf_guest_info_callbacks kvm_guest_cbs = { + .is_in_guest = kvm_is_in_guest, + .is_user_mode = kvm_is_user_mode, + .get_guest_ip = kvm_get_guest_ip, }; +void kvm_before_handle_nmi(struct kvm_vcpu *vcpu) +{ + percpu_write(current_vcpu, vcpu); +} +EXPORT_SYMBOL_GPL(kvm_before_handle_nmi); + +void kvm_after_handle_nmi(struct kvm_vcpu *vcpu) +{ + percpu_write(current_vcpu, NULL); +} +EXPORT_SYMBOL_GPL(kvm_after_handle_nmi); + int kvm_arch_init(void *opaque) { - int r, cpu; + int r; struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; if (kvm_x86_ops) { @@ -3161,13 +4765,12 @@ int kvm_arch_init(void *opaque) kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, PT_DIRTY_MASK, PT64_NX_MASK, 0); - for_each_possible_cpu(cpu) - per_cpu(cpu_tsc_khz, cpu) = tsc_khz; - if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { - tsc_khz_ref = tsc_khz; - cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - } + kvm_timer_init(); + + perf_register_guest_info_callbacks(&kvm_guest_cbs); + + if (cpu_has_xsave) + host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); return 0; @@ -3177,9 +4780,14 @@ out: void kvm_arch_exit(void) { + perf_unregister_guest_info_callbacks(&kvm_guest_cbs); + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); +#ifdef CONFIG_HOTPLUG_CPU + unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block); +#endif kvm_x86_ops = NULL; kvm_mmu_module_exit(); } @@ -3302,10 +4910,9 @@ unsigned long realmode_get_cr(struct kvm { unsigned long value; - kvm_x86_ops->decache_cr4_guest_bits(vcpu); switch (cr) { case 0: - value = vcpu->arch.cr0; + value = kvm_read_cr0(vcpu); break; case 2: value = vcpu->arch.cr2; @@ -3314,7 +4921,7 @@ unsigned long realmode_get_cr(struct kvm value = vcpu->arch.cr3; break; case 4: - value = vcpu->arch.cr4; + value = kvm_read_cr4(vcpu); break; case 8: value = kvm_get_cr8(vcpu); @@ -3327,22 +4934,24 @@ unsigned long realmode_get_cr(struct kvm return value; } -void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, +int realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, unsigned long *rflags) { + int res = 0; + switch (cr) { case 0: - kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); + res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val)); *rflags = kvm_x86_ops->get_rflags(vcpu); break; case 2: vcpu->arch.cr2 = val; break; case 3: - kvm_set_cr3(vcpu, val); + res = kvm_set_cr3(vcpu, val); break; case 4: - kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); + res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); break; case 8: kvm_set_cr8(vcpu, val & 0xfUL); @@ -3350,6 +4959,7 @@ void realmode_set_cr(struct kvm_vcpu *vc default: vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); } + return res; } static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) @@ -3400,15 +5010,10 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_ best = e; break; } - /* - * Both basic or both extended? - */ - if (((e->function ^ function) & 0x80000000) == 0) - if (!best || e->function > best->function) - best = e; } return best; } +EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) { @@ -3420,6 +5025,27 @@ int cpuid_maxphyaddr(struct kvm_vcpu *vc return 36; } +/* + * If no match is found, check whether we exceed the vCPU's limit + * and return the content of the highest valid _standard_ leaf instead. + * This is to satisfy the CPUID specification. + */ +static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu, + u32 function, u32 index) +{ + struct kvm_cpuid_entry2 *maxlevel; + + maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0); + if (!maxlevel || maxlevel->eax >= function) + return NULL; + if (function & 0x80000000) { + maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0); + if (!maxlevel) + return NULL; + } + return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index); +} + void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { u32 function, index; @@ -3432,6 +5058,10 @@ void kvm_emulate_cpuid(struct kvm_vcpu * kvm_register_write(vcpu, VCPU_REGS_RCX, 0); kvm_register_write(vcpu, VCPU_REGS_RDX, 0); best = kvm_find_cpuid_entry(vcpu, function, index); + + if (!best) + best = check_cpuid_limit(vcpu, function, index); + if (best) { kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); @@ -3453,17 +5083,17 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); * * No need to exit to userspace if we already have an interrupt queued. */ -static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) { return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && - kvm_run->request_interrupt_window && + vcpu->run->request_interrupt_window && kvm_arch_interrupt_allowed(vcpu)); } -static void post_kvm_run_save(struct kvm_vcpu *vcpu, - struct kvm_run *kvm_run) +static void post_kvm_run_save(struct kvm_vcpu *vcpu) { + struct kvm_run *kvm_run = vcpu->run; + kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; kvm_run->cr8 = kvm_get_cr8(vcpu); kvm_run->apic_base = kvm_get_apic_base(vcpu); @@ -3476,32 +5106,6 @@ static void post_kvm_run_save(struct kvm !kvm_event_needs_reinjection(vcpu); } -static void vapic_enter(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - struct page *page; - - if (!apic || !apic->vapic_addr) - return; - - page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - - vcpu->arch.apic->vapic_page = page; -} - -static void vapic_exit(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - if (!apic || !apic->vapic_addr) - return; - - down_read(&vcpu->kvm->slots_lock); - kvm_release_page_dirty(apic->vapic_page); - mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - up_read(&vcpu->kvm->slots_lock); -} - static void update_cr8_intercept(struct kvm_vcpu *vcpu) { int max_irr, tpr; @@ -3525,10 +5129,13 @@ static void update_cr8_intercept(struct kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); } -static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static void inject_pending_event(struct kvm_vcpu *vcpu) { /* try to reinject previous events if any */ if (vcpu->arch.exception.pending) { + trace_kvm_inj_exception(vcpu->arch.exception.nr, + vcpu->arch.exception.has_error_code, + vcpu->arch.exception.error_code); kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code); @@ -3561,11 +5168,12 @@ static void inject_pending_event(struct } } -static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; + bool nmi_pending; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && - kvm_run->request_interrupt_window; + vcpu->run->request_interrupt_window; if (vcpu->requests) if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) @@ -3578,29 +5186,48 @@ static int vcpu_enter_guest(struct kvm_v if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) __kvm_migrate_timers(vcpu); - if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests)) - kvm_write_guest_time(vcpu); + if (test_and_clear_bit(KVM_REQ_GLOBAL_CLOCK_UPDATE, + &vcpu->requests)) + kvm_gen_kvmclock_update(vcpu); + if (test_and_clear_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests)) { + r = kvm_guest_time_update(vcpu); + if (unlikely(r)) + goto out; + } if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) kvm_mmu_sync_roots(vcpu); if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) kvm_x86_ops->tlb_flush(vcpu); if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests)) { - kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; + vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; goto out; } if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { - kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; + vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; r = 0; goto out; } + if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) { + vcpu->fpu_active = 0; + kvm_x86_ops->fpu_deactivate(vcpu); + } + if (test_and_clear_bit(KVM_REQ_PMU, &vcpu->requests)) + kvm_handle_pmu_event(vcpu); + if (test_and_clear_bit(KVM_REQ_PMI, &vcpu->requests)) + kvm_deliver_pmi(vcpu); + if (test_and_clear_bit(KVM_REQ_STEAL_UPDATE, &vcpu->requests)) + record_steal_time(vcpu); } preempt_disable(); kvm_x86_ops->prepare_guest_switch(vcpu); kvm_load_guest_fpu(vcpu); + kvm_load_guest_xcr0(vcpu); + if (kvm_lapic_enabled(vcpu)) + kvm_lapic_prefault_vapic(vcpu); local_irq_disable(); @@ -3615,20 +5242,32 @@ static int vcpu_enter_guest(struct kvm_v goto out; } - inject_pending_event(vcpu, kvm_run); + /* + * An NMI can be injected between local nmi_pending read and + * vcpu->arch.nmi_pending read inside inject_pending_event(). + * But in that case, a kick IPI will be sent, which makes + * the race described above benign. + */ + nmi_pending = vcpu->arch.nmi_pending; + smp_mb(); + + inject_pending_event(vcpu); /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) + if (nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) kvm_x86_ops->enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { update_cr8_intercept(vcpu); - kvm_lapic_sync_to_vapic(vcpu); + r = kvm_lapic_sync_to_vapic(vcpu); + if (unlikely(r) && + !test_and_set_bit(KVM_REQ_KICK, &vcpu->requests)) + smp_send_reschedule(vcpu->cpu); } - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_guest_enter(); @@ -3641,7 +5280,7 @@ static int vcpu_enter_guest(struct kvm_v } trace_kvm_entry(vcpu->vcpu_id); - kvm_x86_ops->run(vcpu, kvm_run); + kvm_x86_ops->run(vcpu); if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { set_debugreg(current->thread.debugreg0, 0); @@ -3652,6 +5291,7 @@ static int vcpu_enter_guest(struct kvm_v set_debugreg(current->thread.debugreg7, 7); } + kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); set_bit(KVM_REQ_KICK, &vcpu->requests); local_irq_enable(); @@ -3669,7 +5309,7 @@ static int vcpu_enter_guest(struct kvm_v preempt_enable(); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); /* * Profile KVM exit RIPs: @@ -3679,18 +5319,19 @@ static int vcpu_enter_guest(struct kvm_v profile_hit(KVM_PROFILING, (void *)rip); } + if (vcpu->arch.apic_attention) + kvm_lapic_sync_from_vapic(vcpu); - kvm_lapic_sync_from_vapic(vcpu); - - r = kvm_x86_ops->handle_exit(kvm_run, vcpu); + r = kvm_x86_ops->handle_exit(vcpu); out: return r; } -static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int __vcpu_run(struct kvm_vcpu *vcpu) { int r; + struct kvm *kvm = vcpu->kvm; if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { pr_debug("vcpu %d received sipi with vector # %x\n", @@ -3702,17 +5343,16 @@ static int __vcpu_run(struct kvm_vcpu *v vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } - down_read(&vcpu->kvm->slots_lock); - vapic_enter(vcpu); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); r = 1; while (r > 0) { if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) - r = vcpu_enter_guest(vcpu, kvm_run); + r = vcpu_enter_guest(vcpu); else { - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) { switch(vcpu->arch.mp_state) { @@ -3736,27 +5376,24 @@ static int __vcpu_run(struct kvm_vcpu *v if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); - if (dm_request_for_irq_injection(vcpu, kvm_run)) { + if (dm_request_for_irq_injection(vcpu)) { r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.request_irq_exits; } if (signal_pending(current)) { r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->run->exit_reason = KVM_EXIT_INTR; ++vcpu->stat.signal_exits; } if (need_resched()) { - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); } } - up_read(&vcpu->kvm->slots_lock); - post_kvm_run_save(vcpu, kvm_run); - - vapic_exit(vcpu); + srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); return r; } @@ -3779,11 +5416,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_v } /* re-sync apic's tpr */ - if (!irqchip_in_kernel(vcpu->kvm)) - kvm_set_cr8(vcpu, kvm_run->cr8); + if (!irqchip_in_kernel(vcpu->kvm)) { + if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { + r = -EINVAL; + goto out; + } + } if (vcpu->arch.pio.cur_count) { + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); r = complete_pio(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r) goto out; } @@ -3793,11 +5436,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_v vcpu->mmio_read_completed = 1; vcpu->mmio_needed = 0; - down_read(&vcpu->kvm->slots_lock); - r = emulate_instruction(vcpu, kvm_run, - vcpu->arch.mmio_fault_cr2, 0, - EMULTYPE_NO_DECODE); - up_read(&vcpu->kvm->slots_lock); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = x86_emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, + EMULTYPE_NO_DECODE, NULL, 0); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); if (r == EMULATE_DO_MMIO) { /* * Read-modify-write. Back to userspace. @@ -3811,9 +5453,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_v kvm_register_write(vcpu, VCPU_REGS_RAX, kvm_run->hypercall.ret); - r = __vcpu_run(vcpu, kvm_run); + r = __vcpu_run(vcpu); out: + post_kvm_run_save(vcpu); if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); @@ -3933,11 +5576,10 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct sregs->gdt.limit = dt.limit; sregs->gdt.base = dt.base; - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - sregs->cr0 = vcpu->arch.cr0; + sregs->cr0 = kvm_read_cr0(vcpu); sregs->cr2 = vcpu->arch.cr2; sregs->cr3 = vcpu->arch.cr3; - sregs->cr4 = vcpu->arch.cr4; + sregs->cr4 = kvm_read_cr4(vcpu); sregs->cr8 = kvm_get_cr8(vcpu); sregs->efer = vcpu->arch.shadow_efer; sregs->apic_base = kvm_get_apic_base(vcpu); @@ -4027,6 +5669,9 @@ static int load_guest_segment_descriptor { struct descriptor_table dtable; u16 index = selector >> 3; + int ret; + u32 err; + gva_t addr; get_segment_descriptor_dtable(vcpu, selector, &dtable); @@ -4034,7 +5679,13 @@ static int load_guest_segment_descriptor kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); return 1; } - return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); + addr = dtable.base + index * 8; + ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc), + vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, addr, err); + + return ret; } /* allowed just for 8 bytes segments */ @@ -4043,20 +5694,21 @@ static int save_guest_segment_descriptor { struct descriptor_table dtable; u16 index = selector >> 3; + int ret; + u32 err; + gva_t addr; get_segment_descriptor_dtable(vcpu, selector, &dtable); if (dtable.limit < index * 8 + 7) return 1; - return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); -} - -static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, - struct desc_struct *seg_desc) -{ - u32 base_addr = get_desc_base(seg_desc); + addr = dtable.base + index*8; + ret = kvm_write_guest_virt_system(addr, seg_desc, + sizeof(*seg_desc), vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, addr, err); - return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); + return ret; } static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) @@ -4067,18 +5719,6 @@ static u16 get_segment_selector(struct k return kvm_seg.selector; } -static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, - u16 selector, - struct kvm_segment *kvm_seg) -{ - struct desc_struct seg_desc; - - if (load_guest_segment_descriptor(vcpu, selector, &seg_desc)) - return 1; - seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg); - return 0; -} - static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment segvar = { @@ -4096,7 +5736,7 @@ static int kvm_load_realmode_segment(str .unusable = 0, }; kvm_x86_ops->set_segment(vcpu, &segvar, seg); - return 0; + return X86EMUL_CONTINUE; } static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) @@ -4106,24 +5746,114 @@ static int is_vm86_segment(struct kvm_vc (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); } -int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg) +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment kvm_seg; + struct desc_struct seg_desc; + u8 dpl, rpl, cpl; + unsigned err_vec = GP_VECTOR; + u32 err_code = 0; + bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */ + int ret; - if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) + if (is_vm86_segment(vcpu, seg) || !(kvm_read_cr0_bits(vcpu, X86_CR0_PE))) return kvm_load_realmode_segment(vcpu, selector, seg); - if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) - return 1; - kvm_seg.type |= type_bits; - if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && - seg != VCPU_SREG_LDTR) - if (!kvm_seg.s) - kvm_seg.unusable = 1; + /* NULL selector is not valid for TR, CS and SS */ + if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR) + && null_selector) + goto exception; + + /* TR should be in GDT only */ + if (seg == VCPU_SREG_TR && (selector & (1 << 2))) + goto exception; + + ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc); + if (ret) + return ret; + + seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg); + + if (null_selector) { /* for NULL selector skip all following checks */ + kvm_seg.unusable = 1; + goto load; + } + + err_code = selector & 0xfffc; + err_vec = GP_VECTOR; + + /* can't load system descriptor into segment selecor */ + if (seg <= VCPU_SREG_GS && !kvm_seg.s) + goto exception; + + if (!kvm_seg.present) { + err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR; + goto exception; + } + + rpl = selector & 3; + dpl = kvm_seg.dpl; + cpl = kvm_x86_ops->get_cpl(vcpu); + + switch (seg) { + case VCPU_SREG_SS: + /* + * segment is not a writable data segment or segment + * selector's RPL != CPL or segment selector's RPL != CPL + */ + if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl) + goto exception; + break; + case VCPU_SREG_CS: + if (!(kvm_seg.type & 8)) + goto exception; + + if (kvm_seg.type & 4) { + /* conforming */ + if (dpl > cpl) + goto exception; + } else { + /* nonconforming */ + if (rpl > cpl || dpl != cpl) + goto exception; + } + /* CS(RPL) <- CPL */ + selector = (selector & 0xfffc) | cpl; + break; + case VCPU_SREG_TR: + if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9)) + goto exception; + break; + case VCPU_SREG_LDTR: + if (kvm_seg.s || kvm_seg.type != 2) + goto exception; + break; + default: /* DS, ES, FS, or GS */ + /* + * segment is not a data or readable code segment or + * ((segment is a data or nonconforming code segment) + * and (both RPL and CPL > DPL)) + */ + if ((kvm_seg.type & 0xa) == 0x8 || + (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl))) + goto exception; + break; + } + if (!kvm_seg.unusable && kvm_seg.s) { + /* mark segment as accessed */ + kvm_seg.type |= 1; + seg_desc.type |= 1; + ret = save_guest_segment_descriptor(vcpu, selector, &seg_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + } +load: kvm_set_segment(vcpu, &kvm_seg, seg); - return 0; + return X86EMUL_CONTINUE; +exception: + kvm_queue_exception_e(vcpu, err_vec, err_code); + return X86EMUL_PROPAGATE_FAULT; } static void save_state_to_tss32(struct kvm_vcpu *vcpu, @@ -4149,9 +5879,19 @@ static void save_state_to_tss32(struct k tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); } +static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg) +{ + struct kvm_segment kvm_seg; + kvm_get_segment(vcpu, &kvm_seg, seg); + kvm_seg.selector = sel; + kvm_set_segment(vcpu, &kvm_seg, seg); +} + static int load_state_from_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) { + int ret; + kvm_set_cr3(vcpu, tss->cr3); kvm_rip_write(vcpu, tss->eip); @@ -4166,26 +5906,49 @@ static int load_state_from_tss32(struct kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); - if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) - return 1; + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); + kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS); + kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS); - if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) - return 1; + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + ret = kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR); + if (ret != X86EMUL_CONTINUE) + return ret; + + ret = kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES); + if (ret != X86EMUL_CONTINUE) + return ret; + + ret = kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS); + if (ret != X86EMUL_CONTINUE) + return ret; + + ret = kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS); + if (ret != X86EMUL_CONTINUE) + return ret; + + ret = kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS); + if (ret != X86EMUL_CONTINUE) + return ret; + + ret = kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS); + if (ret != X86EMUL_CONTINUE) + return ret; + + ret = kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS); + if (ret != X86EMUL_CONTINUE) + return ret; return 0; } @@ -4208,12 +5971,13 @@ static void save_state_to_tss16(struct k tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); - tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR); } static int load_state_from_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss) { + int ret; + kvm_rip_write(vcpu, tss->ip); kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); @@ -4225,20 +5989,39 @@ static int load_state_from_tss16(struct kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); - if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) - return 1; - - if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) - return 1; + /* + * SDM says that segment selectors are loaded before segment + * descriptors + */ + kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR); + kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES); + kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS); + kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS); + kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS); - if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) - return 1; + /* + * Now load segment descriptors. If fault happenes at this stage + * it is handled in a context of new task + */ + ret = kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR); + if (ret != X86EMUL_CONTINUE) + return ret; + + ret = kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES); + if (ret != X86EMUL_CONTINUE) + return ret; + + ret = kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS); + if (ret != X86EMUL_CONTINUE) + return ret; + + ret = kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS); + if (ret != X86EMUL_CONTINUE) + return ret; + + ret = kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS); + if (ret != X86EMUL_CONTINUE) + return ret; return 0; } @@ -4247,104 +6030,206 @@ static int kvm_task_switch_16(struct kvm struct desc_struct *nseg_desc) { struct tss_segment_16 tss_segment_16; - int ret = 0; - - if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16, - sizeof tss_segment_16)) + int ret; + u32 err; + gva_t addr; + u32 new_tss_base = get_desc_base(nseg_desc); + + addr = old_tss_base; + ret = kvm_read_guest_virt_system(addr, &tss_segment_16, + sizeof tss_segment_16, + vcpu, &err); + if (ret != X86EMUL_CONTINUE) + /* FIXME: need to provide precise fault address */ goto out; save_state_to_tss16(vcpu, &tss_segment_16); - if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, - sizeof tss_segment_16)) + ret = kvm_write_guest_virt_system(addr, &tss_segment_16, + sizeof tss_segment_16, + vcpu, &err); + if (ret != X86EMUL_CONTINUE) + /* FIXME: need to provide precise fault address */ goto out; - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), - &tss_segment_16, sizeof tss_segment_16)) + addr = new_tss_base; + ret = kvm_read_guest_virt_system(addr, &tss_segment_16, + sizeof tss_segment_16, + vcpu, &err); + if (ret != X86EMUL_CONTINUE) + /* FIXME: need to provide precise fault address */ goto out; if (old_tss_sel != 0xffff) { tss_segment_16.prev_task_link = old_tss_sel; - if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), - &tss_segment_16.prev_task_link, - sizeof tss_segment_16.prev_task_link)) + ret = kvm_write_guest_virt_system(addr, + &tss_segment_16.prev_task_link, + sizeof tss_segment_16.prev_task_link, + vcpu, &err); + if (ret != X86EMUL_CONTINUE) + /* FIXME: need to provide precise fault address */ goto out; } - if (load_state_from_tss16(vcpu, &tss_segment_16)) - goto out; + ret = load_state_from_tss16(vcpu, &tss_segment_16); + if (ret != X86EMUL_CONTINUE) + /* Fault already injected, exit. */ + return ret; - ret = 1; out: + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, addr, err); return ret; } static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, u16 old_tss_sel, u32 old_tss_base, - struct desc_struct *nseg_desc) + struct desc_struct *nseg_desc, + bool has_error_code, u32 error_code) { struct tss_segment_32 tss_segment_32; - int ret = 0; - - if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, - sizeof tss_segment_32)) + int ret; + u32 err; + gva_t addr; + u32 new_tss_base = get_desc_base(nseg_desc); + + addr = old_tss_base; + ret = kvm_read_guest_virt_system(addr, &tss_segment_32, + sizeof tss_segment_32, + vcpu, &err); + if (ret != X86EMUL_CONTINUE) + /* FIXME: need to provide precise fault address */ goto out; save_state_to_tss32(vcpu, &tss_segment_32); - if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, - sizeof tss_segment_32)) + ret = kvm_write_guest_virt_system(addr, &tss_segment_32, + sizeof tss_segment_32, + vcpu, &err); + if (ret != X86EMUL_CONTINUE) + /* FIXME: need to provide precise fault address */ goto out; - if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), - &tss_segment_32, sizeof tss_segment_32)) + addr = new_tss_base; + ret = kvm_read_guest_virt_system(addr, &tss_segment_32, + sizeof tss_segment_32, + vcpu, &err); + if (ret != X86EMUL_CONTINUE) + /* FIXME: need to provide precise fault address */ goto out; if (old_tss_sel != 0xffff) { tss_segment_32.prev_task_link = old_tss_sel; - if (kvm_write_guest(vcpu->kvm, - get_tss_base_addr(vcpu, nseg_desc), - &tss_segment_32.prev_task_link, - sizeof tss_segment_32.prev_task_link)) + ret = kvm_write_guest_virt_system(addr, + &tss_segment_32.prev_task_link, + sizeof tss_segment_32.prev_task_link, + vcpu, &err); + if (ret != X86EMUL_CONTINUE) + /* FIXME: need to provide precise fault address */ goto out; } - if (load_state_from_tss32(vcpu, &tss_segment_32)) - goto out; + ret = load_state_from_tss32(vcpu, &tss_segment_32); + if (ret != X86EMUL_CONTINUE) + /* Fault already injected, exit. */ + return ret; + + if (has_error_code) { + /* put error on a stack of the new task */ + gva_t sp = kvm_register_read(vcpu, VCPU_REGS_RSP) - 4; + u32 limit; + struct kvm_segment ss; + + kvm_get_segment(vcpu, &ss, VCPU_SREG_SS); + limit = (ss.g ? (ss.limit << 12) | 0xfff : ss.limit) + 1; + addr = sp + ss.base; + if (limit) + addr %= limit; + ret = kvm_write_guest_virt_system(addr, &error_code, 4, vcpu, &err); + if (ret != X86EMUL_CONTINUE) + goto out; + + kvm_register_write(vcpu, VCPU_REGS_RSP, sp); + } - ret = 1; out: + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, addr, err); + return ret; +} + +static int read_interrupt_descriptor(struct kvm_vcpu *vcpu, + u16 index, struct desc_struct *desc) +{ + struct descriptor_table dt; + int ret; + u32 err; + gva_t addr; + + kvm_x86_ops->get_idt(vcpu, &dt); + + if (dt.limit < index * 8 + 7) { + kvm_queue_exception_e(vcpu, GP_VECTOR, (index << 3) | 0x2); + return 1; + } + addr = dt.base + index * 8; + ret = kvm_read_guest_virt_system(addr, desc, sizeof(*desc), + vcpu, &err); + if (ret == X86EMUL_PROPAGATE_FAULT) + kvm_inject_page_fault(vcpu, addr, err); + return ret; } -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, + int idt_index, int reason, + bool has_error_code, u32 error_code) { struct kvm_segment tr_seg; struct desc_struct cseg_desc; struct desc_struct nseg_desc; - int ret = 0; + int ret; u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); - old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); - - /* FIXME: Handle errors. Failure to read either TSS or their - * descriptors should generate a pagefault. - */ if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) - goto out; + return 0; if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) - goto out; + return 0; - if (reason != TASK_SWITCH_IRET) { - int cpl; + /* + * Check privileges. The three cases are task switch caused by... + * + * 1. jmp/call/int to task gate: Check against DPL of the task gate + * 2. Exception/IRQ/iret: No check is performed + * 3. jmp/call to TSS: Check agains DPL of the TSS + */ + if (reason == TASK_SWITCH_GATE) { + if (idt_index != -1) { + /* Software interrupts */ + struct desc_struct task_gate_desc; + int dpl, cpl; + + if (read_interrupt_descriptor(vcpu, idt_index, + &task_gate_desc)) + return 0; + + dpl = task_gate_desc.dpl; + cpl = kvm_x86_ops->get_cpl(vcpu); + if ((tss_selector & 3) > dpl || cpl > dpl) { + kvm_queue_exception_e(vcpu, GP_VECTOR, (idt_index << 3) | 0x2); + return 1; + } + } + } else if (reason != TASK_SWITCH_IRET) { + int dpl, cpl; + dpl = nseg_desc.dpl; cpl = kvm_x86_ops->get_cpl(vcpu); - if ((tss_selector & 3) > nseg_desc.dpl || cpl > nseg_desc.dpl) { + if ((tss_selector & 3) > dpl || cpl > dpl) { kvm_queue_exception_e(vcpu, GP_VECTOR, 0); return 1; } @@ -4357,7 +6242,10 @@ int kvm_task_switch(struct kvm_vcpu *vcp if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { cseg_desc.type &= ~(1 << 1); //clear the B flag - save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); + ret = save_guest_segment_descriptor(vcpu, old_tss_sel, + &cseg_desc); + if (ret != X86EMUL_CONTINUE) + return 1; } if (reason == TASK_SWITCH_IRET) { @@ -4377,11 +6265,16 @@ int kvm_task_switch(struct kvm_vcpu *vcp if (nseg_desc.type & 8) ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, - old_tss_base, &nseg_desc); + old_tss_base, &nseg_desc, + has_error_code, error_code); else ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel, old_tss_base, &nseg_desc); + /* Proceed in new task if a page fault happened, otherwise exit. */ + if (ret == X86EMUL_UNHANDLEABLE) + return 0; + if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { u32 eflags = kvm_x86_ops->get_rflags(vcpu); kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); @@ -4393,12 +6286,11 @@ int kvm_task_switch(struct kvm_vcpu *vcp &nseg_desc); } - kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); + kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS); seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); tr_seg.type = 11; kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); -out: - return ret; + return 1; } EXPORT_SYMBOL_GPL(kvm_task_switch); @@ -4407,10 +6299,16 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct { int mmu_reset_needed = 0; int pending_vec, max_bits; + int ret = 0; struct descriptor_table dt; vcpu_load(vcpu); + if (!guest_cpuid_has_xsave(vcpu) && (sregs->cr4 & X86_CR4_OSXSAVE)) { + ret = -EINVAL; + goto out; + } + dt.limit = sregs->idt.limit; dt.base = sregs->idt.base; kvm_x86_ops->set_idt(vcpu, &dt); @@ -4428,14 +6326,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_x86_ops->set_efer(vcpu, sregs->efer); kvm_set_apic_base(vcpu, sregs->apic_base); - kvm_x86_ops->decache_cr4_guest_bits(vcpu); - mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0; + mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; kvm_x86_ops->set_cr0(vcpu, sregs->cr0); vcpu->arch.cr0 = sregs->cr0; - mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; + mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; kvm_x86_ops->set_cr4(vcpu, sregs->cr4); + if (sregs->cr4 & X86_CR4_OSXSAVE) + update_cpuid(vcpu); if (!is_long_mode(vcpu) && is_pae(vcpu)) load_pdptrs(vcpu, vcpu->arch.cr3); @@ -4467,12 +6366,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct /* Older userspace won't unhalt the vcpu on reset. */ if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && - !(vcpu->arch.cr0 & X86_CR0_PE)) + !(kvm_read_cr0_bits(vcpu, X86_CR0_PE))) vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; +out: vcpu_put(vcpu); - return 0; + return ret; } int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, @@ -4507,27 +6407,6 @@ int kvm_arch_vcpu_ioctl_set_guest_debug( } /* - * fxsave fpu state. Taken from x86_64/processor.h. To be killed when - * we have asm/x86/processor.h - */ -struct fxsave { - u16 cwd; - u16 swd; - u16 twd; - u16 fop; - u64 rip; - u64 rdp; - u32 mxcsr; - u32 mxcsr_mask; - u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ -#ifdef CONFIG_X86_64 - u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ -#else - u32 xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */ -#endif -}; - -/* * Translate a guest virtual address to a guest physical address. */ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, @@ -4535,11 +6414,12 @@ int kvm_arch_vcpu_ioctl_translate(struct { unsigned long vaddr = tr->linear_address; gpa_t gpa; + int idx; vcpu_load(vcpu); - down_read(&vcpu->kvm->slots_lock); - gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); - up_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); + gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); + srcu_read_unlock(&vcpu->kvm->srcu, idx); tr->physical_address = gpa; tr->valid = gpa != UNMAPPED_GVA; tr->writeable = 1; @@ -4551,7 +6431,8 @@ int kvm_arch_vcpu_ioctl_translate(struct int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; + struct i387_fxsave_struct *fxsave = + &vcpu->arch.guest_fpu.state->fxsave; vcpu_load(vcpu); @@ -4571,7 +6452,8 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct k int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image; + struct i387_fxsave_struct *fxsave = + &vcpu->arch.guest_fpu.state->fxsave; vcpu_load(vcpu); @@ -4591,63 +6473,60 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct k void fx_init(struct kvm_vcpu *vcpu) { - unsigned after_mxcsr_mask; + fpu_alloc(&vcpu->arch.guest_fpu); + fpu_finit(&vcpu->arch.guest_fpu); /* - * Touch the fpu the first time in non atomic context as if - * this is the first fpu instruction the exception handler - * will fire before the instruction returns and it'll have to - * allocate ram with GFP_KERNEL. + * Ensure guest xcr0 is valid for loading */ - if (!used_math()) - kvm_fx_save(&vcpu->arch.host_fx_image); - - /* Initialize guest FPU by resetting ours and saving into guest's */ - preempt_disable(); - kvm_fx_save(&vcpu->arch.host_fx_image); - kvm_fx_finit(); - kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); - preempt_enable(); + vcpu->arch.xcr0 = XSTATE_FP; vcpu->arch.cr0 |= X86_CR0_ET; - after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space); - vcpu->arch.guest_fx_image.mxcsr = 0x1f80; - memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask, - 0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask); } EXPORT_SYMBOL_GPL(fx_init); +static void fx_free(struct kvm_vcpu *vcpu) +{ + fpu_free(&vcpu->arch.guest_fpu); +} + void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) return; + /* + * Restore all possible states in the guest, + * and assume host would use all available bits. + * Guest xcr0 would be loaded later. + */ + kvm_put_guest_xcr0(vcpu); vcpu->guest_fpu_loaded = 1; - kvm_fx_save(&vcpu->arch.host_fx_image); - kvm_fx_restore(&vcpu->arch.guest_fx_image); + kernel_fpu_begin(); + fpu_restore_checking(&vcpu->arch.guest_fpu); } EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { + kvm_put_guest_xcr0(vcpu); + if (!vcpu->guest_fpu_loaded) return; vcpu->guest_fpu_loaded = 0; - kvm_fx_save(&vcpu->arch.guest_fx_image); - kvm_fx_restore(&vcpu->arch.host_fx_image); + fpu_save_init(&vcpu->arch.guest_fpu); + kernel_fpu_end(); ++vcpu->stat.fpu_reload; + set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests); } EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) { - if (vcpu->arch.time_page) { - kvm_release_page_dirty(vcpu->arch.time_page); - vcpu->arch.time_page = NULL; - } + kvmclock_reset(vcpu); + fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -4661,9 +6540,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu { int r; - /* We do fxsave: this must be aligned. */ - BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF); - vcpu->arch.mtrr_state.have_fixed = 1; vcpu_load(vcpu); r = kvm_arch_vcpu_reset(vcpu); @@ -4679,12 +6555,31 @@ free_vcpu: return r; } +int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ + struct msr_data msr; + struct kvm *kvm = vcpu->kvm; + + vcpu_load(vcpu); + msr.data = 0x0; + msr.index = MSR_IA32_TSC; + msr.host_initiated = true; + kvm_write_tsc(vcpu, &msr); + vcpu_put(vcpu); + + schedule_delayed_work(&kvm->arch.kvmclock_sync_work, + KVMCLOCK_SYNC_PERIOD); + + return 0; +} + void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { vcpu_load(vcpu); kvm_mmu_unload(vcpu); vcpu_put(vcpu); + fx_free(vcpu); kvm_x86_ops->vcpu_free(vcpu); } @@ -4694,21 +6589,102 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu vcpu->arch.nmi_injected = false; vcpu->arch.switch_db_regs = 0; + vcpu->arch.st.msr_val = 0; memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); vcpu->arch.dr6 = DR6_FIXED_1; vcpu->arch.dr7 = DR7_FIXED_1; + kvmclock_reset(vcpu); + kvm_pmu_reset(vcpu); + return kvm_x86_ops->vcpu_reset(vcpu); } -void kvm_arch_hardware_enable(void *garbage) +int kvm_arch_hardware_enable(void *garbage) { - kvm_x86_ops->hardware_enable(garbage); + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i, rc; + u64 local_tsc; + u64 max_tsc = 0; + bool stable, backwards_tsc = false; + + kvm_shared_msr_cpu_online(); + local_tsc = native_read_tsc(); + stable = !check_tsc_unstable(); + rc = kvm_x86_ops->hardware_enable(garbage); + if (rc) + return rc; + + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!stable && vcpu->cpu == smp_processor_id()) + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + if (stable && vcpu->arch.last_host_tsc > local_tsc) { + backwards_tsc = true; + if (vcpu->arch.last_host_tsc > max_tsc) + max_tsc = vcpu->arch.last_host_tsc; + } + } + } + + /* + * Sometimes, even reliable TSCs go backwards. This happens on + * platforms that reset TSC during suspend or hibernate actions, but + * maintain synchronization. We must compensate. Fortunately, we can + * detect that condition here, which happens early in CPU bringup, + * before any KVM threads can be running. Unfortunately, we can't + * bring the TSCs fully up to date with real time, as we aren't yet far + * enough into CPU bringup that we know how much real time has actually + * elapsed; our helper function, get_kernel_ns() will be using boot + * variables that haven't been updated yet. + * + * So we simply find the maximum observed TSC above, then record the + * adjustment to TSC in each VCPU. When the VCPU later gets loaded, + * the adjustment will be applied. Note that we accumulate + * adjustments, in case multiple suspend cycles happen before some VCPU + * gets a chance to run again. In the event that no KVM threads get a + * chance to run, we will miss the entire elapsed period, as we'll have + * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may + * loose cycle time. The unlikeliness of that event and the difficulty + * of synchronizing the adjustment once we have have KVM threads + * running make this simpler detection logic preferable. Even then, + * observe that the TSC will not go backwards (as no threads have run, + * and this makes invariant the fact that if any KVM threads run, all + * TSCs will be adjusted at the next suspend / hibernate action). It + * is possible that the second hibernate recovery happens much faster + * than the previous, causing the observed TSC here to be smaller; this + * would require additional padding adjustment, which is why we set + * last_host_tsc to the TSC observed here. + * + * N.B. - this code below runs only on platforms with reliable TSC, + * as that is the only way backwards_tsc is set above. Also note + * that this runs for ALL vcpus, which is not a bug; all VCPUs should + * have the same delta_cyc adjustment applied if backwards_tsc + * is detected. Note further, this adjustment is only done once, + * as we reset last_host_tsc to zero on all VCPUs to stop this from + * being called multiple times (one for each physical CPU bringup). + * + * Platforms with unnreliable TSCs will be compensated by the logic in + * vcpu_load, which sets the TSC to catchup mode. This will catchup + * all VCPUs to real time, but cannot guarantee that they stay in + * perfect synchronization. Fix thou thine TSC and all shalt be well. + */ + if (backwards_tsc) { + u64 delta_cyc = max_tsc - local_tsc; + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu->arch.tsc_offset_adjustment += delta_cyc; + vcpu->arch.last_host_tsc = local_tsc; + } + } + return 0; } void kvm_arch_hardware_disable(void *garbage) { kvm_x86_ops->hardware_disable(garbage); + drop_user_return_notifiers(garbage); } int kvm_arch_hardware_setup(void) @@ -4726,6 +6702,11 @@ void kvm_arch_check_processor_compat(voi kvm_x86_ops->check_processor_compatibility(rtn); } +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) +{ + return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct page *page; @@ -4748,6 +6729,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu * } vcpu->arch.pio_data = page_address(page); + kvm_init_tsc_catchup(vcpu, max_tsc_khz); + r = kvm_mmu_create(vcpu); if (r < 0) goto fail_free_pio_data; @@ -4762,12 +6745,16 @@ int kvm_arch_vcpu_init(struct kvm_vcpu * GFP_KERNEL); if (!vcpu->arch.mce_banks) { r = -ENOMEM; - goto fail_mmu_destroy; + goto fail_free_lapic; } vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; - return 0; + vcpu->arch.pv_time_enabled = false; + kvm_pmu_init(vcpu); + return 0; +fail_free_lapic: + kvm_free_lapic(vcpu); fail_mmu_destroy: kvm_mmu_destroy(vcpu); fail_free_pio_data: @@ -4778,13 +6765,22 @@ fail: void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) { + int idx; + + kvm_pmu_destroy(vcpu); + kfree(vcpu->arch.mce_banks); kvm_free_lapic(vcpu); - down_read(&vcpu->kvm->slots_lock); + idx = srcu_read_lock(&vcpu->kvm->srcu); kvm_mmu_destroy(vcpu); - up_read(&vcpu->kvm->slots_lock); + srcu_read_unlock(&vcpu->kvm->srcu, idx); free_page((unsigned long)vcpu->arch.pio_data); } +void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) +{ + kvm_x86_ops->sched_in(vcpu, cpu); +} + struct kvm *kvm_arch_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); @@ -4792,13 +6788,22 @@ struct kvm *kvm_arch_create_vm(void) if (!kvm) return ERR_PTR(-ENOMEM); + kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL); + if (!kvm->arch.aliases) { + kfree(kvm); + return ERR_PTR(-ENOMEM); + } + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); - rdtscll(kvm->arch.vm_init_tsc); + spin_lock_init(&kvm->arch.tsc_write_lock); + + INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); + INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); return kvm; } @@ -4833,13 +6838,15 @@ static void kvm_free_vcpus(struct kvm *k void kvm_arch_sync_events(struct kvm *kvm) { + cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); + cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); kvm_free_all_assigned_devices(kvm); + kvm_free_pit(kvm); } void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_iommu_unmap_guest(kvm); - kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); kvm_free_vcpus(kvm); @@ -4848,16 +6855,23 @@ void kvm_arch_destroy_vm(struct kvm *kvm put_page(kvm->arch.apic_access_page); if (kvm->arch.ept_identity_pagetable) put_page(kvm->arch.ept_identity_pagetable); + cleanup_srcu_struct(&kvm->srcu); + kfree(kvm->arch.aliases); kfree(kvm); } -int kvm_arch_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem, int user_alloc) { - int npages = mem->memory_size >> PAGE_SHIFT; - struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot]; + int npages = memslot->npages; + int map_flags = MAP_PRIVATE | MAP_ANONYMOUS; + + /* Prevent internal slot pages from being moved by fork()/COW. */ + if (memslot->id >= KVM_MEMORY_SLOTS) + map_flags = MAP_SHARED | MAP_ANONYMOUS; /*To keep backward compatibility with older userspace, *x86 needs to hanlde !user_alloc case. @@ -4870,43 +6884,50 @@ int kvm_arch_set_memory_region(struct kv userspace_addr = do_mmap(NULL, 0, npages * PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, + map_flags, 0); up_write(¤t->mm->mmap_sem); if (IS_ERR((void *)userspace_addr)) return PTR_ERR((void *)userspace_addr); - /* set userspace_addr atomically for kvm_hva_to_rmapp */ - spin_lock(&kvm->mmu_lock); memslot->userspace_addr = userspace_addr; - spin_unlock(&kvm->mmu_lock); - } else { - if (!old.user_alloc && old.rmap) { - int ret; - - down_write(¤t->mm->mmap_sem); - ret = do_munmap(current->mm, old.userspace_addr, - old.npages * PAGE_SIZE); - up_write(¤t->mm->mmap_sem); - if (ret < 0) - printk(KERN_WARNING - "kvm_vm_ioctl_set_memory_region: " - "failed to munmap memory\n"); - } } } + + return 0; +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old, + int user_alloc) +{ + + int npages = mem->memory_size >> PAGE_SHIFT; + + if (!user_alloc && !old.user_alloc && old.rmap && !npages) { + int ret; + + down_write(¤t->mm->mmap_sem); + ret = do_munmap(current->mm, old.userspace_addr, + old.npages * PAGE_SIZE); + up_write(¤t->mm->mmap_sem); + if (ret < 0) + printk(KERN_WARNING + "kvm_vm_ioctl_set_memory_region: " + "failed to munmap memory\n"); + } + spin_lock(&kvm->mmu_lock); if (!kvm->arch.n_requested_mmu_pages) { unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); } - kvm_mmu_slot_remove_write_access(kvm, mem->slot); + kvm_mmu_slot_remove_write_access(kvm, mem->slot, true); spin_unlock(&kvm->mmu_lock); - - return 0; } void kvm_arch_flush_shadow(struct kvm *kvm) @@ -4951,3 +6972,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_vir EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); diff -uprN linux-2.6.32/arch/x86/kvm/x86.h linux-2.6.32.ovz/arch/x86/kvm/x86.h --- linux-2.6.32/arch/x86/kvm/x86.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/kvm/x86.h 2015-03-10 13:24:02.000000000 -0700 @@ -32,7 +32,28 @@ static inline bool kvm_exception_is_soft return (nr == BP_VECTOR) || (nr == OF_VECTOR); } +static inline u32 bit(int bitno) +{ + return 1 << (bitno & 31); +} + +struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, + u32 function, u32 index); + +static inline bool guest_cpuid_has_pcid(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 1, 0); + return best && (best->ecx & bit(X86_FEATURE_PCID)); +} + struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, u32 function, u32 index); +void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); + +void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); +void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); + #endif diff -uprN linux-2.6.32/arch/x86/lib/cache-smp.c linux-2.6.32.ovz/arch/x86/lib/cache-smp.c --- linux-2.6.32/arch/x86/lib/cache-smp.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/lib/cache-smp.c 2015-03-10 13:21:18.000000000 -0700 @@ -0,0 +1,20 @@ +#include +#include + +static void __wbinvd(void *dummy) +{ + wbinvd(); +} + +void wbinvd_on_cpu(int cpu) +{ + smp_call_function_single(cpu, __wbinvd, NULL, 1); +} +EXPORT_SYMBOL(wbinvd_on_cpu); + +int wbinvd_on_all_cpus(void) +{ + return on_each_cpu(__wbinvd, NULL, 1); +} +EXPORT_SYMBOL(wbinvd_on_all_cpus); + diff -uprN linux-2.6.32/arch/x86/lib/clear_page_64.S linux-2.6.32.ovz/arch/x86/lib/clear_page_64.S --- linux-2.6.32/arch/x86/lib/clear_page_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/lib/clear_page_64.S 2015-03-10 13:22:35.000000000 -0700 @@ -1,5 +1,6 @@ #include #include +#include /* * Zero a page. @@ -14,6 +15,15 @@ ENTRY(clear_page_c) CFI_ENDPROC ENDPROC(clear_page_c) +ENTRY(clear_page_c_e) + CFI_STARTPROC + movl $4096,%ecx + xorl %eax,%eax + rep stosb + ret + CFI_ENDPROC +ENDPROC(clear_page_c_e) + ENTRY(clear_page) CFI_STARTPROC xorl %eax,%eax @@ -38,21 +48,26 @@ ENTRY(clear_page) .Lclear_page_end: ENDPROC(clear_page) - /* Some CPUs run faster using the string instructions. - It is also a lot simpler. Use this when possible */ + /* + * Some CPUs support enhanced REP MOVSB/STOSB instructions. + * It is recommended to use this when possible. + * If enhanced REP MOVSB/STOSB is not available, try to use fast string. + * Otherwise, use original function. + * + */ #include .section .altinstr_replacement,"ax" 1: .byte 0xeb /* jmp */ .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ -2: +2: .byte 0xeb /* jmp */ + .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ +3: .previous .section .altinstructions,"a" - .align 8 - .quad clear_page - .quad 1b - .byte X86_FEATURE_REP_GOOD - .byte .Lclear_page_end - clear_page - .byte 2b - 1b + altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\ + .Lclear_page_end-clear_page, 2b-1b + altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \ + .Lclear_page_end-clear_page,3b-2b .previous diff -uprN linux-2.6.32/arch/x86/lib/copy_page_64.S linux-2.6.32.ovz/arch/x86/lib/copy_page_64.S --- linux-2.6.32/arch/x86/lib/copy_page_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/lib/copy_page_64.S 2015-03-10 13:22:35.000000000 -0700 @@ -113,7 +113,7 @@ ENDPROC(copy_page) .align 8 .quad copy_page .quad 1b - .byte X86_FEATURE_REP_GOOD + .word X86_FEATURE_REP_GOOD .byte .Lcopy_page_end - copy_page .byte 2b - 1b .previous diff -uprN linux-2.6.32/arch/x86/lib/copy_user_64.S linux-2.6.32.ovz/arch/x86/lib/copy_user_64.S --- linux-2.6.32/arch/x86/lib/copy_user_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/lib/copy_user_64.S 2015-03-10 13:22:35.000000000 -0700 @@ -15,23 +15,30 @@ #include #include #include +#include - .macro ALTERNATIVE_JUMP feature,orig,alt +/* + * By placing feature2 after feature1 in altinstructions section, we logically + * implement: + * If CPU has feature2, jmp to alt2 is used + * else if CPU has feature1, jmp to alt1 is used + * else jmp to orig is used. + */ + .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2 0: .byte 0xe9 /* 32bit jump */ .long \orig-1f /* by default jump to orig */ 1: .section .altinstr_replacement,"ax" 2: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt-1b /* offset */ /* or alternatively to alt */ + .long \alt1-1b /* offset */ /* or alternatively to alt1 */ +3: .byte 0xe9 /* near jump with 32bit immediate */ + .long \alt2-1b /* offset */ /* or alternatively to alt2 */ .previous + .section .altinstructions,"a" - .align 8 - .quad 0b - .quad 2b - .byte \feature /* when feature is set */ - .byte 5 - .byte 5 + altinstruction_entry 0b,2b,\feature1,5,5 + altinstruction_entry 0b,3b,\feature2,5,5 .previous .endm @@ -73,7 +80,9 @@ ENTRY(copy_to_user) jc bad_to_user cmpq TI_addr_limit(%rax),%rcx jae bad_to_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ + copy_user_generic_unrolled,copy_user_generic_string, \ + copy_user_enhanced_fast_string CFI_ENDPROC ENDPROC(copy_to_user) @@ -86,22 +95,12 @@ ENTRY(copy_from_user) jc bad_from_user cmpq TI_addr_limit(%rax),%rcx jae bad_from_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string + ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ + copy_user_generic_unrolled,copy_user_generic_string, \ + copy_user_enhanced_fast_string CFI_ENDPROC ENDPROC(copy_from_user) -ENTRY(copy_user_generic) - CFI_STARTPROC - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC -ENDPROC(copy_user_generic) - -ENTRY(__copy_from_user_inatomic) - CFI_STARTPROC - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string - CFI_ENDPROC -ENDPROC(__copy_from_user_inatomic) - .section .fixup,"ax" /* must zero dest */ ENTRY(bad_from_user) @@ -267,3 +266,37 @@ ENTRY(copy_user_generic_string) .previous CFI_ENDPROC ENDPROC(copy_user_generic_string) + +/* + * Some CPUs are adding enhanced REP MOVSB/STOSB instructions. + * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ +ENTRY(copy_user_enhanced_fast_string) + CFI_STARTPROC + andl %edx,%edx + jz 2f + movl %edx,%ecx +1: rep + movsb +2: xorl %eax,%eax + ret + + .section .fixup,"ax" +12: movl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous + + .section __ex_table,"a" + .align 8 + .quad 1b,12b + .previous + CFI_ENDPROC +ENDPROC(copy_user_enhanced_fast_string) diff -uprN linux-2.6.32/arch/x86/lib/inat.c linux-2.6.32.ovz/arch/x86/lib/inat.c --- linux-2.6.32/arch/x86/lib/inat.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/lib/inat.c 2015-03-10 13:21:31.000000000 -0700 @@ -0,0 +1,90 @@ +/* + * x86 instruction attribute tables + * + * Written by Masami Hiramatsu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + */ +#include + +/* Attribute tables are generated from opcode map */ +#include "inat-tables.c" + +/* Attribute search APIs */ +insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode) +{ + return inat_primary_table[opcode]; +} + +insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, insn_byte_t last_pfx, + insn_attr_t esc_attr) +{ + const insn_attr_t *table; + insn_attr_t lpfx_attr; + int n, m = 0; + + n = inat_escape_id(esc_attr); + if (last_pfx) { + lpfx_attr = inat_get_opcode_attribute(last_pfx); + m = inat_last_prefix_id(lpfx_attr); + } + table = inat_escape_tables[n][0]; + if (!table) + return 0; + if (inat_has_variant(table[opcode]) && m) { + table = inat_escape_tables[n][m]; + if (!table) + return 0; + } + return table[opcode]; +} + +insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx, + insn_attr_t grp_attr) +{ + const insn_attr_t *table; + insn_attr_t lpfx_attr; + int n, m = 0; + + n = inat_group_id(grp_attr); + if (last_pfx) { + lpfx_attr = inat_get_opcode_attribute(last_pfx); + m = inat_last_prefix_id(lpfx_attr); + } + table = inat_group_tables[n][0]; + if (!table) + return inat_group_common_attribute(grp_attr); + if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && m) { + table = inat_group_tables[n][m]; + if (!table) + return inat_group_common_attribute(grp_attr); + } + return table[X86_MODRM_REG(modrm)] | + inat_group_common_attribute(grp_attr); +} + +insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, + insn_byte_t vex_p) +{ + const insn_attr_t *table; + if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX) + return 0; + table = inat_avx_tables[vex_m][vex_p]; + if (!table) + return 0; + return table[opcode]; +} + diff -uprN linux-2.6.32/arch/x86/lib/insn.c linux-2.6.32.ovz/arch/x86/lib/insn.c --- linux-2.6.32/arch/x86/lib/insn.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/lib/insn.c 2015-03-10 13:21:31.000000000 -0700 @@ -0,0 +1,516 @@ +/* + * x86 instruction analysis + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004, 2009 + */ + +#include +#include +#include + +#define get_next(t, insn) \ + ({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; }) + +#define peek_next(t, insn) \ + ({t r; r = *(t*)insn->next_byte; r; }) + +#define peek_nbyte_next(t, insn, n) \ + ({t r; r = *(t*)((insn)->next_byte + n); r; }) + +/** + * insn_init() - initialize struct insn + * @insn: &struct insn to be initialized + * @kaddr: address (in kernel memory) of instruction (or copy thereof) + * @x86_64: !0 for 64-bit kernel or 64-bit app + */ +void insn_init(struct insn *insn, const void *kaddr, int x86_64) +{ + memset(insn, 0, sizeof(*insn)); + insn->kaddr = kaddr; + insn->next_byte = kaddr; + insn->x86_64 = x86_64 ? 1 : 0; + insn->opnd_bytes = 4; + if (x86_64) + insn->addr_bytes = 8; + else + insn->addr_bytes = 4; +} + +/** + * insn_get_prefixes - scan x86 instruction prefix bytes + * @insn: &struct insn containing instruction + * + * Populates the @insn->prefixes bitmap, and updates @insn->next_byte + * to point to the (first) opcode. No effect if @insn->prefixes.got + * is already set. + */ +void insn_get_prefixes(struct insn *insn) +{ + struct insn_field *prefixes = &insn->prefixes; + insn_attr_t attr; + insn_byte_t b, lb; + int i, nb; + + if (prefixes->got) + return; + + nb = 0; + lb = 0; + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + while (inat_is_legacy_prefix(attr)) { + /* Skip if same prefix */ + for (i = 0; i < nb; i++) + if (prefixes->bytes[i] == b) + goto found; + if (nb == 4) + /* Invalid instruction */ + break; + prefixes->bytes[nb++] = b; + if (inat_is_address_size_prefix(attr)) { + /* address size switches 2/4 or 4/8 */ + if (insn->x86_64) + insn->addr_bytes ^= 12; + else + insn->addr_bytes ^= 6; + } else if (inat_is_operand_size_prefix(attr)) { + /* oprand size switches 2/4 */ + insn->opnd_bytes ^= 6; + } +found: + prefixes->nbytes++; + insn->next_byte++; + lb = b; + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + } + /* Set the last prefix */ + if (lb && lb != insn->prefixes.bytes[3]) { + if (unlikely(insn->prefixes.bytes[3])) { + /* Swap the last prefix */ + b = insn->prefixes.bytes[3]; + for (i = 0; i < nb; i++) + if (prefixes->bytes[i] == lb) + prefixes->bytes[i] = b; + } + insn->prefixes.bytes[3] = lb; + } + + /* Decode REX prefix */ + if (insn->x86_64) { + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + if (inat_is_rex_prefix(attr)) { + insn->rex_prefix.value = b; + insn->rex_prefix.nbytes = 1; + insn->next_byte++; + if (X86_REX_W(b)) + /* REX.W overrides opnd_size */ + insn->opnd_bytes = 8; + } + } + insn->rex_prefix.got = 1; + + /* Decode VEX prefix */ + b = peek_next(insn_byte_t, insn); + attr = inat_get_opcode_attribute(b); + if (inat_is_vex_prefix(attr)) { + insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); + if (!insn->x86_64) { + /* + * In 32-bits mode, if the [7:6] bits (mod bits of + * ModRM) on the second byte are not 11b, it is + * LDS or LES. + */ + if (X86_MODRM_MOD(b2) != 3) + goto vex_end; + } + insn->vex_prefix.bytes[0] = b; + insn->vex_prefix.bytes[1] = b2; + if (inat_is_vex3_prefix(attr)) { + b2 = peek_nbyte_next(insn_byte_t, insn, 2); + insn->vex_prefix.bytes[2] = b2; + insn->vex_prefix.nbytes = 3; + insn->next_byte += 3; + if (insn->x86_64 && X86_VEX_W(b2)) + /* VEX.W overrides opnd_size */ + insn->opnd_bytes = 8; + } else { + insn->vex_prefix.nbytes = 2; + insn->next_byte += 2; + } + } +vex_end: + insn->vex_prefix.got = 1; + + prefixes->got = 1; + return; +} + +/** + * insn_get_opcode - collect opcode(s) + * @insn: &struct insn containing instruction + * + * Populates @insn->opcode, updates @insn->next_byte to point past the + * opcode byte(s), and set @insn->attr (except for groups). + * If necessary, first collects any preceding (prefix) bytes. + * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got + * is already 1. + */ +void insn_get_opcode(struct insn *insn) +{ + struct insn_field *opcode = &insn->opcode; + insn_byte_t op, pfx; + if (opcode->got) + return; + if (!insn->prefixes.got) + insn_get_prefixes(insn); + + /* Get first opcode */ + op = get_next(insn_byte_t, insn); + opcode->bytes[0] = op; + opcode->nbytes = 1; + + /* Check if there is VEX prefix or not */ + if (insn_is_avx(insn)) { + insn_byte_t m, p; + m = insn_vex_m_bits(insn); + p = insn_vex_p_bits(insn); + insn->attr = inat_get_avx_attribute(op, m, p); + if (!inat_accept_vex(insn->attr)) + insn->attr = 0; /* This instruction is bad */ + goto end; /* VEX has only 1 byte for opcode */ + } + + insn->attr = inat_get_opcode_attribute(op); + while (inat_is_escape(insn->attr)) { + /* Get escaped opcode */ + op = get_next(insn_byte_t, insn); + opcode->bytes[opcode->nbytes++] = op; + pfx = insn_last_prefix(insn); + insn->attr = inat_get_escape_attribute(op, pfx, insn->attr); + } + if (inat_must_vex(insn->attr)) + insn->attr = 0; /* This instruction is bad */ +end: + opcode->got = 1; +} + +/** + * insn_get_modrm - collect ModRM byte, if any + * @insn: &struct insn containing instruction + * + * Populates @insn->modrm and updates @insn->next_byte to point past the + * ModRM byte, if any. If necessary, first collects the preceding bytes + * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1. + */ +void insn_get_modrm(struct insn *insn) +{ + struct insn_field *modrm = &insn->modrm; + insn_byte_t pfx, mod; + if (modrm->got) + return; + if (!insn->opcode.got) + insn_get_opcode(insn); + + if (inat_has_modrm(insn->attr)) { + mod = get_next(insn_byte_t, insn); + modrm->value = mod; + modrm->nbytes = 1; + if (inat_is_group(insn->attr)) { + pfx = insn_last_prefix(insn); + insn->attr = inat_get_group_attribute(mod, pfx, + insn->attr); + } + } + + if (insn->x86_64 && inat_is_force64(insn->attr)) + insn->opnd_bytes = 8; + modrm->got = 1; +} + + +/** + * insn_rip_relative() - Does instruction use RIP-relative addressing mode? + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * ModRM byte. No effect if @insn->x86_64 is 0. + */ +int insn_rip_relative(struct insn *insn) +{ + struct insn_field *modrm = &insn->modrm; + + if (!insn->x86_64) + return 0; + if (!modrm->got) + insn_get_modrm(insn); + /* + * For rip-relative instructions, the mod field (top 2 bits) + * is zero and the r/m field (bottom 3 bits) is 0x5. + */ + return (modrm->nbytes && (modrm->value & 0xc7) == 0x5); +} + +/** + * insn_get_sib() - Get the SIB byte of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * ModRM byte. + */ +void insn_get_sib(struct insn *insn) +{ + insn_byte_t modrm; + + if (insn->sib.got) + return; + if (!insn->modrm.got) + insn_get_modrm(insn); + if (insn->modrm.nbytes) { + modrm = (insn_byte_t)insn->modrm.value; + if (insn->addr_bytes != 2 && + X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) { + insn->sib.value = get_next(insn_byte_t, insn); + insn->sib.nbytes = 1; + } + } + insn->sib.got = 1; +} + + +/** + * insn_get_displacement() - Get the displacement of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * SIB byte. + * Displacement value is sign-expanded. + */ +void insn_get_displacement(struct insn *insn) +{ + insn_byte_t mod, rm, base; + + if (insn->displacement.got) + return; + if (!insn->sib.got) + insn_get_sib(insn); + if (insn->modrm.nbytes) { + /* + * Interpreting the modrm byte: + * mod = 00 - no displacement fields (exceptions below) + * mod = 01 - 1-byte displacement field + * mod = 10 - displacement field is 4 bytes, or 2 bytes if + * address size = 2 (0x67 prefix in 32-bit mode) + * mod = 11 - no memory operand + * + * If address size = 2... + * mod = 00, r/m = 110 - displacement field is 2 bytes + * + * If address size != 2... + * mod != 11, r/m = 100 - SIB byte exists + * mod = 00, SIB base = 101 - displacement field is 4 bytes + * mod = 00, r/m = 101 - rip-relative addressing, displacement + * field is 4 bytes + */ + mod = X86_MODRM_MOD(insn->modrm.value); + rm = X86_MODRM_RM(insn->modrm.value); + base = X86_SIB_BASE(insn->sib.value); + if (mod == 3) + goto out; + if (mod == 1) { + insn->displacement.value = get_next(char, insn); + insn->displacement.nbytes = 1; + } else if (insn->addr_bytes == 2) { + if ((mod == 0 && rm == 6) || mod == 2) { + insn->displacement.value = + get_next(short, insn); + insn->displacement.nbytes = 2; + } + } else { + if ((mod == 0 && rm == 5) || mod == 2 || + (mod == 0 && base == 5)) { + insn->displacement.value = get_next(int, insn); + insn->displacement.nbytes = 4; + } + } + } +out: + insn->displacement.got = 1; +} + +/* Decode moffset16/32/64 */ +static void __get_moffset(struct insn *insn) +{ + switch (insn->addr_bytes) { + case 2: + insn->moffset1.value = get_next(short, insn); + insn->moffset1.nbytes = 2; + break; + case 4: + insn->moffset1.value = get_next(int, insn); + insn->moffset1.nbytes = 4; + break; + case 8: + insn->moffset1.value = get_next(int, insn); + insn->moffset1.nbytes = 4; + insn->moffset2.value = get_next(int, insn); + insn->moffset2.nbytes = 4; + break; + } + insn->moffset1.got = insn->moffset2.got = 1; +} + +/* Decode imm v32(Iz) */ +static void __get_immv32(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate.value = get_next(short, insn); + insn->immediate.nbytes = 2; + break; + case 4: + case 8: + insn->immediate.value = get_next(int, insn); + insn->immediate.nbytes = 4; + break; + } +} + +/* Decode imm v64(Iv/Ov) */ +static void __get_immv(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate1.value = get_next(short, insn); + insn->immediate1.nbytes = 2; + break; + case 4: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + break; + case 8: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + insn->immediate2.value = get_next(int, insn); + insn->immediate2.nbytes = 4; + break; + } + insn->immediate1.got = insn->immediate2.got = 1; +} + +/* Decode ptr16:16/32(Ap) */ +static void __get_immptr(struct insn *insn) +{ + switch (insn->opnd_bytes) { + case 2: + insn->immediate1.value = get_next(short, insn); + insn->immediate1.nbytes = 2; + break; + case 4: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + break; + case 8: + /* ptr16:64 is not exist (no segment) */ + return; + } + insn->immediate2.value = get_next(unsigned short, insn); + insn->immediate2.nbytes = 2; + insn->immediate1.got = insn->immediate2.got = 1; +} + +/** + * insn_get_immediate() - Get the immediates of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * displacement bytes. + * Basically, most of immediates are sign-expanded. Unsigned-value can be + * get by bit masking with ((1 << (nbytes * 8)) - 1) + */ +void insn_get_immediate(struct insn *insn) +{ + if (insn->immediate.got) + return; + if (!insn->displacement.got) + insn_get_displacement(insn); + + if (inat_has_moffset(insn->attr)) { + __get_moffset(insn); + goto done; + } + + if (!inat_has_immediate(insn->attr)) + /* no immediates */ + goto done; + + switch (inat_immediate_size(insn->attr)) { + case INAT_IMM_BYTE: + insn->immediate.value = get_next(char, insn); + insn->immediate.nbytes = 1; + break; + case INAT_IMM_WORD: + insn->immediate.value = get_next(short, insn); + insn->immediate.nbytes = 2; + break; + case INAT_IMM_DWORD: + insn->immediate.value = get_next(int, insn); + insn->immediate.nbytes = 4; + break; + case INAT_IMM_QWORD: + insn->immediate1.value = get_next(int, insn); + insn->immediate1.nbytes = 4; + insn->immediate2.value = get_next(int, insn); + insn->immediate2.nbytes = 4; + break; + case INAT_IMM_PTR: + __get_immptr(insn); + break; + case INAT_IMM_VWORD32: + __get_immv32(insn); + break; + case INAT_IMM_VWORD: + __get_immv(insn); + break; + default: + break; + } + if (inat_has_second_immediate(insn->attr)) { + insn->immediate2.value = get_next(char, insn); + insn->immediate2.nbytes = 1; + } +done: + insn->immediate.got = 1; +} + +/** + * insn_get_length() - Get the length of instruction + * @insn: &struct insn containing instruction + * + * If necessary, first collects the instruction up to and including the + * immediates bytes. + */ +void insn_get_length(struct insn *insn) +{ + if (insn->length) + return; + if (!insn->immediate.got) + insn_get_immediate(insn); + insn->length = (unsigned char)((unsigned long)insn->next_byte + - (unsigned long)insn->kaddr); +} diff -uprN linux-2.6.32/arch/x86/lib/Makefile linux-2.6.32.ovz/arch/x86/lib/Makefile --- linux-2.6.32/arch/x86/lib/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/lib/Makefile 2015-03-10 13:23:53.000000000 -0700 @@ -2,14 +2,27 @@ # Makefile for x86 specific library files. # -obj-$(CONFIG_SMP) := msr.o +inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk +inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt +quiet_cmd_inat_tables = GEN $@ + cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ + +$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) + $(call cmd,inat_tables) + +$(obj)/inat.o: $(obj)/inat-tables.c + +clean-files := inat-tables.c + +obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o lib-y += thunk_$(BITS).o -lib-y += usercopy_$(BITS).o getuser.o putuser.o +lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o +lib-y += insn.o inat.o -obj-y += msr-reg.o msr-reg-export.o +obj-y += msr.o msr-reg.o msr-reg-export.o ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o @@ -26,4 +39,5 @@ else lib-y += thunk_64.o clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o + lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o endif diff -uprN linux-2.6.32/arch/x86/lib/memcpy_32.c linux-2.6.32.ovz/arch/x86/lib/memcpy_32.c --- linux-2.6.32/arch/x86/lib/memcpy_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/lib/memcpy_32.c 2015-03-10 13:22:35.000000000 -0700 @@ -22,22 +22,187 @@ EXPORT_SYMBOL(memset); void *memmove(void *dest, const void *src, size_t n) { - int d0, d1, d2; + int d0,d1,d2,d3,d4,d5; + char *ret = dest; + + __asm__ __volatile__( + /* Handle more 16bytes in loop */ + "cmp $0x10, %0\n\t" + "jb 1f\n\t" + + /* Decide forward/backward copy mode */ + "cmp %2, %1\n\t" + "jb 2f\n\t" + + /* + * movs instruction have many startup latency + * so we handle small size by general register. + */ + "cmp $680, %0\n\t" + "jb 3f\n\t" + /* + * movs instruction is only good for aligned case. + */ + "mov %1, %3\n\t" + "xor %2, %3\n\t" + "and $0xff, %3\n\t" + "jz 4f\n\t" + "3:\n\t" + "sub $0x10, %0\n\t" + + /* + * We gobble 16byts forward in each loop. + */ + "3:\n\t" + "sub $0x10, %0\n\t" + "mov 0*4(%1), %3\n\t" + "mov 1*4(%1), %4\n\t" + "mov %3, 0*4(%2)\n\t" + "mov %4, 1*4(%2)\n\t" + "mov 2*4(%1), %3\n\t" + "mov 3*4(%1), %4\n\t" + "mov %3, 2*4(%2)\n\t" + "mov %4, 3*4(%2)\n\t" + "lea 0x10(%1), %1\n\t" + "lea 0x10(%2), %2\n\t" + "jae 3b\n\t" + "add $0x10, %0\n\t" + "jmp 1f\n\t" + + /* + * Handle data forward by movs. + */ + ".p2align 4\n\t" + "4:\n\t" + "mov -4(%1, %0), %3\n\t" + "lea -4(%2, %0), %4\n\t" + "shr $2, %0\n\t" + "rep movsl\n\t" + "mov %3, (%4)\n\t" + "jmp 11f\n\t" + /* + * Handle data backward by movs. + */ + ".p2align 4\n\t" + "6:\n\t" + "mov (%1), %3\n\t" + "mov %2, %4\n\t" + "lea -4(%1, %0), %1\n\t" + "lea -4(%2, %0), %2\n\t" + "shr $2, %0\n\t" + "std\n\t" + "rep movsl\n\t" + "mov %3,(%4)\n\t" + "cld\n\t" + "jmp 11f\n\t" + + /* + * Start to prepare for backward copy. + */ + ".p2align 4\n\t" + "2:\n\t" + "cmp $680, %0\n\t" + "jb 5f\n\t" + "mov %1, %3\n\t" + "xor %2, %3\n\t" + "and $0xff, %3\n\t" + "jz 6b\n\t" + + /* + * Calculate copy position to tail. + */ + "5:\n\t" + "add %0, %1\n\t" + "add %0, %2\n\t" + "sub $0x10, %0\n\t" + + /* + * We gobble 16byts backward in each loop. + */ + "7:\n\t" + "sub $0x10, %0\n\t" + + "mov -1*4(%1), %3\n\t" + "mov -2*4(%1), %4\n\t" + "mov %3, -1*4(%2)\n\t" + "mov %4, -2*4(%2)\n\t" + "mov -3*4(%1), %3\n\t" + "mov -4*4(%1), %4\n\t" + "mov %3, -3*4(%2)\n\t" + "mov %4, -4*4(%2)\n\t" + "lea -0x10(%1), %1\n\t" + "lea -0x10(%2), %2\n\t" + "jae 7b\n\t" + /* + * Calculate copy position to head. + */ + "add $0x10, %0\n\t" + "sub %0, %1\n\t" + "sub %0, %2\n\t" + + /* + * Move data from 8 bytes to 15 bytes. + */ + ".p2align 4\n\t" + "1:\n\t" + "cmp $8, %0\n\t" + "jb 8f\n\t" + "mov 0*4(%1), %3\n\t" + "mov 1*4(%1), %4\n\t" + "mov -2*4(%1, %0), %5\n\t" + "mov -1*4(%1, %0), %1\n\t" + + "mov %3, 0*4(%2)\n\t" + "mov %4, 1*4(%2)\n\t" + "mov %5, -2*4(%2, %0)\n\t" + "mov %1, -1*4(%2, %0)\n\t" + "jmp 11f\n\t" + + /* + * Move data from 4 bytes to 7 bytes. + */ + ".p2align 4\n\t" + "8:\n\t" + "cmp $4, %0\n\t" + "jb 9f\n\t" + "mov 0*4(%1), %3\n\t" + "mov -1*4(%1, %0), %4\n\t" + "mov %3, 0*4(%2)\n\t" + "mov %4, -1*4(%2, %0)\n\t" + "jmp 11f\n\t" + + /* + * Move data from 2 bytes to 3 bytes. + */ + ".p2align 4\n\t" + "9:\n\t" + "cmp $2, %0\n\t" + "jb 10f\n\t" + "movw 0*2(%1), %%dx\n\t" + "movw -1*2(%1, %0), %%bx\n\t" + "movw %%dx, 0*2(%2)\n\t" + "movw %%bx, -1*2(%2, %0)\n\t" + "jmp 11f\n\t" + + /* + * Move data for 1 byte. + */ + ".p2align 4\n\t" + "10:\n\t" + "cmp $1, %0\n\t" + "jb 11f\n\t" + "movb (%1), %%cl\n\t" + "movb %%cl, (%2)\n\t" + ".p2align 4\n\t" + "11:" + : "=&c" (d0), "=&S" (d1), "=&D" (d2), + "=r" (d3),"=r" (d4), "=r"(d5) + :"0" (n), + "1" (src), + "2" (dest) + :"memory"); + + return ret; - if (dest < src) { - memcpy(dest, src, n); - } else { - __asm__ __volatile__( - "std\n\t" - "rep\n\t" - "movsb\n\t" - "cld" - : "=&c" (d0), "=&S" (d1), "=&D" (d2) - :"0" (n), - "1" (n-1+src), - "2" (n-1+dest) - :"memory"); - } - return dest; } EXPORT_SYMBOL(memmove); diff -uprN linux-2.6.32/arch/x86/lib/memcpy_64.S linux-2.6.32.ovz/arch/x86/lib/memcpy_64.S --- linux-2.6.32/arch/x86/lib/memcpy_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/lib/memcpy_64.S 2015-03-10 13:22:35.000000000 -0700 @@ -4,6 +4,7 @@ #include #include +#include /* * memcpy - Copy a memory block. @@ -20,12 +21,11 @@ /* * memcpy_c() - fast string ops (REP MOVSQ) based variant. * - * Calls to this get patched into the kernel image via the + * This gets patched over the unrolled variant (below) via the * alternative instructions framework: */ - ALIGN -memcpy_c: - CFI_STARTPROC + .section .altinstr_replacement, "ax", @progbits +.Lmemcpy_c: movq %rdi, %rax movl %edx, %ecx @@ -35,116 +35,176 @@ memcpy_c: movl %edx, %ecx rep movsb ret - CFI_ENDPROC -ENDPROC(memcpy_c) +.Lmemcpy_e: + .previous + +/* + * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than + * memcpy_c. Use memcpy_c_e when possible. + * + * This gets patched over the unrolled variant (below) via the + * alternative instructions framework: + */ + .section .altinstr_replacement, "ax", @progbits +.Lmemcpy_c_e: + movq %rdi, %rax + + movl %edx, %ecx + rep movsb + ret +.Lmemcpy_e_e: + .previous ENTRY(__memcpy) ENTRY(memcpy) CFI_STARTPROC + movq %rdi, %rax /* - * Put the number of full 64-byte blocks into %ecx. - * Tail portion is handled at the end: + * Use 32bit CMP here to avoid long NOP padding. */ - movq %rdi, %rax - movl %edx, %ecx - shrl $6, %ecx - jz .Lhandle_tail + cmp $0x20, %edx + jb .Lhandle_tail - .p2align 4 -.Lloop_64: /* - * We decrement the loop index here - and the zero-flag is - * checked at the end of the loop (instructions inbetween do - * not change the zero flag): + * We check whether memory false dependence could occur, + * then jump to corresponding copy mode. */ - decl %ecx + cmp %dil, %sil + jl .Lcopy_backward + subl $0x20, %edx +.Lcopy_forward_loop: + subq $0x20, %rdx /* - * Move in blocks of 4x16 bytes: + * Move in blocks of 4x8 bytes: */ - movq 0*8(%rsi), %r11 - movq 1*8(%rsi), %r8 - movq %r11, 0*8(%rdi) - movq %r8, 1*8(%rdi) - - movq 2*8(%rsi), %r9 - movq 3*8(%rsi), %r10 - movq %r9, 2*8(%rdi) - movq %r10, 3*8(%rdi) + movq 0*8(%rsi), %r8 + movq 1*8(%rsi), %r9 + movq 2*8(%rsi), %r10 + movq 3*8(%rsi), %r11 + leaq 4*8(%rsi), %rsi + + movq %r8, 0*8(%rdi) + movq %r9, 1*8(%rdi) + movq %r10, 2*8(%rdi) + movq %r11, 3*8(%rdi) + leaq 4*8(%rdi), %rdi + jae .Lcopy_forward_loop + addq $0x20, %rdx + jmp .Lhandle_tail - movq 4*8(%rsi), %r11 - movq 5*8(%rsi), %r8 - movq %r11, 4*8(%rdi) - movq %r8, 5*8(%rdi) - - movq 6*8(%rsi), %r9 - movq 7*8(%rsi), %r10 - movq %r9, 6*8(%rdi) - movq %r10, 7*8(%rdi) - - leaq 64(%rsi), %rsi - leaq 64(%rdi), %rdi - - jnz .Lloop_64 +.Lcopy_backward: + /* + * Calculate copy position to tail. + */ + addq %rdx, %rsi + addq %rdx, %rdi + subq $0x20, %rdx + /* + * At most 3 ALU operations in one cycle, + * so append NOPS in the same 16bytes trunk. + */ + .p2align 4 +.Lcopy_backward_loop: + subq $0x20, %rdx + movq -1*8(%rsi), %r8 + movq -2*8(%rsi), %r9 + movq -3*8(%rsi), %r10 + movq -4*8(%rsi), %r11 + leaq -4*8(%rsi), %rsi + movq %r8, -1*8(%rdi) + movq %r9, -2*8(%rdi) + movq %r10, -3*8(%rdi) + movq %r11, -4*8(%rdi) + leaq -4*8(%rdi), %rdi + jae .Lcopy_backward_loop + /* + * Calculate copy position to head. + */ + addq $0x20, %rdx + subq %rdx, %rsi + subq %rdx, %rdi .Lhandle_tail: - movl %edx, %ecx - andl $63, %ecx - shrl $3, %ecx - jz .Lhandle_7 + cmpq $16, %rdx + jb .Lless_16bytes + /* + * Move data from 16 bytes to 31 bytes. + */ + movq 0*8(%rsi), %r8 + movq 1*8(%rsi), %r9 + movq -2*8(%rsi, %rdx), %r10 + movq -1*8(%rsi, %rdx), %r11 + movq %r8, 0*8(%rdi) + movq %r9, 1*8(%rdi) + movq %r10, -2*8(%rdi, %rdx) + movq %r11, -1*8(%rdi, %rdx) + retq .p2align 4 -.Lloop_8: - decl %ecx - movq (%rsi), %r8 - movq %r8, (%rdi) - leaq 8(%rdi), %rdi - leaq 8(%rsi), %rsi - jnz .Lloop_8 - -.Lhandle_7: - movl %edx, %ecx - andl $7, %ecx - jz .Lend +.Lless_16bytes: + cmpq $8, %rdx + jb .Lless_8bytes + /* + * Move data from 8 bytes to 15 bytes. + */ + movq 0*8(%rsi), %r8 + movq -1*8(%rsi, %rdx), %r9 + movq %r8, 0*8(%rdi) + movq %r9, -1*8(%rdi, %rdx) + retq + .p2align 4 +.Lless_8bytes: + cmpq $4, %rdx + jb .Lless_3bytes + /* + * Move data from 4 bytes to 7 bytes. + */ + movl (%rsi), %ecx + movl -4(%rsi, %rdx), %r8d + movl %ecx, (%rdi) + movl %r8d, -4(%rdi, %rdx) + retq .p2align 4 +.Lless_3bytes: + cmpl $0, %edx + je .Lend + /* + * Move data from 1 bytes to 3 bytes. + */ .Lloop_1: movb (%rsi), %r8b movb %r8b, (%rdi) incq %rdi incq %rsi - decl %ecx + decl %edx jnz .Lloop_1 .Lend: - ret + retq CFI_ENDPROC ENDPROC(memcpy) ENDPROC(__memcpy) /* - * Some CPUs run faster using the string copy instructions. - * It is also a lot simpler. Use this when possible: - */ - - .section .altinstr_replacement, "ax" -1: .byte 0xeb /* jmp */ - .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */ -2: - .previous - - .section .altinstructions, "a" - .align 8 - .quad memcpy - .quad 1b - .byte X86_FEATURE_REP_GOOD - - /* + * Some CPUs are adding enhanced REP MOVSB/STOSB feature + * If the feature is supported, memcpy_c_e() is the first choice. + * If enhanced rep movsb copy is not available, use fast string copy + * memcpy_c() when possible. This is faster and code is simpler than + * original memcpy(). + * Otherwise, original memcpy() is used. + * In .altinstructions section, ERMS feature is placed after REG_GOOD + * feature to implement the right patch order. + * * Replace only beginning, memcpy is used to apply alternatives, * so it is silly to overwrite itself with nops - reboot is the * only outcome... */ - .byte 2b - 1b - .byte 2b - 1b + .section .altinstructions, "a" + altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ + .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c + altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ + .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e .previous diff -uprN linux-2.6.32/arch/x86/lib/memmove_64.c linux-2.6.32.ovz/arch/x86/lib/memmove_64.c --- linux-2.6.32/arch/x86/lib/memmove_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/lib/memmove_64.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,21 +0,0 @@ -/* Normally compiler builtins are used, but sometimes the compiler calls out - of line code. Based on asm-i386/string.h. - */ -#define _STRING_C -#include -#include - -#undef memmove -void *memmove(void *dest, const void *src, size_t count) -{ - if (dest < src) { - return memcpy(dest, src, count); - } else { - char *p = dest + count; - const char *s = src + count; - while (count--) - *--p = *--s; - } - return dest; -} -EXPORT_SYMBOL(memmove); diff -uprN linux-2.6.32/arch/x86/lib/memmove_64.S linux-2.6.32.ovz/arch/x86/lib/memmove_64.S --- linux-2.6.32/arch/x86/lib/memmove_64.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/lib/memmove_64.S 2015-03-10 13:22:35.000000000 -0700 @@ -0,0 +1,224 @@ +/* + * Normally compiler builtins are used, but sometimes the compiler calls out + * of line code. Based on asm-i386/string.h. + * + * This assembly file is re-written from memmove_64.c file. + * - Copyright 2011 Fenghua Yu + */ +#define _STRING_C +#include +#include +#include + +#undef memmove + +/* + * Implement memmove(). This can handle overlap between src and dst. + * + * Input: + * rdi: dest + * rsi: src + * rdx: count + * + * Output: + * rax: dest + */ +ENTRY(memmove) + CFI_STARTPROC + + /* Handle more 32bytes in loop */ + mov %rdi, %rax + cmp $0x20, %rdx + jb 1f + + /* Decide forward/backward copy mode */ + cmp %rdi, %rsi + jge .Lmemmove_begin_forward + mov %rsi, %r8 + add %rdx, %r8 + cmp %rdi, %r8 + jg 2f + +.Lmemmove_begin_forward: + /* + * movsq instruction have many startup latency + * so we handle small size by general register. + */ + cmp $680, %rdx + jb 3f + /* + * movsq instruction is only good for aligned case. + */ + + cmpb %dil, %sil + je 4f +3: + sub $0x20, %rdx + /* + * We gobble 32byts forward in each loop. + */ +5: + sub $0x20, %rdx + movq 0*8(%rsi), %r11 + movq 1*8(%rsi), %r10 + movq 2*8(%rsi), %r9 + movq 3*8(%rsi), %r8 + leaq 4*8(%rsi), %rsi + + movq %r11, 0*8(%rdi) + movq %r10, 1*8(%rdi) + movq %r9, 2*8(%rdi) + movq %r8, 3*8(%rdi) + leaq 4*8(%rdi), %rdi + jae 5b + addq $0x20, %rdx + jmp 1f + /* + * Handle data forward by movsq. + */ + .p2align 4 +4: + movq %rdx, %rcx + movq -8(%rsi, %rdx), %r11 + lea -8(%rdi, %rdx), %r10 + shrq $3, %rcx + rep movsq + movq %r11, (%r10) + jmp 13f +.Lmemmove_end_forward: + + /* + * Handle data backward by movsq. + */ + .p2align 4 +7: + movq %rdx, %rcx + movq (%rsi), %r11 + movq %rdi, %r10 + leaq -8(%rsi, %rdx), %rsi + leaq -8(%rdi, %rdx), %rdi + shrq $3, %rcx + std + rep movsq + cld + movq %r11, (%r10) + jmp 13f + + /* + * Start to prepare for backward copy. + */ + .p2align 4 +2: + cmp $680, %rdx + jb 6f + cmp %dil, %sil + je 7b +6: + /* + * Calculate copy position to tail. + */ + addq %rdx, %rsi + addq %rdx, %rdi + subq $0x20, %rdx + /* + * We gobble 32byts backward in each loop. + */ +8: + subq $0x20, %rdx + movq -1*8(%rsi), %r11 + movq -2*8(%rsi), %r10 + movq -3*8(%rsi), %r9 + movq -4*8(%rsi), %r8 + leaq -4*8(%rsi), %rsi + + movq %r11, -1*8(%rdi) + movq %r10, -2*8(%rdi) + movq %r9, -3*8(%rdi) + movq %r8, -4*8(%rdi) + leaq -4*8(%rdi), %rdi + jae 8b + /* + * Calculate copy position to head. + */ + addq $0x20, %rdx + subq %rdx, %rsi + subq %rdx, %rdi +1: + cmpq $16, %rdx + jb 9f + /* + * Move data from 16 bytes to 31 bytes. + */ + movq 0*8(%rsi), %r11 + movq 1*8(%rsi), %r10 + movq -2*8(%rsi, %rdx), %r9 + movq -1*8(%rsi, %rdx), %r8 + movq %r11, 0*8(%rdi) + movq %r10, 1*8(%rdi) + movq %r9, -2*8(%rdi, %rdx) + movq %r8, -1*8(%rdi, %rdx) + jmp 13f + .p2align 4 +9: + cmpq $8, %rdx + jb 10f + /* + * Move data from 8 bytes to 15 bytes. + */ + movq 0*8(%rsi), %r11 + movq -1*8(%rsi, %rdx), %r10 + movq %r11, 0*8(%rdi) + movq %r10, -1*8(%rdi, %rdx) + jmp 13f +10: + cmpq $4, %rdx + jb 11f + /* + * Move data from 4 bytes to 7 bytes. + */ + movl (%rsi), %r11d + movl -4(%rsi, %rdx), %r10d + movl %r11d, (%rdi) + movl %r10d, -4(%rdi, %rdx) + jmp 13f +11: + cmp $2, %rdx + jb 12f + /* + * Move data from 2 bytes to 3 bytes. + */ + movw (%rsi), %r11w + movw -2(%rsi, %rdx), %r10w + movw %r11w, (%rdi) + movw %r10w, -2(%rdi, %rdx) + jmp 13f +12: + cmp $1, %rdx + jb 13f + /* + * Move data for 1 byte. + */ + movb (%rsi), %r11b + movb %r11b, (%rdi) +13: + retq + CFI_ENDPROC + + .section .altinstr_replacement,"ax" +.Lmemmove_begin_forward_efs: + /* Forward moving data. */ + movq %rdx, %rcx + rep movsb + retq +.Lmemmove_end_forward_efs: + .previous + + .section .altinstructions,"a" + .align 8 + .quad .Lmemmove_begin_forward + .quad .Lmemmove_begin_forward_efs + .word X86_FEATURE_ERMS + .byte .Lmemmove_end_forward-.Lmemmove_begin_forward + .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs + .previous +ENDPROC(memmove) diff -uprN linux-2.6.32/arch/x86/lib/memset_64.S linux-2.6.32.ovz/arch/x86/lib/memset_64.S --- linux-2.6.32/arch/x86/lib/memset_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/lib/memset_64.S 2015-03-10 13:22:35.000000000 -0700 @@ -2,9 +2,13 @@ #include #include +#include +#include /* - * ISO C memset - set a memory block to a byte value. + * ISO C memset - set a memory block to a byte value. This function uses fast + * string to get better performance than the original function. The code is + * simpler and shorter than the orignal function as well. * * rdi destination * rsi value (char) @@ -12,9 +16,8 @@ * * rax original destination */ - ALIGN -memset_c: - CFI_STARTPROC + .section .altinstr_replacement, "ax", @progbits +.Lmemset_c: movq %rdi,%r9 movl %edx,%r8d andl $7,%r8d @@ -29,8 +32,30 @@ memset_c: rep stosb movq %r9,%rax ret - CFI_ENDPROC -ENDPROC(memset_c) +.Lmemset_e: + .previous + +/* + * ISO C memset - set a memory block to a byte value. This function uses + * enhanced rep stosb to override the fast string function. + * The code is simpler and shorter than the fast string function as well. + * + * rdi destination + * rsi value (char) + * rdx count (bytes) + * + * rax original destination + */ + .section .altinstr_replacement, "ax", @progbits +.Lmemset_c_e: + movq %rdi,%r9 + movb %sil,%al + movl %edx,%ecx + rep stosb + movq %r9,%rax + ret +.Lmemset_e_e: + .previous ENTRY(memset) ENTRY(__memset) @@ -113,21 +138,20 @@ ENTRY(__memset) ENDPROC(memset) ENDPROC(__memset) - /* Some CPUs run faster using the string instructions. - It is also a lot simpler. Use this when possible */ - -#include - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp */ - .byte (memset_c - memset) - (2f - 1b) /* offset */ -2: - .previous + /* Some CPUs support enhanced REP MOVSB/STOSB feature. + * It is recommended to use this when possible. + * + * If enhanced REP MOVSB/STOSB feature is not available, use fast string + * instructions. + * + * Otherwise, use original memset function. + * + * In .altinstructions section, ERMS feature is placed after REG_GOOD + * feature to implement the right patch order. + */ .section .altinstructions,"a" - .align 8 - .quad memset - .quad 1b - .byte X86_FEATURE_REP_GOOD - .byte .Lfinal - memset - .byte 2b - 1b + altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ + .Lfinal-memset,.Lmemset_e-.Lmemset_c + altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ + .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e .previous diff -uprN linux-2.6.32/arch/x86/lib/msr.c linux-2.6.32.ovz/arch/x86/lib/msr.c --- linux-2.6.32/arch/x86/lib/msr.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/lib/msr.c 2015-03-10 13:21:07.000000000 -0700 @@ -1,226 +1,23 @@ #include #include -#include #include -struct msr_info { - u32 msr_no; - struct msr reg; - struct msr *msrs; - int off; - int err; -}; - -static void __rdmsr_on_cpu(void *info) -{ - struct msr_info *rv = info; - struct msr *reg; - int this_cpu = raw_smp_processor_id(); - - if (rv->msrs) - reg = &rv->msrs[this_cpu - rv->off]; - else - reg = &rv->reg; - - rdmsr(rv->msr_no, reg->l, reg->h); -} - -static void __wrmsr_on_cpu(void *info) -{ - struct msr_info *rv = info; - struct msr *reg; - int this_cpu = raw_smp_processor_id(); - - if (rv->msrs) - reg = &rv->msrs[this_cpu - rv->off]; - else - reg = &rv->reg; - - wrmsr(rv->msr_no, reg->l, reg->h); -} - -int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) -{ - int err; - struct msr_info rv; - - memset(&rv, 0, sizeof(rv)); - - rv.msr_no = msr_no; - err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); - *l = rv.reg.l; - *h = rv.reg.h; - - return err; -} -EXPORT_SYMBOL(rdmsr_on_cpu); - -int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) -{ - int err; - struct msr_info rv; - - memset(&rv, 0, sizeof(rv)); - - rv.msr_no = msr_no; - rv.reg.l = l; - rv.reg.h = h; - err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); - - return err; -} -EXPORT_SYMBOL(wrmsr_on_cpu); - -/* rdmsr on a bunch of CPUs - * - * @mask: which CPUs - * @msr_no: which MSR - * @msrs: array of MSR values - * - */ -void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) -{ - struct msr_info rv; - int this_cpu; - - memset(&rv, 0, sizeof(rv)); - - rv.off = cpumask_first(mask); - rv.msrs = msrs; - rv.msr_no = msr_no; - - this_cpu = get_cpu(); - - if (cpumask_test_cpu(this_cpu, mask)) - __rdmsr_on_cpu(&rv); - - smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1); - put_cpu(); -} -EXPORT_SYMBOL(rdmsr_on_cpus); - -/* - * wrmsr on a bunch of CPUs - * - * @mask: which CPUs - * @msr_no: which MSR - * @msrs: array of MSR values - * - */ -void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs) -{ - struct msr_info rv; - int this_cpu; - - memset(&rv, 0, sizeof(rv)); - - rv.off = cpumask_first(mask); - rv.msrs = msrs; - rv.msr_no = msr_no; - - this_cpu = get_cpu(); - - if (cpumask_test_cpu(this_cpu, mask)) - __wrmsr_on_cpu(&rv); - - smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1); - put_cpu(); -} -EXPORT_SYMBOL(wrmsr_on_cpus); - -/* These "safe" variants are slower and should be used when the target MSR - may not actually exist. */ -static void __rdmsr_safe_on_cpu(void *info) -{ - struct msr_info *rv = info; - - rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h); -} - -static void __wrmsr_safe_on_cpu(void *info) -{ - struct msr_info *rv = info; - - rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h); -} - -int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) +struct msr *msrs_alloc(void) { - int err; - struct msr_info rv; + struct msr *msrs = NULL; - memset(&rv, 0, sizeof(rv)); + msrs = alloc_percpu(struct msr); + if (!msrs) { + pr_warning("%s: error allocating msrs\n", __func__); + return NULL; + } - rv.msr_no = msr_no; - err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); - *l = rv.reg.l; - *h = rv.reg.h; - - return err ? err : rv.err; + return msrs; } -EXPORT_SYMBOL(rdmsr_safe_on_cpu); +EXPORT_SYMBOL(msrs_alloc); -int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +void msrs_free(struct msr *msrs) { - int err; - struct msr_info rv; - - memset(&rv, 0, sizeof(rv)); - - rv.msr_no = msr_no; - rv.reg.l = l; - rv.reg.h = h; - err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); - - return err ? err : rv.err; -} -EXPORT_SYMBOL(wrmsr_safe_on_cpu); - -/* - * These variants are significantly slower, but allows control over - * the entire 32-bit GPR set. - */ -struct msr_regs_info { - u32 *regs; - int err; -}; - -static void __rdmsr_safe_regs_on_cpu(void *info) -{ - struct msr_regs_info *rv = info; - - rv->err = rdmsr_safe_regs(rv->regs); -} - -static void __wrmsr_safe_regs_on_cpu(void *info) -{ - struct msr_regs_info *rv = info; - - rv->err = wrmsr_safe_regs(rv->regs); -} - -int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) -{ - int err; - struct msr_regs_info rv; - - rv.regs = regs; - rv.err = -EIO; - err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1); - - return err ? err : rv.err; -} -EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); - -int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) -{ - int err; - struct msr_regs_info rv; - - rv.regs = regs; - rv.err = -EIO; - err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1); - - return err ? err : rv.err; + free_percpu(msrs); } -EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu); +EXPORT_SYMBOL(msrs_free); diff -uprN linux-2.6.32/arch/x86/lib/msr-smp.c linux-2.6.32.ovz/arch/x86/lib/msr-smp.c --- linux-2.6.32/arch/x86/lib/msr-smp.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/lib/msr-smp.c 2015-03-10 13:24:08.000000000 -0700 @@ -0,0 +1,266 @@ +#include +#include +#include +#include + +static void __rdmsr_on_cpu(void *info) +{ + struct msr_info *rv = info; + struct msr *reg; + int this_cpu = raw_smp_processor_id(); + + if (rv->msrs) + reg = per_cpu_ptr(rv->msrs, this_cpu); + else + reg = &rv->reg; + + rdmsr(rv->msr_no, reg->l, reg->h); +} + +static void __wrmsr_on_cpu(void *info) +{ + struct msr_info *rv = info; + struct msr *reg; + int this_cpu = raw_smp_processor_id(); + + if (rv->msrs) + reg = per_cpu_ptr(rv->msrs, this_cpu); + else + reg = &rv->reg; + + wrmsr(rv->msr_no, reg->l, reg->h); +} + +int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); + *l = rv.reg.l; + *h = rv.reg.h; + + return err; +} +EXPORT_SYMBOL(rdmsr_on_cpu); + +int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); + *q = rv.reg.q; + + return err; +} +EXPORT_SYMBOL(rdmsrl_on_cpu); + +int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + rv.reg.l = l; + rv.reg.h = h; + err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); + + return err; +} +EXPORT_SYMBOL(wrmsr_on_cpu); + +int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + rv.reg.q = q; + + err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); + + return err; +} +EXPORT_SYMBOL(wrmsrl_on_cpu); + +static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no, + struct msr *msrs, + void (*msr_func) (void *info)) +{ + struct msr_info rv; + int this_cpu; + + memset(&rv, 0, sizeof(rv)); + + rv.msrs = msrs; + rv.msr_no = msr_no; + + this_cpu = get_cpu(); + + if (cpumask_test_cpu(this_cpu, mask)) + msr_func(&rv); + + smp_call_function_many(mask, msr_func, &rv, 1); + put_cpu(); +} + +/* rdmsr on a bunch of CPUs + * + * @mask: which CPUs + * @msr_no: which MSR + * @msrs: array of MSR values + * + */ +void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) +{ + __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu); +} +EXPORT_SYMBOL(rdmsr_on_cpus); + +/* + * wrmsr on a bunch of CPUs + * + * @mask: which CPUs + * @msr_no: which MSR + * @msrs: array of MSR values + * + */ +void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs) +{ + __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu); +} +EXPORT_SYMBOL(wrmsr_on_cpus); + +/* These "safe" variants are slower and should be used when the target MSR + may not actually exist. */ +static void __rdmsr_safe_on_cpu(void *info) +{ + struct msr_info *rv = info; + + rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h); +} + +static void __wrmsr_safe_on_cpu(void *info) +{ + struct msr_info *rv = info; + + rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h); +} + +int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); + *l = rv.reg.l; + *h = rv.reg.h; + + return err ? err : rv.err; +} +EXPORT_SYMBOL(rdmsr_safe_on_cpu); + +int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + rv.reg.l = l; + rv.reg.h = h; + err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(wrmsr_safe_on_cpu); + +int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + rv.reg.q = q; + + err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(wrmsrl_safe_on_cpu); + +int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); + *q = rv.reg.q; + + return err ? err : rv.err; +} +EXPORT_SYMBOL(rdmsrl_safe_on_cpu); + +/* + * These variants are significantly slower, but allows control over + * the entire 32-bit GPR set. + */ +static void __rdmsr_safe_regs_on_cpu(void *info) +{ + struct msr_regs_info *rv = info; + + rv->err = rdmsr_safe_regs(rv->regs); +} + +static void __wrmsr_safe_regs_on_cpu(void *info) +{ + struct msr_regs_info *rv = info; + + rv->err = wrmsr_safe_regs(rv->regs); +} + +int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +{ + int err; + struct msr_regs_info rv; + + rv.regs = regs; + rv.err = -EIO; + err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu); + +int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs) +{ + int err; + struct msr_regs_info rv; + + rv.regs = regs; + rv.err = -EIO; + err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu); diff -uprN linux-2.6.32/arch/x86/lib/rwsem_64.S linux-2.6.32.ovz/arch/x86/lib/rwsem_64.S --- linux-2.6.32/arch/x86/lib/rwsem_64.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/lib/rwsem_64.S 2015-03-10 13:21:14.000000000 -0700 @@ -0,0 +1,81 @@ +/* + * x86-64 rwsem wrappers + * + * This interfaces the inline asm code to the slow-path + * C routines. We need to save the call-clobbered regs + * that the asm does not mark as clobbered, and move the + * argument from %rax to %rdi. + * + * NOTE! We don't need to save %rax, because the functions + * will always return the semaphore pointer in %rax (which + * is also the input argument to these helpers) + * + * The following can clobber %rdx because the asm clobbers it: + * call_rwsem_down_write_failed + * call_rwsem_wake + * but %rdi, %rsi, %rcx, %r8-r11 always need saving. + */ + +#include +#include +#include +#include +#include + +#define save_common_regs \ + pushq %rdi; \ + pushq %rsi; \ + pushq %rcx; \ + pushq %r8; \ + pushq %r9; \ + pushq %r10; \ + pushq %r11 + +#define restore_common_regs \ + popq %r11; \ + popq %r10; \ + popq %r9; \ + popq %r8; \ + popq %rcx; \ + popq %rsi; \ + popq %rdi + +/* Fix up special calling conventions */ +ENTRY(call_rwsem_down_read_failed) + save_common_regs + pushq %rdx + movq %rax,%rdi + call rwsem_down_read_failed + popq %rdx + restore_common_regs + ret + ENDPROC(call_rwsem_down_read_failed) + +ENTRY(call_rwsem_down_write_failed) + save_common_regs + movq %rax,%rdi + call rwsem_down_write_failed + restore_common_regs + ret + ENDPROC(call_rwsem_down_write_failed) + +ENTRY(call_rwsem_wake) + decw %dx /* do nothing if still outstanding active readers */ + jnz 1f + save_common_regs + movq %rax,%rdi + call rwsem_wake + restore_common_regs +1: ret + ENDPROC(call_rwsem_wake) + +/* Fix up special calling conventions */ +ENTRY(call_rwsem_downgrade_wake) + save_common_regs + pushq %rdx + movq %rax,%rdi + call rwsem_downgrade_wake + popq %rdx + restore_common_regs + ret + ENDPROC(call_rwsem_downgrade_wake) diff -uprN linux-2.6.32/arch/x86/lib/usercopy.c linux-2.6.32.ovz/arch/x86/lib/usercopy.c --- linux-2.6.32/arch/x86/lib/usercopy.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/lib/usercopy.c 2015-03-10 13:23:53.000000000 -0700 @@ -0,0 +1,44 @@ +/* + * User address space access functions. + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include + +/* + * best effort, GUP based copy_from_user() that assumes IRQ or NMI context + */ +unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n) +{ + unsigned long offset, addr = (unsigned long)from; + int type = in_nmi() ? KM_NMI : KM_IRQ0; + unsigned long size, len = 0; + struct page *page; + void *map; + int ret; + + do { + ret = __get_user_pages_fast(addr, 1, 0, &page); + if (!ret) + break; + + offset = addr & (PAGE_SIZE - 1); + size = min(PAGE_SIZE - offset, n - len); + + map = kmap_atomic(page, type); + memcpy(to, map+offset, size); + kunmap_atomic(map, type); + put_page(page); + + len += size; + to += size; + addr += size; + + } while (len < n); + + return len; +} diff -uprN linux-2.6.32/arch/x86/lib/x86-opcode-map.txt linux-2.6.32.ovz/arch/x86/lib/x86-opcode-map.txt --- linux-2.6.32/arch/x86/lib/x86-opcode-map.txt 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/lib/x86-opcode-map.txt 2015-03-10 13:21:31.000000000 -0700 @@ -0,0 +1,893 @@ +# x86 Opcode Maps +# +# +# Table: table-name +# Referrer: escaped-name +# AVXcode: avx-code +# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] +# (or) +# opcode: escape # escaped-name +# EndTable +# +# +# GrpTable: GrpXXX +# reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...] +# EndTable +# +# AVX Superscripts +# (VEX): this opcode can accept VEX prefix. +# (oVEX): this opcode requires VEX prefix. +# (o128): this opcode only supports 128bit VEX. +# (o256): this opcode only supports 256bit VEX. +# + +Table: one byte opcode +Referrer: +AVXcode: +# 0x00 - 0x0f +00: ADD Eb,Gb +01: ADD Ev,Gv +02: ADD Gb,Eb +03: ADD Gv,Ev +04: ADD AL,Ib +05: ADD rAX,Iz +06: PUSH ES (i64) +07: POP ES (i64) +08: OR Eb,Gb +09: OR Ev,Gv +0a: OR Gb,Eb +0b: OR Gv,Ev +0c: OR AL,Ib +0d: OR rAX,Iz +0e: PUSH CS (i64) +0f: escape # 2-byte escape +# 0x10 - 0x1f +10: ADC Eb,Gb +11: ADC Ev,Gv +12: ADC Gb,Eb +13: ADC Gv,Ev +14: ADC AL,Ib +15: ADC rAX,Iz +16: PUSH SS (i64) +17: POP SS (i64) +18: SBB Eb,Gb +19: SBB Ev,Gv +1a: SBB Gb,Eb +1b: SBB Gv,Ev +1c: SBB AL,Ib +1d: SBB rAX,Iz +1e: PUSH DS (i64) +1f: POP DS (i64) +# 0x20 - 0x2f +20: AND Eb,Gb +21: AND Ev,Gv +22: AND Gb,Eb +23: AND Gv,Ev +24: AND AL,Ib +25: AND rAx,Iz +26: SEG=ES (Prefix) +27: DAA (i64) +28: SUB Eb,Gb +29: SUB Ev,Gv +2a: SUB Gb,Eb +2b: SUB Gv,Ev +2c: SUB AL,Ib +2d: SUB rAX,Iz +2e: SEG=CS (Prefix) +2f: DAS (i64) +# 0x30 - 0x3f +30: XOR Eb,Gb +31: XOR Ev,Gv +32: XOR Gb,Eb +33: XOR Gv,Ev +34: XOR AL,Ib +35: XOR rAX,Iz +36: SEG=SS (Prefix) +37: AAA (i64) +38: CMP Eb,Gb +39: CMP Ev,Gv +3a: CMP Gb,Eb +3b: CMP Gv,Ev +3c: CMP AL,Ib +3d: CMP rAX,Iz +3e: SEG=DS (Prefix) +3f: AAS (i64) +# 0x40 - 0x4f +40: INC eAX (i64) | REX (o64) +41: INC eCX (i64) | REX.B (o64) +42: INC eDX (i64) | REX.X (o64) +43: INC eBX (i64) | REX.XB (o64) +44: INC eSP (i64) | REX.R (o64) +45: INC eBP (i64) | REX.RB (o64) +46: INC eSI (i64) | REX.RX (o64) +47: INC eDI (i64) | REX.RXB (o64) +48: DEC eAX (i64) | REX.W (o64) +49: DEC eCX (i64) | REX.WB (o64) +4a: DEC eDX (i64) | REX.WX (o64) +4b: DEC eBX (i64) | REX.WXB (o64) +4c: DEC eSP (i64) | REX.WR (o64) +4d: DEC eBP (i64) | REX.WRB (o64) +4e: DEC eSI (i64) | REX.WRX (o64) +4f: DEC eDI (i64) | REX.WRXB (o64) +# 0x50 - 0x5f +50: PUSH rAX/r8 (d64) +51: PUSH rCX/r9 (d64) +52: PUSH rDX/r10 (d64) +53: PUSH rBX/r11 (d64) +54: PUSH rSP/r12 (d64) +55: PUSH rBP/r13 (d64) +56: PUSH rSI/r14 (d64) +57: PUSH rDI/r15 (d64) +58: POP rAX/r8 (d64) +59: POP rCX/r9 (d64) +5a: POP rDX/r10 (d64) +5b: POP rBX/r11 (d64) +5c: POP rSP/r12 (d64) +5d: POP rBP/r13 (d64) +5e: POP rSI/r14 (d64) +5f: POP rDI/r15 (d64) +# 0x60 - 0x6f +60: PUSHA/PUSHAD (i64) +61: POPA/POPAD (i64) +62: BOUND Gv,Ma (i64) +63: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64) +64: SEG=FS (Prefix) +65: SEG=GS (Prefix) +66: Operand-Size (Prefix) +67: Address-Size (Prefix) +68: PUSH Iz (d64) +69: IMUL Gv,Ev,Iz +6a: PUSH Ib (d64) +6b: IMUL Gv,Ev,Ib +6c: INS/INSB Yb,DX +6d: INS/INSW/INSD Yz,DX +6e: OUTS/OUTSB DX,Xb +6f: OUTS/OUTSW/OUTSD DX,Xz +# 0x70 - 0x7f +70: JO Jb +71: JNO Jb +72: JB/JNAE/JC Jb +73: JNB/JAE/JNC Jb +74: JZ/JE Jb +75: JNZ/JNE Jb +76: JBE/JNA Jb +77: JNBE/JA Jb +78: JS Jb +79: JNS Jb +7a: JP/JPE Jb +7b: JNP/JPO Jb +7c: JL/JNGE Jb +7d: JNL/JGE Jb +7e: JLE/JNG Jb +7f: JNLE/JG Jb +# 0x80 - 0x8f +80: Grp1 Eb,Ib (1A) +81: Grp1 Ev,Iz (1A) +82: Grp1 Eb,Ib (1A),(i64) +83: Grp1 Ev,Ib (1A) +84: TEST Eb,Gb +85: TEST Ev,Gv +86: XCHG Eb,Gb +87: XCHG Ev,Gv +88: MOV Eb,Gb +89: MOV Ev,Gv +8a: MOV Gb,Eb +8b: MOV Gv,Ev +8c: MOV Ev,Sw +8d: LEA Gv,M +8e: MOV Sw,Ew +8f: Grp1A (1A) | POP Ev (d64) +# 0x90 - 0x9f +90: NOP | PAUSE (F3) | XCHG r8,rAX +91: XCHG rCX/r9,rAX +92: XCHG rDX/r10,rAX +93: XCHG rBX/r11,rAX +94: XCHG rSP/r12,rAX +95: XCHG rBP/r13,rAX +96: XCHG rSI/r14,rAX +97: XCHG rDI/r15,rAX +98: CBW/CWDE/CDQE +99: CWD/CDQ/CQO +9a: CALLF Ap (i64) +9b: FWAIT/WAIT +9c: PUSHF/D/Q Fv (d64) +9d: POPF/D/Q Fv (d64) +9e: SAHF +9f: LAHF +# 0xa0 - 0xaf +a0: MOV AL,Ob +a1: MOV rAX,Ov +a2: MOV Ob,AL +a3: MOV Ov,rAX +a4: MOVS/B Xb,Yb +a5: MOVS/W/D/Q Xv,Yv +a6: CMPS/B Xb,Yb +a7: CMPS/W/D Xv,Yv +a8: TEST AL,Ib +a9: TEST rAX,Iz +aa: STOS/B Yb,AL +ab: STOS/W/D/Q Yv,rAX +ac: LODS/B AL,Xb +ad: LODS/W/D/Q rAX,Xv +ae: SCAS/B AL,Yb +af: SCAS/W/D/Q rAX,Xv +# 0xb0 - 0xbf +b0: MOV AL/R8L,Ib +b1: MOV CL/R9L,Ib +b2: MOV DL/R10L,Ib +b3: MOV BL/R11L,Ib +b4: MOV AH/R12L,Ib +b5: MOV CH/R13L,Ib +b6: MOV DH/R14L,Ib +b7: MOV BH/R15L,Ib +b8: MOV rAX/r8,Iv +b9: MOV rCX/r9,Iv +ba: MOV rDX/r10,Iv +bb: MOV rBX/r11,Iv +bc: MOV rSP/r12,Iv +bd: MOV rBP/r13,Iv +be: MOV rSI/r14,Iv +bf: MOV rDI/r15,Iv +# 0xc0 - 0xcf +c0: Grp2 Eb,Ib (1A) +c1: Grp2 Ev,Ib (1A) +c2: RETN Iw (f64) +c3: RETN +c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix) +c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix) +c6: Grp11 Eb,Ib (1A) +c7: Grp11 Ev,Iz (1A) +c8: ENTER Iw,Ib +c9: LEAVE (d64) +ca: RETF Iw +cb: RETF +cc: INT3 +cd: INT Ib +ce: INTO (i64) +cf: IRET/D/Q +# 0xd0 - 0xdf +d0: Grp2 Eb,1 (1A) +d1: Grp2 Ev,1 (1A) +d2: Grp2 Eb,CL (1A) +d3: Grp2 Ev,CL (1A) +d4: AAM Ib (i64) +d5: AAD Ib (i64) +d6: +d7: XLAT/XLATB +d8: ESC +d9: ESC +da: ESC +db: ESC +dc: ESC +dd: ESC +de: ESC +df: ESC +# 0xe0 - 0xef +e0: LOOPNE/LOOPNZ Jb (f64) +e1: LOOPE/LOOPZ Jb (f64) +e2: LOOP Jb (f64) +e3: JrCXZ Jb (f64) +e4: IN AL,Ib +e5: IN eAX,Ib +e6: OUT Ib,AL +e7: OUT Ib,eAX +e8: CALL Jz (f64) +e9: JMP-near Jz (f64) +ea: JMP-far Ap (i64) +eb: JMP-short Jb (f64) +ec: IN AL,DX +ed: IN eAX,DX +ee: OUT DX,AL +ef: OUT DX,eAX +# 0xf0 - 0xff +f0: LOCK (Prefix) +f1: +f2: REPNE (Prefix) +f3: REP/REPE (Prefix) +f4: HLT +f5: CMC +f6: Grp3_1 Eb (1A) +f7: Grp3_2 Ev (1A) +f8: CLC +f9: STC +fa: CLI +fb: STI +fc: CLD +fd: STD +fe: Grp4 (1A) +ff: Grp5 (1A) +EndTable + +Table: 2-byte opcode (0x0f) +Referrer: 2-byte escape +AVXcode: 1 +# 0x0f 0x00-0x0f +00: Grp6 (1A) +01: Grp7 (1A) +02: LAR Gv,Ew +03: LSL Gv,Ew +04: +05: SYSCALL (o64) +06: CLTS +07: SYSRET (o64) +08: INVD +09: WBINVD +0a: +0b: UD2 (1B) +0c: +0d: NOP Ev | GrpP +0e: FEMMS +# 3DNow! uses the last imm byte as opcode extension. +0f: 3DNow! Pq,Qq,Ib +# 0x0f 0x10-0x1f +10: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128) +11: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128) +12: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX) +13: mpvlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128) +14: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX) +15: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX) +16: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlsps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX) +17: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128) +18: Grp16 (1A) +19: +1a: +1b: +1c: +1d: +1e: +1f: NOP Ev +# 0x0f 0x20-0x2f +20: MOV Rd,Cd +21: MOV Rd,Dd +22: MOV Cd,Rd +23: MOV Dd,Rd +24: +25: +26: +27: +28: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX) +29: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX) +2a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128) +2b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX) +2c: cvttps2pi Ppi,Wps | cvttss2si Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128) +2d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128) +2e: ucomiss Vss,Wss (VEX),(o128) | ucomisd Vsd,Wsd (66),(VEX),(o128) +2f: comiss Vss,Wss (VEX),(o128) | comisd Vsd,Wsd (66),(VEX),(o128) +# 0x0f 0x30-0x3f +30: WRMSR +31: RDTSC +32: RDMSR +33: RDPMC +34: SYSENTER +35: SYSEXIT +36: +37: GETSEC +38: escape # 3-byte escape 1 +39: +3a: escape # 3-byte escape 2 +3b: +3c: +3d: +3e: +3f: +# 0x0f 0x40-0x4f +40: CMOVO Gv,Ev +41: CMOVNO Gv,Ev +42: CMOVB/C/NAE Gv,Ev +43: CMOVAE/NB/NC Gv,Ev +44: CMOVE/Z Gv,Ev +45: CMOVNE/NZ Gv,Ev +46: CMOVBE/NA Gv,Ev +47: CMOVA/NBE Gv,Ev +48: CMOVS Gv,Ev +49: CMOVNS Gv,Ev +4a: CMOVP/PE Gv,Ev +4b: CMOVNP/PO Gv,Ev +4c: CMOVL/NGE Gv,Ev +4d: CMOVNL/GE Gv,Ev +4e: CMOVLE/NG Gv,Ev +4f: CMOVNLE/G Gv,Ev +# 0x0f 0x50-0x5f +50: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX) +51: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128) +52: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128) +53: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128) +54: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX) +55: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX) +56: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX) +57: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX) +58: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128) +59: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128) +5a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128) +5b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX) +5c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128) +5d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128) +5e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128) +5f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128) +# 0x0f 0x60-0x6f +60: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128) +61: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128) +62: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128) +63: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128) +64: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128) +65: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128) +66: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128) +67: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128) +68: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128) +69: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128) +6a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128) +6b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128) +6c: punpcklqdq Vdq,Wdq (66),(VEX),(o128) +6d: punpckhqdq Vdq,Wdq (66),(VEX),(o128) +6e: movd/q/ Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128) +6f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX) +# 0x0f 0x70-0x7f +70: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw VdqWdq,Ib (F2),(VEX),(o128) +71: Grp12 (1A) +72: Grp13 (1A) +73: Grp14 (1A) +74: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128) +75: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128) +76: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128) +77: emms/vzeroupper/vzeroall (VEX) +78: VMREAD Ed/q,Gd/q +79: VMWRITE Gd/q,Ed/q +7a: +7b: +7c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX) +7d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX) +7e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128) +7f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX) +# 0x0f 0x80-0x8f +80: JO Jz (f64) +81: JNO Jz (f64) +82: JB/JNAE/JC Jz (f64) +83: JNB/JAE/JNC Jz (f64) +84: JZ/JE Jz (f64) +85: JNZ/JNE Jz (f64) +86: JBE/JNA Jz (f64) +87: JNBE/JA Jz (f64) +88: JS Jz (f64) +89: JNS Jz (f64) +8a: JP/JPE Jz (f64) +8b: JNP/JPO Jz (f64) +8c: JL/JNGE Jz (f64) +8d: JNL/JGE Jz (f64) +8e: JLE/JNG Jz (f64) +8f: JNLE/JG Jz (f64) +# 0x0f 0x90-0x9f +90: SETO Eb +91: SETNO Eb +92: SETB/C/NAE Eb +93: SETAE/NB/NC Eb +94: SETE/Z Eb +95: SETNE/NZ Eb +96: SETBE/NA Eb +97: SETA/NBE Eb +98: SETS Eb +99: SETNS Eb +9a: SETP/PE Eb +9b: SETNP/PO Eb +9c: SETL/NGE Eb +9d: SETNL/GE Eb +9e: SETLE/NG Eb +9f: SETNLE/G Eb +# 0x0f 0xa0-0xaf +a0: PUSH FS (d64) +a1: POP FS (d64) +a2: CPUID +a3: BT Ev,Gv +a4: SHLD Ev,Gv,Ib +a5: SHLD Ev,Gv,CL +a6: GrpPDLK +a7: GrpRNG +a8: PUSH GS (d64) +a9: POP GS (d64) +aa: RSM +ab: BTS Ev,Gv +ac: SHRD Ev,Gv,Ib +ad: SHRD Ev,Gv,CL +ae: Grp15 (1A),(1C) +af: IMUL Gv,Ev +# 0x0f 0xb0-0xbf +b0: CMPXCHG Eb,Gb +b1: CMPXCHG Ev,Gv +b2: LSS Gv,Mp +b3: BTR Ev,Gv +b4: LFS Gv,Mp +b5: LGS Gv,Mp +b6: MOVZX Gv,Eb +b7: MOVZX Gv,Ew +b8: JMPE | POPCNT Gv,Ev (F3) +b9: Grp10 (1A) +ba: Grp8 Ev,Ib (1A) +bb: BTC Ev,Gv +bc: BSF Gv,Ev +bd: BSR Gv,Ev +be: MOVSX Gv,Eb +bf: MOVSX Gv,Ew +# 0x0f 0xc0-0xcf +c0: XADD Eb,Gb +c1: XADD Ev,Gv +c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX) +c3: movnti Md/q,Gd/q +c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128) +c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128) +c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX) +c7: Grp9 (1A) +c8: BSWAP RAX/EAX/R8/R8D +c9: BSWAP RCX/ECX/R9/R9D +ca: BSWAP RDX/EDX/R10/R10D +cb: BSWAP RBX/EBX/R11/R11D +cc: BSWAP RSP/ESP/R12/R12D +cd: BSWAP RBP/EBP/R13/R13D +ce: BSWAP RSI/ESI/R14/R14D +cf: BSWAP RDI/EDI/R15/R15D +# 0x0f 0xd0-0xdf +d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX) +d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128) +d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128) +d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128) +d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128) +d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128) +d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2) +d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128) +d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128) +d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128) +da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128) +db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128) +dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128) +dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128) +de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128) +df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128) +# 0x0f 0xe0-0xef +e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128) +e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128) +e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128) +e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128) +e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128) +e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128) +e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX) +e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX) +e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128) +e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128) +ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128) +eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128) +ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128) +ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128) +ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128) +ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128) +# 0x0f 0xf0-0xff +f0: lddqu Vdq,Mdq (F2),(VEX) +f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128) +f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128) +f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128) +f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128) +f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128) +f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128) +f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128) +f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128) +f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128) +fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128) +fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128) +fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128) +fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128) +fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128) +ff: +EndTable + +Table: 3-byte opcode 1 (0x0f 0x38) +Referrer: 3-byte escape 1 +AVXcode: 2 +# 0x0f 0x38 0x00-0x0f +00: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128) +01: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128) +02: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128) +03: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128) +04: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128) +05: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128) +06: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128) +07: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128) +08: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128) +09: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128) +0a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128) +0b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128) +0c: Vpermilps /r (66),(oVEX) +0d: Vpermilpd /r (66),(oVEX) +0e: vtestps /r (66),(oVEX) +0f: vtestpd /r (66),(oVEX) +# 0x0f 0x38 0x10-0x1f +10: pblendvb Vdq,Wdq (66) +11: +12: +13: +14: blendvps Vdq,Wdq (66) +15: blendvpd Vdq,Wdq (66) +16: +17: ptest Vdq,Wdq (66),(VEX) +18: vbroadcastss /r (66),(oVEX) +19: vbroadcastsd /r (66),(oVEX),(o256) +1a: vbroadcastf128 /r (66),(oVEX),(o256) +1b: +1c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128) +1d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128) +1e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128) +1f: +# 0x0f 0x38 0x20-0x2f +20: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128) +21: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128) +22: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128) +23: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128) +24: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128) +25: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128) +26: +27: +28: pmuldq Vdq,Wdq (66),(VEX),(o128) +29: pcmpeqq Vdq,Wdq (66),(VEX),(o128) +2a: movntdqa Vdq,Mdq (66),(VEX),(o128) +2b: packusdw Vdq,Wdq (66),(VEX),(o128) +2c: vmaskmovps(ld) /r (66),(oVEX) +2d: vmaskmovpd(ld) /r (66),(oVEX) +2e: vmaskmovps(st) /r (66),(oVEX) +2f: vmaskmovpd(st) /r (66),(oVEX) +# 0x0f 0x38 0x30-0x3f +30: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128) +31: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128) +32: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128) +33: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128) +34: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128) +35: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128) +36: +37: pcmpgtq Vdq,Wdq (66),(VEX),(o128) +38: pminsb Vdq,Wdq (66),(VEX),(o128) +39: pminsd Vdq,Wdq (66),(VEX),(o128) +3a: pminuw Vdq,Wdq (66),(VEX),(o128) +3b: pminud Vdq,Wdq (66),(VEX),(o128) +3c: pmaxsb Vdq,Wdq (66),(VEX),(o128) +3d: pmaxsd Vdq,Wdq (66),(VEX),(o128) +3e: pmaxuw Vdq,Wdq (66),(VEX),(o128) +3f: pmaxud Vdq,Wdq (66),(VEX),(o128) +# 0x0f 0x38 0x40-0x8f +40: pmulld Vdq,Wdq (66),(VEX),(o128) +41: phminposuw Vdq,Wdq (66),(VEX),(o128) +80: INVEPT Gd/q,Mdq (66) +81: INVPID Gd/q,Mdq (66) +# 0x0f 0x38 0x90-0xbf (FMA) +96: vfmaddsub132pd/ps /r (66),(VEX) +97: vfmsubadd132pd/ps /r (66),(VEX) +98: vfmadd132pd/ps /r (66),(VEX) +99: vfmadd132sd/ss /r (66),(VEX),(o128) +9a: vfmsub132pd/ps /r (66),(VEX) +9b: vfmsub132sd/ss /r (66),(VEX),(o128) +9c: vfnmadd132pd/ps /r (66),(VEX) +9d: vfnmadd132sd/ss /r (66),(VEX),(o128) +9e: vfnmsub132pd/ps /r (66),(VEX) +9f: vfnmsub132sd/ss /r (66),(VEX),(o128) +a6: vfmaddsub213pd/ps /r (66),(VEX) +a7: vfmsubadd213pd/ps /r (66),(VEX) +a8: vfmadd213pd/ps /r (66),(VEX) +a9: vfmadd213sd/ss /r (66),(VEX),(o128) +aa: vfmsub213pd/ps /r (66),(VEX) +ab: vfmsub213sd/ss /r (66),(VEX),(o128) +ac: vfnmadd213pd/ps /r (66),(VEX) +ad: vfnmadd213sd/ss /r (66),(VEX),(o128) +ae: vfnmsub213pd/ps /r (66),(VEX) +af: vfnmsub213sd/ss /r (66),(VEX),(o128) +b6: vfmaddsub231pd/ps /r (66),(VEX) +b7: vfmsubadd231pd/ps /r (66),(VEX) +b8: vfmadd231pd/ps /r (66),(VEX) +b9: vfmadd231sd/ss /r (66),(VEX),(o128) +ba: vfmsub231pd/ps /r (66),(VEX) +bb: vfmsub231sd/ss /r (66),(VEX),(o128) +bc: vfnmadd231pd/ps /r (66),(VEX) +bd: vfnmadd231sd/ss /r (66),(VEX),(o128) +be: vfnmsub231pd/ps /r (66),(VEX) +bf: vfnmsub231sd/ss /r (66),(VEX),(o128) +# 0x0f 0x38 0xc0-0xff +db: aesimc Vdq,Wdq (66),(VEX),(o128) +dc: aesenc Vdq,Wdq (66),(VEX),(o128) +dd: aesenclast Vdq,Wdq (66),(VEX),(o128) +de: aesdec Vdq,Wdq (66),(VEX),(o128) +df: aesdeclast Vdq,Wdq (66),(VEX),(o128) +f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2) +f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2) +EndTable + +Table: 3-byte opcode 2 (0x0f 0x3a) +Referrer: 3-byte escape 2 +AVXcode: 3 +# 0x0f 0x3a 0x00-0xff +04: vpermilps /r,Ib (66),(oVEX) +05: vpermilpd /r,Ib (66),(oVEX) +06: vperm2f128 /r,Ib (66),(oVEX),(o256) +08: roundps Vdq,Wdq,Ib (66),(VEX) +09: roundpd Vdq,Wdq,Ib (66),(VEX) +0a: roundss Vss,Wss,Ib (66),(VEX),(o128) +0b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128) +0c: blendps Vdq,Wdq,Ib (66),(VEX) +0d: blendpd Vdq,Wdq,Ib (66),(VEX) +0e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128) +0f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128) +14: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128) +15: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128) +16: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128) +17: extractps Ed,Vdq,Ib (66),(VEX),(o128) +18: vinsertf128 /r,Ib (66),(oVEX),(o256) +19: vextractf128 /r,Ib (66),(oVEX),(o256) +20: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128) +21: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128) +22: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128) +40: dpps Vdq,Wdq,Ib (66),(VEX) +41: dppd Vdq,Wdq,Ib (66),(VEX),(o128) +42: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128) +44: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128) +4a: vblendvps /r,Ib (66),(oVEX) +4b: vblendvpd /r,Ib (66),(oVEX) +4c: vpblendvb /r,Ib (66),(oVEX),(o128) +60: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128) +61: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128) +62: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128) +63: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128) +df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128) +EndTable + +GrpTable: Grp1 +0: ADD +1: OR +2: ADC +3: SBB +4: AND +5: SUB +6: XOR +7: CMP +EndTable + +GrpTable: Grp1A +0: POP +EndTable + +GrpTable: Grp2 +0: ROL +1: ROR +2: RCL +3: RCR +4: SHL/SAL +5: SHR +6: +7: SAR +EndTable + +GrpTable: Grp3_1 +0: TEST Eb,Ib +1: +2: NOT Eb +3: NEG Eb +4: MUL AL,Eb +5: IMUL AL,Eb +6: DIV AL,Eb +7: IDIV AL,Eb +EndTable + +GrpTable: Grp3_2 +0: TEST Ev,Iz +1: +2: NOT Ev +3: NEG Ev +4: MUL rAX,Ev +5: IMUL rAX,Ev +6: DIV rAX,Ev +7: IDIV rAX,Ev +EndTable + +GrpTable: Grp4 +0: INC Eb +1: DEC Eb +EndTable + +GrpTable: Grp5 +0: INC Ev +1: DEC Ev +2: CALLN Ev (f64) +3: CALLF Ep +4: JMPN Ev (f64) +5: JMPF Ep +6: PUSH Ev (d64) +7: +EndTable + +GrpTable: Grp6 +0: SLDT Rv/Mw +1: STR Rv/Mw +2: LLDT Ew +3: LTR Ew +4: VERR Ew +5: VERW Ew +EndTable + +GrpTable: Grp7 +0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) +1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001) +2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) +3: LIDT Ms +4: SMSW Mw/Rv +5: +6: LMSW Ew +7: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B) +EndTable + +GrpTable: Grp8 +4: BT +5: BTS +6: BTR +7: BTC +EndTable + +GrpTable: Grp9 +1: CMPXCHG8B/16B Mq/Mdq +6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) +7: VMPTRST Mq +EndTable + +GrpTable: Grp10 +EndTable + +GrpTable: Grp11 +0: MOV +EndTable + +GrpTable: Grp12 +2: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128) +4: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128) +6: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128) +EndTable + +GrpTable: Grp13 +2: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128) +4: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128) +6: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128) +EndTable + +GrpTable: Grp14 +2: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128) +3: psrldq Udq,Ib (66),(11B),(VEX),(o128) +6: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128) +7: pslldq Udq,Ib (66),(11B),(VEX),(o128) +EndTable + +GrpTable: Grp15 +0: fxsave +1: fxstor +2: ldmxcsr (VEX) +3: stmxcsr (VEX) +4: XSAVE +5: XRSTOR | lfence (11B) +6: mfence (11B) +7: clflush | sfence (11B) +EndTable + +GrpTable: Grp16 +0: prefetch NTA +1: prefetch T0 +2: prefetch T1 +3: prefetch T2 +EndTable + +# AMD's Prefetch Group +GrpTable: GrpP +0: PREFETCH +1: PREFETCHW +EndTable + +GrpTable: GrpPDLK +0: MONTMUL +1: XSHA1 +2: XSHA2 +EndTable + +GrpTable: GrpRNG +0: xstore-rng +1: xcrypt-ecb +2: xcrypt-cbc +4: xcrypt-cfb +5: xcrypt-ofb +EndTable diff -uprN linux-2.6.32/arch/x86/Makefile linux-2.6.32.ovz/arch/x86/Makefile --- linux-2.6.32/arch/x86/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/Makefile 2015-03-10 13:24:00.000000000 -0700 @@ -95,8 +95,12 @@ sp-$(CONFIG_X86_64) := rsp cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1) # is .cfi_signal_frame supported too? cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) -KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) -KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) +cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) + +avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) + +KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(avx_instr) +KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(avx_instr) LDFLAGS := -m elf_$(UTS_MACHINE) @@ -135,9 +139,7 @@ drivers-$(CONFIG_OPROFILE) += arch/x86/o # suspend and hibernation support drivers-$(CONFIG_PM) += arch/x86/power/ -ifeq ($(CONFIG_X86_32),y) drivers-$(CONFIG_FB) += arch/x86/video/ -endif #### # boot loader support. Several targets are kept for legacy purposes @@ -155,6 +157,9 @@ all: bzImage KBUILD_IMAGE := $(boot)/bzImage bzImage: vmlinux +ifeq ($(CONFIG_X86_DECODER_SELFTEST),y) + $(Q)$(MAKE) $(build)=arch/x86/tools posttest +endif $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ diff -uprN linux-2.6.32/arch/x86/Makefile_32.cpu linux-2.6.32.ovz/arch/x86/Makefile_32.cpu --- linux-2.6.32/arch/x86/Makefile_32.cpu 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/Makefile_32.cpu 2015-03-10 13:21:04.000000000 -0700 @@ -46,6 +46,13 @@ cflags-$(CONFIG_MGEODEGX1) += -march=pen # cpu entries cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) +# Work around the pentium-mmx code generator madness of gcc4.4.x which +# does stack alignment by generating horrible code _before_ the mcount +# prologue (push %ebp, mov %esp, %ebp) which breaks the function graph +# tracer assumptions. For i686, generic, core2 this is set by the +# compiler anyway +cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-maccumulate-outgoing-args) + # Bug fix for binutils: this option is required in order to keep # binutils from generating NOPL instructions against our will. ifneq ($(CONFIG_X86_P6_NOP),y) diff -uprN linux-2.6.32/arch/x86/math-emu/fpu_aux.c linux-2.6.32.ovz/arch/x86/math-emu/fpu_aux.c --- linux-2.6.32/arch/x86/math-emu/fpu_aux.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/math-emu/fpu_aux.c 2015-03-10 13:22:12.000000000 -0700 @@ -30,10 +30,10 @@ static void fclex(void) } /* Needs to be externally visible */ -void finit_task(struct task_struct *tsk) +void finit_soft_fpu(struct i387_soft_struct *soft) { - struct i387_soft_struct *soft = &tsk->thread.xstate->soft; struct address *oaddr, *iaddr; + memset(soft, 0, sizeof(*soft)); soft->cwd = 0x037f; soft->swd = 0; soft->ftop = 0; /* We don't keep top in the status word internally. */ @@ -52,7 +52,7 @@ void finit_task(struct task_struct *tsk) void finit(void) { - finit_task(current); + finit_task(¤t->thread.fpu); } /* diff -uprN linux-2.6.32/arch/x86/mm/amdtopology_64.c linux-2.6.32.ovz/arch/x86/mm/amdtopology_64.c --- linux-2.6.32/arch/x86/mm/amdtopology_64.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/amdtopology_64.c 2015-03-10 13:22:22.000000000 -0700 @@ -0,0 +1,222 @@ +/* + * AMD NUMA support. + * Discover the memory map and associated nodes. + * + * This version reads it directly from the AMD northbridge. + * + * Copyright 2002,2003 Andi Kleen, SuSE Labs. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static __init int find_northbridge(void) +{ + int num; + + for (num = 0; num < 32; num++) { + u32 header; + + header = read_pci_config(0, num, 0, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1300<<16))) + continue; + + header = read_pci_config(0, num, 1, 0x00); + if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) && + header != (PCI_VENDOR_ID_AMD | (0x1301<<16))) + continue; + return num; + } + + return -1; +} + +static __init void early_get_boot_cpu_id(void) +{ + /* + * need to get boot_cpu_id so can use that to create apicid_to_node + * in amd_scan_nodes() + */ + /* + * Find possible boot-time SMP configuration: + */ +#ifdef CONFIG_X86_MPPARSE + early_find_smp_config(); +#endif +#ifdef CONFIG_ACPI + /* + * Read APIC information from ACPI tables. + */ + early_acpi_boot_init(); +#endif +#ifdef CONFIG_X86_MPPARSE + /* + * get boot-time SMP configuration: + */ + if (smp_found_config) + early_get_smp_config(); +#endif + early_init_lapic_mapping(); +} + +int __init amd_scan_nodes(unsigned long start, unsigned long end) +{ + unsigned numnodes, cores, bits, apicid_base; + unsigned long prevbase; + struct bootnode nodes[8]; + int i, j, nb, found = 0; + u32 nodeid, reg; + + if (!early_pci_allowed()) + return -1; + + nb = find_northbridge(); + if (nb < 0) + return nb; + + printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); + + reg = read_pci_config(0, nb, 0, 0x60); + numnodes = ((reg >> 4) & 0xF) + 1; + if (numnodes <= 1) + return -1; + + printk(KERN_INFO "Number of nodes %d\n", numnodes); + + memset(&nodes, 0, sizeof(nodes)); + prevbase = 0; + for (i = 0; i < 8; i++) { + unsigned long base, limit; + + base = read_pci_config(0, nb, 1, 0x40 + i*8); + limit = read_pci_config(0, nb, 1, 0x44 + i*8); + + nodeid = limit & 7; + if ((base & 3) == 0) { + if (i < numnodes) + printk("Skipping disabled node %d\n", i); + continue; + } + if (nodeid >= numnodes) { + printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, + base, limit); + continue; + } + + if (!limit) { + printk(KERN_INFO "Skipping node entry %d (base %lx)\n", + i, base); + continue; + } + if ((base >> 8) & 3 || (limit >> 8) & 3) { + printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", + nodeid, (base>>8)&3, (limit>>8) & 3); + return -1; + } + if (node_isset(nodeid, node_possible_map)) { + printk(KERN_INFO "Node %d already present. Skipping\n", + nodeid); + continue; + } + + limit >>= 16; + limit <<= 24; + limit |= (1<<24)-1; + limit++; + + if (limit > max_pfn << PAGE_SHIFT) + limit = max_pfn << PAGE_SHIFT; + if (limit <= base) + continue; + + base >>= 16; + base <<= 24; + + if (base < start) + base = start; + if (limit > end) + limit = end; + if (limit == base) { + printk(KERN_ERR "Empty node %d\n", nodeid); + continue; + } + if (limit < base) { + printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", + nodeid, base, limit); + continue; + } + + /* Could sort here, but pun for now. Should not happen anyroads. */ + if (prevbase > base) { + printk(KERN_ERR "Node map not sorted %lx,%lx\n", + prevbase, base); + return -1; + } + + printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", + nodeid, base, limit); + + found++; + + nodes[nodeid].start = base; + nodes[nodeid].end = limit; + + prevbase = base; + + node_set(nodeid, node_possible_map); + } + + if (!found) + return -1; + + memnode_shift = compute_hash_shift(nodes, 8, NULL); + if (memnode_shift < 0) { + printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); + return -1; + } + printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); + + /* use the coreid bits from early_identify_cpu */ + bits = boot_cpu_data.x86_coreid_bits; + cores = (1< 0) { + printk(KERN_INFO "BSP APIC ID: %02x\n", + boot_cpu_physical_apicid); + apicid_base = boot_cpu_physical_apicid; + } + + for (i = 0; i < 8; i++) { + if (nodes[i].start == nodes[i].end) + continue; + + e820_register_active_regions(i, + nodes[i].start >> PAGE_SHIFT, + nodes[i].end >> PAGE_SHIFT); + for (j = apicid_base; j < cores + apicid_base; j++) + apicid_to_node[(i << bits) + j] = i; + setup_node_bootmem(i, nodes[i].start, nodes[i].end); + } + + numa_init_array(); + return 0; +} diff -uprN linux-2.6.32/arch/x86/mm/fault.c linux-2.6.32.ovz/arch/x86/mm/fault.c --- linux-2.6.32/arch/x86/mm/fault.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/fault.c 2015-06-23 04:16:16.000000000 -0700 @@ -11,11 +11,15 @@ #include /* __kprobes, ... */ #include /* kmmio_handler, ... */ #include /* perf_sw_event */ +#include /* hstate_index_to_shift */ +#include #include /* dotraplinkage, ... */ #include /* pgd_*(), ... */ #include /* kmemcheck_*(), ... */ +#include + /* * Page fault error code bits: * @@ -159,15 +163,20 @@ is_prefetch(struct pt_regs *regs, unsign static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk) + struct task_struct *tsk, int fault) { + unsigned lsb = 0; siginfo_t info; info.si_signo = si_signo; info.si_errno = 0; info.si_code = si_code; info.si_addr = (void __user *)address; - info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + info.si_addr_lsb = lsb; force_sig_info(si_signo, &info, tsk); } @@ -222,16 +231,24 @@ void vmalloc_sync_all(void) for (address = VMALLOC_START & PMD_MASK; address >= TASK_SIZE && address < FIXADDR_TOP; address += PMD_SIZE) { - - unsigned long flags; struct page *page; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { - if (!vmalloc_sync_one(page_address(page), address)) + spinlock_t *pgt_lock; + pmd_t *ret; + + /* the pgt_lock only for Xen */ + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + + spin_lock(pgt_lock); + ret = vmalloc_sync_one(page_address(page), address); + spin_unlock(pgt_lock); + + if (!ret) break; } - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } } @@ -331,22 +348,30 @@ void vmalloc_sync_all(void) address += PGDIR_SIZE) { const pgd_t *pgd_ref = pgd_offset_k(address); - unsigned long flags; struct page *page; if (pgd_none(*pgd_ref)) continue; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; + spinlock_t *pgt_lock; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + + /* the pgt_lock only for Xen */ + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + spin_lock(pgt_lock); + if (pgd_none(*pgd)) set_pgd(pgd, *pgd_ref); else BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + + spin_unlock(pgt_lock); } - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } } @@ -658,8 +683,8 @@ no_context(struct pt_regs *regs, unsigne show_fault_oops(regs, error_code, address); stackend = end_of_stack(tsk); - if (*stackend != STACK_END_MAGIC) - printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); + if (tsk != &init_task && *stackend != STACK_END_MAGIC) + printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); tsk->thread.cr2 = address; tsk->thread.trap_no = 14; @@ -670,7 +695,7 @@ no_context(struct pt_regs *regs, unsigne sig = 0; /* Executive summary in case the body of the oops scrolled away */ - printk(KERN_EMERG "CR2: %016lx\n", address); + printk(KERN_DEFAULT "CR2: %016lx\n", address); oops_end(flags, regs, sig); } @@ -689,7 +714,7 @@ show_signal_msg(struct pt_regs *regs, un if (!printk_ratelimit()) return; - printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx", + ve_printk(VE_LOG, "%s%s[%d]: segfault at %lx ip %p sp %p error %lx", task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG, tsk->comm, task_pid_nr(tsk), address, (void *)regs->ip, (void *)regs->sp, error_code); @@ -730,7 +755,7 @@ __bad_area_nosemaphore(struct pt_regs *r tsk->thread.error_code = error_code | (address >= TASK_SIZE); tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); return; } @@ -776,6 +801,7 @@ bad_area_access_error(struct pt_regs *re __bad_area(regs, error_code, address, SEGV_ACCERR); } +#if 0 /* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */ static void out_of_memory(struct pt_regs *regs, unsigned long error_code, @@ -787,8 +813,9 @@ out_of_memory(struct pt_regs *regs, unsi */ up_read(¤t->mm->mmap_sem); - pagefault_out_of_memory(); + out_of_memory_in_ub(get_exec_ub(), 0); } +#endif static void do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, @@ -801,8 +828,10 @@ do_sigbus(struct pt_regs *regs, unsigned up_read(&mm->mmap_sem); /* Kernel mode? Handle exceptions or die: */ - if (!(error_code & PF_USER)) + if (!(error_code & PF_USER)) { no_context(regs, error_code, address); + return; + } /* User-space => ok to do another page fault: */ if (is_prefetch(regs, error_code, address)) @@ -813,24 +842,45 @@ do_sigbus(struct pt_regs *regs, unsigned tsk->thread.trap_no = 14; #ifdef CONFIG_MEMORY_FAILURE - if (fault & VM_FAULT_HWPOISON) { + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { printk(KERN_ERR "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", tsk->comm, tsk->pid, address); code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk); + force_sig_info_fault(SIGBUS, code, address, tsk, fault); } static noinline void mm_fault_error(struct pt_regs *regs, unsigned long error_code, unsigned long address, unsigned int fault) { + if (fatal_signal_pending(current) && !(error_code & PF_USER)) { + up_read(¤t->mm->mmap_sem); + no_context(regs, error_code, address); + return; + } + if (fault & VM_FAULT_OOM) { - out_of_memory(regs, error_code, address); + /* Kernel mode? Handle exceptions or die: */ + if (!(error_code & PF_USER)) { + up_read(¤t->mm->mmap_sem); + no_context(regs, error_code, address); + return; + } + + /* + * This fault can be caused by two different reasons: + * 1) Buddy allocator failed to alloc a page for pud and friends. + * 2) OOM-killed failed to provide us requred memory. + * Current task can't execute in such circumstances. + */ + up_read(¤t->mm->mmap_sem); + send_sig(SIGKILL, current, 0); } else { - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, error_code, address, fault); else BUG(); @@ -909,7 +959,7 @@ spurious_fault(unsigned long error_code, return ret; } -int show_unhandled_signals = 1; +int show_unhandled_signals = 0; static inline int access_error(unsigned long error_code, int write, struct vm_area_struct *vma) @@ -937,27 +987,18 @@ static int fault_in_kernel_space(unsigne return address >= TASK_SIZE_MAX; } -/* - * This routine handles page faults. It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. - */ -dotraplinkage void __kprobes -do_page_fault(struct pt_regs *regs, unsigned long error_code) +static inline void __do_page_fault(struct pt_regs *regs, unsigned long address, unsigned long error_code) { struct vm_area_struct *vma; struct task_struct *tsk; - unsigned long address; struct mm_struct *mm; - int write; int fault; + int write = error_code & PF_WRITE; + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; tsk = current; mm = tsk->mm; - /* Get the faulting address: */ - address = read_cr2(); - /* * Detect and handle instructions that would cause a page fault for * both a tracked kernel page and a userspace page. @@ -1020,6 +1061,7 @@ do_page_fault(struct pt_regs *regs, unsi if (user_mode_vm(regs)) { local_irq_enable(); error_code |= PF_USER; + flags |= FAULT_FLAG_USER; } else { if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); @@ -1028,7 +1070,7 @@ do_page_fault(struct pt_regs *regs, unsi if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1039,6 +1081,9 @@ do_page_fault(struct pt_regs *regs, unsi return; } + if (error_code & PF_WRITE) + flags |= FAULT_FLAG_WRITE; + /* * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in @@ -1061,6 +1106,7 @@ do_page_fault(struct pt_regs *regs, unsi bad_area_nosemaphore(regs, error_code, address); return; } +retry: down_read(&mm->mmap_sem); } else { /* @@ -1104,8 +1150,6 @@ do_page_fault(struct pt_regs *regs, unsi * we can handle it.. */ good_area: - write = error_code & PF_WRITE; - if (unlikely(access_error(error_code, write, vma))) { bad_area_access_error(regs, error_code, address); return; @@ -1116,24 +1160,64 @@ good_area: * make sure we exit gracefully rather than endlessly redo * the fault: */ - fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); + fault = handle_mm_fault(mm, vma, address, flags); + + /* + * If we need to retry but a fatal signal is pending, handle the + * signal first. We do not need to release the mmap_sem because it + * would already be released in __lock_page_or_retry in mm/filemap.c. + */ + if (unlikely((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))) + return; if (unlikely(fault & VM_FAULT_ERROR)) { mm_fault_error(regs, error_code, address, fault); return; } - if (fault & VM_FAULT_MAJOR) { - tsk->maj_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, - regs, address); - } else { - tsk->min_flt++; - perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, - regs, address); + /* + * Major/minor page fault accounting is only done on the + * initial attempt. If we go through a retry, it is extremely + * likely that the page will be found in page cache at that point. + */ + if (flags & FAULT_FLAG_ALLOW_RETRY) { + if (fault & VM_FAULT_MAJOR) { + tsk->maj_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, + regs, address); + } else { + tsk->min_flt++; + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, + regs, address); + } + if (fault & VM_FAULT_RETRY) { + /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk + * of starvation. */ + flags &= ~FAULT_FLAG_ALLOW_RETRY; + goto retry; + } } check_v8086_mode(regs, address, tsk); up_read(&mm->mmap_sem); } + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + */ +dotraplinkage void __kprobes +do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ + unsigned long address; + + /* Get the faulting address: */ + address = read_cr2(); + + __do_page_fault(regs, address, error_code); + + if (!user_mode(regs)) + trace_mm_kernel_pagefault(current, address, regs); +} diff -uprN linux-2.6.32/arch/x86/mm/gup.c linux-2.6.32.ovz/arch/x86/mm/gup.c --- linux-2.6.32/arch/x86/mm/gup.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/gup.c 2015-03-10 13:23:18.000000000 -0700 @@ -128,6 +128,8 @@ static noinline int gup_huge_pmd(pmd_t p do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; + if (PageTail(page)) + get_huge_page_tail(page); (*nr)++; page++; refs++; @@ -148,7 +150,18 @@ static int gup_pmd_range(pud_t pud, unsi pmd_t pmd = *pmdp; next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) + /* + * The pmd_trans_splitting() check below explains why + * pmdp_splitting_flush has to flush the tlb, to stop + * this gup-fast code from running while we set the + * splitting bit in the pmd. Returning zero will take + * the slow path that will call wait_split_huge_page() + * if the pmd is still in splitting state. gup-fast + * can't because it has irq disabled and + * wait_split_huge_page() would never return as the + * tlb flush IPI wouldn't run. + */ + if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; if (unlikely(pmd_large(pmd))) { if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) @@ -185,6 +198,8 @@ static noinline int gup_huge_pud(pud_t p do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; + if (PageTail(page)) + get_huge_page_tail(page); (*nr)++; page++; refs++; diff -uprN linux-2.6.32/arch/x86/mm/highmem_32.c linux-2.6.32.ovz/arch/x86/mm/highmem_32.c --- linux-2.6.32/arch/x86/mm/highmem_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/highmem_32.c 2015-03-10 13:22:47.000000000 -0700 @@ -44,6 +44,7 @@ void *kmap_atomic_prot(struct page *page vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); set_pte(kmap_pte-idx, mk_pte(page, prot)); + arch_flush_lazy_mmu_mode(); return (void *)vaddr; } @@ -64,9 +65,10 @@ void kunmap_atomic(void *kvaddr, enum km * also, in case the page changes cacheability attributes or becomes * a protected page in a hypervisor. */ - if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) + if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) { kpte_clear_flush(kmap_pte-idx, vaddr); - else { + arch_flush_lazy_mmu_mode(); + } else { #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(vaddr < PAGE_OFFSET); BUG_ON(vaddr >= (unsigned long)high_memory); diff -uprN linux-2.6.32/arch/x86/mm/hugetlbpage.c linux-2.6.32.ovz/arch/x86/mm/hugetlbpage.c --- linux-2.6.32/arch/x86/mm/hugetlbpage.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/hugetlbpage.c 2015-06-23 04:16:16.000000000 -0700 @@ -59,7 +59,8 @@ static int vma_shareable(struct vm_area_ /* * search for a shareable pmd page for hugetlb. */ -static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) +static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud, + bool *shared) { struct vm_area_struct *vma = find_vma(mm, addr); struct address_space *mapping = vma->vm_file->f_mapping; @@ -92,9 +93,10 @@ static void huge_pmd_share(struct mm_str goto out; spin_lock(&mm->page_table_lock); - if (pud_none(*pud)) + if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); - else + *shared = true; + } else put_page(virt_to_page(spte)); spin_unlock(&mm->page_table_lock); out: @@ -129,7 +131,8 @@ int huge_pmd_unshare(struct mm_struct *m } pte_t *huge_pte_alloc(struct mm_struct *mm, - unsigned long addr, unsigned long sz) + unsigned long addr, unsigned long sz, + bool *shared) { pgd_t *pgd; pud_t *pud; @@ -143,7 +146,7 @@ pte_t *huge_pte_alloc(struct mm_struct * } else { BUG_ON(sz != PMD_SIZE); if (pud_none(*pud)) - huge_pmd_share(mm, addr, pud); + huge_pmd_share(mm, addr, pud, shared); pte = (pte_t *) pmd_alloc(mm, pud, addr); } } @@ -225,6 +228,7 @@ int pmd_huge(pmd_t pmd) { return !!(pmd_val(pmd) & _PAGE_PSE); } +EXPORT_SYMBOL(pmd_huge); int pud_huge(pud_t pud) { @@ -268,8 +272,9 @@ static unsigned long hugetlb_get_unmappe struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long start_addr; + unsigned int unmap_factor = sysctl_unmap_area_factor; - if (len > mm->cached_hole_size) { + if (len > mm->cached_hole_size || unmap_factor) { start_addr = mm->free_area_cache; } else { start_addr = TASK_UNMAPPED_BASE; @@ -288,7 +293,8 @@ full_search: */ if (start_addr != TASK_UNMAPPED_BASE) { start_addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; + if (likely(!unmap_factor)) + mm->cached_hole_size = 0; goto full_search; } return -ENOMEM; @@ -297,8 +303,11 @@ full_search: mm->free_area_cache = addr + len; return addr; } - if (addr + mm->cached_hole_size < vma->vm_start) + + if (!unmap_factor && + addr + mm->cached_hole_size < vma->vm_start) mm->cached_hole_size = vma->vm_start - addr; + addr = ALIGN(vma->vm_end, huge_page_size(h)); } } @@ -312,14 +321,15 @@ static unsigned long hugetlb_get_unmappe struct vm_area_struct *vma, *prev_vma; unsigned long base = mm->mmap_base, addr = addr0; unsigned long largest_hole = mm->cached_hole_size; + unsigned int unmap_factor = sysctl_unmap_area_factor; int first_time = 1; /* don't allow allocations above current base */ if (mm->free_area_cache > base) mm->free_area_cache = base; - if (len <= largest_hole) { - largest_hole = 0; + if (len <= largest_hole && !unmap_factor) { + largest_hole = 0; mm->free_area_cache = base; } try_again: @@ -344,18 +354,20 @@ try_again: if (addr + len <= vma->vm_start && (!prev_vma || (addr >= prev_vma->vm_end))) { /* remember the address as a hint for next time */ - mm->cached_hole_size = largest_hole; + if (likely(!unmap_factor)) + mm->cached_hole_size = largest_hole; return (mm->free_area_cache = addr); } else { /* pull free_area_cache down to the first hole */ if (mm->free_area_cache == vma->vm_end) { mm->free_area_cache = vma->vm_start; - mm->cached_hole_size = largest_hole; + if (likely(!unmap_factor)) + mm->cached_hole_size = largest_hole; } } /* remember the largest hole we saw so far */ - if (addr + largest_hole < vma->vm_start) + if (addr + largest_hole < vma->vm_start && !unmap_factor) largest_hole = vma->vm_start - addr; /* try just below the current vma->vm_start */ @@ -369,7 +381,8 @@ fail: */ if (first_time) { mm->free_area_cache = base; - largest_hole = 0; + if (likely(!unmap_factor)) + largest_hole = 0; first_time = 0; goto try_again; } @@ -380,7 +393,8 @@ fail: * allocations. */ mm->free_area_cache = TASK_UNMAPPED_BASE; - mm->cached_hole_size = ~0UL; + if (likely(!unmap_factor)) + mm->cached_hole_size = ~0UL; addr = hugetlb_get_unmapped_area_bottomup(file, addr0, len, pgoff, flags); @@ -388,7 +402,8 @@ fail: * Restore the topdown base: */ mm->free_area_cache = base; - mm->cached_hole_size = ~0UL; + if (likely(!unmap_factor)) + mm->cached_hole_size = ~0UL; return addr; } diff -uprN linux-2.6.32/arch/x86/mm/init_32.c linux-2.6.32.ovz/arch/x86/mm/init_32.c --- linux-2.6.32/arch/x86/mm/init_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/init_32.c 2015-03-10 13:22:31.000000000 -0700 @@ -31,6 +31,10 @@ #include #include +#ifdef CONFIG_XEN +#include +#endif + #include #include #include @@ -444,6 +448,13 @@ static int __init add_highpages_work_fn( node_pfn++) { if (!pfn_valid(node_pfn)) continue; +#ifdef CONFIG_XEN + /* Do not initialize and ignore not present at boot time pfn */ + if (node_pfn >= PFN_DOWN(xen_extra_mem_start) && + node_pfn < PFN_DOWN(xen_extra_mem_start + + xen_extra_mem_size)) + continue; +#endif /* CONFIG_XEN */ page = pfn_to_page(node_pfn); add_one_highpage_init(page, node_pfn); } @@ -997,7 +1008,7 @@ static noinline int do_test_wp_bit(void) const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); -static int kernel_set_to_readonly; +int kernel_set_to_readonly __read_mostly; void set_kernel_text_rw(void) { diff -uprN linux-2.6.32/arch/x86/mm/init_64.c linux-2.6.32.ovz/arch/x86/mm/init_64.c --- linux-2.6.32/arch/x86/mm/init_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/init_64.c 2015-06-23 04:16:16.000000000 -0700 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -49,6 +50,7 @@ #include #include #include +#include static unsigned long dma_reserve __initdata; @@ -95,6 +97,43 @@ static int __init nonx32_setup(char *str } __setup("noexec32=", nonx32_setup); + +/* + * When memory was added/removed make sure all the processes MM have + * suitable PGD entries in the local PGD level page. + * Caller must flush global TLBs if needed (old mapping changed) + */ +static void sync_global_pgds(unsigned long start, unsigned long end) +{ + struct page *page; + unsigned long addr; + + spin_lock(&pgd_lock); + for (addr = start; addr < end; addr += PGDIR_SIZE) { + pgd_t *ref_pgd = pgd_offset_k(addr); + list_for_each_entry(page, &pgd_list, lru) { + spinlock_t *pgt_lock; + pgd_t *pgd_base = page_address(page); + pgd_t *pgd = pgd_base + pgd_index(addr); + + /* the pgt_lock only for Xen */ + pgt_lock = &pgd_page_get_mm(page)->page_table_lock; + spin_lock(pgt_lock); + + /* + * When the state is the same in one other, + * assume it's the same everywhere. + */ + if (pgd_base != init_mm.pgd && + !!pgd_none(*pgd) != !!pgd_none(*ref_pgd)) + set_pgd(pgd, *ref_pgd); + + spin_unlock(pgt_lock); + } + } + spin_unlock(&pgd_lock); +} + /* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. @@ -216,6 +255,9 @@ static void __init __init_extra_mapping( pgd_t *pgd; pud_t *pud; pmd_t *pmd; + int pgd_changed = 0; + unsigned long start = (unsigned long)__va(phys); + unsigned long end = (unsigned long)__va(phys + size); BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)); for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { @@ -224,6 +266,7 @@ static void __init __init_extra_mapping( pud = (pud_t *) spp_getpage(); set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE | _PAGE_USER)); + pgd_changed = 1; } pud = pud_offset(pgd, (unsigned long)__va(phys)); if (pud_none(*pud)) { @@ -235,6 +278,11 @@ static void __init __init_extra_mapping( BUG_ON(!pmd_none(*pmd)); set_pmd(pmd, __pmd(phys | pgprot_val(prot))); } + if (pgd_changed) { + sync_global_pgds(start, end); + /* Might not be needed if the previous mapping was always zero*/ + __flush_tlb_all(); + } } void __init init_extra_mapping_wb(unsigned long phys, unsigned long size) @@ -532,36 +580,41 @@ kernel_physical_mapping_init(unsigned lo unsigned long end, unsigned long page_size_mask) { - + int pgd_changed = 0; unsigned long next, last_map_addr = end; + unsigned long addr; start = (unsigned long)__va(start); end = (unsigned long)__va(end); - for (; start < end; start = next) { - pgd_t *pgd = pgd_offset_k(start); + for (addr = start; addr < end; addr = next) { + pgd_t *pgd = pgd_offset_k(addr); unsigned long pud_phys; pud_t *pud; - next = (start + PGDIR_SIZE) & PGDIR_MASK; + next = (addr + PGDIR_SIZE) & PGDIR_MASK; if (next > end) next = end; if (pgd_val(*pgd)) { - last_map_addr = phys_pud_update(pgd, __pa(start), + last_map_addr = phys_pud_update(pgd, __pa(addr), __pa(end), page_size_mask); continue; } pud = alloc_low_page(&pud_phys); - last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), + last_map_addr = phys_pud_init(pud, __pa(addr), __pa(next), page_size_mask); unmap_low_page(pud); spin_lock(&init_mm.page_table_lock); pgd_populate(&init_mm, pgd, __va(pud_phys)); spin_unlock(&init_mm.page_table_lock); + pgd_changed = 1; } + + if (pgd_changed) + sync_global_pgds(start, end); __flush_tlb_all(); return last_map_addr; @@ -615,6 +668,21 @@ void __init paging_init(void) */ #ifdef CONFIG_MEMORY_HOTPLUG /* + * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need + * updating. + */ +static void update_end_of_memory_vars(u64 start, u64 size) +{ + unsigned long end_pfn = PFN_UP(start + size); + + if (end_pfn > max_pfn) { + max_pfn = end_pfn; + max_low_pfn = end_pfn; + high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + } +} + +/* * Memory is added always to NORMAL zone. This means you will never get * additional DMA/DMA32 memory. */ @@ -633,6 +701,9 @@ int arch_add_memory(int nid, u64 start, ret = __add_pages(nid, zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start, size); + return ret; } EXPORT_SYMBOL_GPL(arch_add_memory); @@ -671,6 +742,10 @@ void __init mem_init(void) reservedpages = max_pfn - totalram_pages - absent_pages; after_bootmem = 1; + totalram_pages += pram_reserved_pages; + reservedpages -= pram_reserved_pages; + pram_show_banned(); + codesize = (unsigned long) &_etext - (unsigned long) &_text; datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; @@ -694,12 +769,12 @@ void __init mem_init(void) const int rodata_test_data = 0xC3; EXPORT_SYMBOL_GPL(rodata_test_data); -static int kernel_set_to_readonly; +int kernel_set_to_readonly; void set_kernel_text_rw(void) { - unsigned long start = PFN_ALIGN(_stext); - unsigned long end = PFN_ALIGN(__start_rodata); + unsigned long start = PFN_ALIGN(_text); + unsigned long end = PFN_ALIGN(__stop___ex_table); if (!kernel_set_to_readonly) return; @@ -707,13 +782,18 @@ void set_kernel_text_rw(void) pr_debug("Set kernel text: %lx - %lx for read write\n", start, end); + /* + * Make the kernel identity mapping for text RW. Kernel text + * mapping will always be RO. Refer to the comment in + * static_protections() in pageattr.c + */ set_memory_rw(start, (end - start) >> PAGE_SHIFT); } void set_kernel_text_ro(void) { - unsigned long start = PFN_ALIGN(_stext); - unsigned long end = PFN_ALIGN(__start_rodata); + unsigned long start = PFN_ALIGN(_text); + unsigned long end = PFN_ALIGN(__stop___ex_table); if (!kernel_set_to_readonly) return; @@ -721,14 +801,21 @@ void set_kernel_text_ro(void) pr_debug("Set kernel text: %lx - %lx for read only\n", start, end); + /* + * Set the kernel identity mapping for text RO. + */ set_memory_ro(start, (end - start) >> PAGE_SHIFT); } void mark_rodata_ro(void) { - unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); + unsigned long start = PFN_ALIGN(_text); unsigned long rodata_start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + unsigned long end = (unsigned long) &__end_rodata_hpage_align; + unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); + unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); + unsigned long data_start = (unsigned long) &_sdata; printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -751,6 +838,14 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: again\n"); set_memory_ro(start, (end-start) >> PAGE_SHIFT); #endif + + free_init_pages("unused kernel memory", + (unsigned long) page_address(virt_to_page(text_end)), + (unsigned long) + page_address(virt_to_page(rodata_start))); + free_init_pages("unused kernel memory", + (unsigned long) page_address(virt_to_page(rodata_end)), + (unsigned long) page_address(virt_to_page(data_start))); } #endif @@ -846,18 +941,18 @@ static struct vm_area_struct gate_vma = .vm_flags = VM_READ | VM_EXEC }; -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { #ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(tsk, TIF_IA32)) + if (!mm || test_bit(MMF_COMPAT, &mm->flags)) return NULL; #endif return &gate_vma; } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = get_gate_vma(task); + struct vm_area_struct *vma = get_gate_vma(mm); if (!vma) return 0; @@ -866,11 +961,11 @@ int in_gate_area(struct task_struct *tas } /* - * Use this when you have no reliable task/vma, typically from interrupt - * context. It is less reliable than using the task's vma and may give - * false positives: + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. */ -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); } @@ -884,6 +979,20 @@ const char *arch_vma_name(struct vm_area return NULL; } +#ifdef CONFIG_X86_UV +#define MIN_MEMORY_BLOCK_SIZE (1 << SECTION_SIZE_BITS) + +u32 memory_block_size_bytes(void) +{ + if (is_uv_system()) { + printk("UV: memory block size 2GB\n"); + return 2UL * 1024 * 1024 * 1024; + } + return MIN_MEMORY_BLOCK_SIZE; +} +#endif + + #ifdef CONFIG_SPARSEMEM_VMEMMAP /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. @@ -892,18 +1001,17 @@ static long __meminitdata addr_start, ad static void __meminitdata *p_start, *p_end; static int __meminitdata node_start; -int __meminit -vmemmap_populate(struct page *start_page, unsigned long size, int node) +static int __meminit vmemmap_populate_hugepages(unsigned long start, + unsigned long end, int node) { - unsigned long addr = (unsigned long)start_page; - unsigned long end = (unsigned long)(start_page + size); + unsigned long addr; unsigned long next; pgd_t *pgd; pud_t *pud; pmd_t *pmd; - for (; addr < end; addr = next) { - void *p = NULL; + for (addr = start; addr < end; addr = next) { + next = pmd_addr_end(addr, end); pgd = vmemmap_pgd_populate(addr, node); if (!pgd) @@ -913,31 +1021,14 @@ vmemmap_populate(struct page *start_page if (!pud) return -ENOMEM; - if (!cpu_has_pse) { - next = (addr + PAGE_SIZE) & PAGE_MASK; - pmd = vmemmap_pmd_populate(pud, addr, node); - - if (!pmd) - return -ENOMEM; - - p = vmemmap_pte_populate(pmd, addr, node); + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) { + void *p; - if (!p) - return -ENOMEM; - - addr_end = addr + PAGE_SIZE; - p_end = p + PAGE_SIZE; - } else { - next = pmd_addr_end(addr, end); - - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) { + p = vmemmap_alloc_block(PMD_SIZE, node); + if (p) { pte_t entry; - p = vmemmap_alloc_block(PMD_SIZE, node); - if (!p) - return -ENOMEM; - entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL_LARGE); set_pmd(pmd, __pmd(pte_val(entry))); @@ -954,14 +1045,37 @@ vmemmap_populate(struct page *start_page addr_end = addr + PMD_SIZE; p_end = p + PMD_SIZE; - } else - vmemmap_verify((pte_t *)pmd, node, addr, next); + continue; + } + } else if (pmd_large(*pmd)) { + vmemmap_verify((pte_t *)pmd, node, addr, next); + continue; } - + pr_warn_once("vmemmap: falling back to regular page backing\n"); + if (vmemmap_populate_basepages(addr, next, node)) + return -ENOMEM; } + return 0; } +int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) +{ + int err; + + if (cpu_has_pse) + err = vmemmap_populate_hugepages(start, end, node); + else + err = vmemmap_populate_basepages(start, end, node); + + if (!err) { + sync_global_pgds(start, end); + __flush_tlb_all(); + } + + return err; +} + void __meminit vmemmap_populate_print_last(void) { if (p_start) { diff -uprN linux-2.6.32/arch/x86/mm/init.c linux-2.6.32.ovz/arch/x86/mm/init.c --- linux-2.6.32/arch/x86/mm/init.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/init.c 2015-03-10 13:24:04.000000000 -0700 @@ -28,35 +28,49 @@ int direct_gbpages #endif ; -static void __init find_early_table_space(unsigned long end, int use_pse, - int use_gbpages) -{ - unsigned long puds, pmds, ptes, tables, start; - - puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); - - if (use_gbpages) { - unsigned long extra; - - extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); - pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; - } else - pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - - tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); +struct map_range { + unsigned long start; + unsigned long end; + unsigned page_size_mask; +}; - if (use_pse) { - unsigned long extra; +/* + * First calculate space needed for kernel direct mapping page tables to cover + * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB + * pages. Then find enough contiguous space for those page tables. + */ +static void __init find_early_table_space(struct map_range *mr, int nr_range) +{ + int i; + unsigned long puds = 0, pmds = 0, ptes = 0, tables; + unsigned long start = 0; + + for (i = 0; i < nr_range; i++) { + unsigned long range, extra; + + range = mr[i].end - mr[i].start; + puds += (range + PUD_SIZE - 1) >> PUD_SHIFT; + + if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) { + extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT); + pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT; + } else { + pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT; + } - extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); + if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) { + extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT); #ifdef CONFIG_X86_32 - extra += PMD_SIZE; + extra += PMD_SIZE; #endif - ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; - } else - ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - + ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; + } else { + ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT; + } + } + + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); #ifdef CONFIG_X86_32 @@ -84,15 +98,10 @@ static void __init find_early_table_spac e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", - end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); + mr[nr_range - 1].end, e820_table_start << PAGE_SHIFT, + e820_table_top << PAGE_SHIFT); } -struct map_range { - unsigned long start; - unsigned long end; - unsigned page_size_mask; -}; - #ifdef CONFIG_X86_32 #define NR_RANGE_MR 3 #else /* CONFIG_X86_64 */ @@ -149,6 +158,12 @@ unsigned long __init_refok init_memory_m set_nx(); if (nx_enabled) printk(KERN_INFO "NX (Execute Disable) protection: active\n"); +#ifdef CONFIG_X86_32 + else + if (exec_shield) + printk(KERN_INFO "Using x86 segment limits to approximate " + "NX protection\n"); +#endif /* Enable PSE if available */ if (cpu_has_pse) @@ -268,7 +283,7 @@ unsigned long __init_refok init_memory_m * nodes are discovered. */ if (!after_bootmem) - find_early_table_space(end, use_pse, use_gbpages); + find_early_table_space(mr, nr_range); #ifdef CONFIG_X86_32 for (i = 0; i < nr_range; i++) diff -uprN linux-2.6.32/arch/x86/mm/ioremap.c linux-2.6.32.ovz/arch/x86/mm/ioremap.c --- linux-2.6.32/arch/x86/mm/ioremap.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/ioremap.c 2015-06-23 04:16:16.000000000 -0700 @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -24,43 +25,6 @@ #include "physaddr.h" -int page_is_ram(unsigned long pagenr) -{ - resource_size_t addr, end; - int i; - - /* - * A special case is the first 4Kb of memory; - * This is a BIOS owned area, not kernel ram, but generally - * not listed as such in the E820 table. - */ - if (pagenr == 0) - return 0; - - /* - * Second special case: Some BIOSen report the PC BIOS - * area (640->1Mb) as ram even though it is not. - */ - if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) && - pagenr < (BIOS_END >> PAGE_SHIFT)) - return 0; - - for (i = 0; i < e820.nr_map; i++) { - /* - * Not usable memory: - */ - if (e820.map[i].type != E820_RAM) - continue; - addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT; - end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT; - - - if ((pagenr >= addr) && (pagenr < end)) - return 1; - } - return 0; -} - /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. @@ -99,8 +63,8 @@ int ioremap_change_attr(unsigned long va static void __iomem *__ioremap_caller(resource_size_t phys_addr, unsigned long size, unsigned long prot_val, void *caller) { - unsigned long pfn, offset, vaddr; - resource_size_t last_addr; + unsigned long offset, vaddr; + resource_size_t pfn, last_pfn, last_addr; const resource_size_t unaligned_phys_addr = phys_addr; const unsigned long unaligned_size = size; struct vm_struct *area; @@ -137,10 +101,9 @@ static void __iomem *__ioremap_caller(re /* * Don't allow anybody to remap normal RAM that we're using.. */ - for (pfn = phys_addr >> PAGE_SHIFT; - (pfn << PAGE_SHIFT) < (last_addr & PAGE_MASK); - pfn++) { + last_pfn = last_addr >> PAGE_SHIFT; + for (pfn = phys_addr >> PAGE_SHIFT; pfn <= last_pfn; pfn++) { int is_ram = page_is_ram(pfn); if (is_ram && pfn_valid(pfn) && !PageReserved(pfn_to_page(pfn))) @@ -152,7 +115,7 @@ static void __iomem *__ioremap_caller(re * Mappings have to be page-aligned */ offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; + phys_addr &= PHYSICAL_PAGE_MASK; size = PAGE_ALIGN(last_addr+1) - phys_addr; retval = reserve_memtype(phys_addr, (u64)phys_addr + size, diff -uprN linux-2.6.32/arch/x86/mm/k8topology_64.c linux-2.6.32.ovz/arch/x86/mm/k8topology_64.c --- linux-2.6.32/arch/x86/mm/k8topology_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/k8topology_64.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,222 +0,0 @@ -/* - * AMD K8 NUMA support. - * Discover the memory map and associated nodes. - * - * This version reads it directly from the K8 northbridge. - * - * Copyright 2002,2003 Andi Kleen, SuSE Labs. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -static __init int find_northbridge(void) -{ - int num; - - for (num = 0; num < 32; num++) { - u32 header; - - header = read_pci_config(0, num, 0, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1100<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1200<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1300<<16))) - continue; - - header = read_pci_config(0, num, 1, 0x00); - if (header != (PCI_VENDOR_ID_AMD | (0x1101<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1201<<16)) && - header != (PCI_VENDOR_ID_AMD | (0x1301<<16))) - continue; - return num; - } - - return -1; -} - -static __init void early_get_boot_cpu_id(void) -{ - /* - * need to get boot_cpu_id so can use that to create apicid_to_node - * in k8_scan_nodes() - */ - /* - * Find possible boot-time SMP configuration: - */ -#ifdef CONFIG_X86_MPPARSE - early_find_smp_config(); -#endif -#ifdef CONFIG_ACPI - /* - * Read APIC information from ACPI tables. - */ - early_acpi_boot_init(); -#endif -#ifdef CONFIG_X86_MPPARSE - /* - * get boot-time SMP configuration: - */ - if (smp_found_config) - early_get_smp_config(); -#endif - early_init_lapic_mapping(); -} - -int __init k8_scan_nodes(unsigned long start, unsigned long end) -{ - unsigned numnodes, cores, bits, apicid_base; - unsigned long prevbase; - struct bootnode nodes[8]; - int i, j, nb, found = 0; - u32 nodeid, reg; - - if (!early_pci_allowed()) - return -1; - - nb = find_northbridge(); - if (nb < 0) - return nb; - - printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); - - reg = read_pci_config(0, nb, 0, 0x60); - numnodes = ((reg >> 4) & 0xF) + 1; - if (numnodes <= 1) - return -1; - - printk(KERN_INFO "Number of nodes %d\n", numnodes); - - memset(&nodes, 0, sizeof(nodes)); - prevbase = 0; - for (i = 0; i < 8; i++) { - unsigned long base, limit; - - base = read_pci_config(0, nb, 1, 0x40 + i*8); - limit = read_pci_config(0, nb, 1, 0x44 + i*8); - - nodeid = limit & 7; - if ((base & 3) == 0) { - if (i < numnodes) - printk("Skipping disabled node %d\n", i); - continue; - } - if (nodeid >= numnodes) { - printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, - base, limit); - continue; - } - - if (!limit) { - printk(KERN_INFO "Skipping node entry %d (base %lx)\n", - i, base); - continue; - } - if ((base >> 8) & 3 || (limit >> 8) & 3) { - printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", - nodeid, (base>>8)&3, (limit>>8) & 3); - return -1; - } - if (node_isset(nodeid, node_possible_map)) { - printk(KERN_INFO "Node %d already present. Skipping\n", - nodeid); - continue; - } - - limit >>= 16; - limit <<= 24; - limit |= (1<<24)-1; - limit++; - - if (limit > max_pfn << PAGE_SHIFT) - limit = max_pfn << PAGE_SHIFT; - if (limit <= base) - continue; - - base >>= 16; - base <<= 24; - - if (base < start) - base = start; - if (limit > end) - limit = end; - if (limit == base) { - printk(KERN_ERR "Empty node %d\n", nodeid); - continue; - } - if (limit < base) { - printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", - nodeid, base, limit); - continue; - } - - /* Could sort here, but pun for now. Should not happen anyroads. */ - if (prevbase > base) { - printk(KERN_ERR "Node map not sorted %lx,%lx\n", - prevbase, base); - return -1; - } - - printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", - nodeid, base, limit); - - found++; - - nodes[nodeid].start = base; - nodes[nodeid].end = limit; - - prevbase = base; - - node_set(nodeid, node_possible_map); - } - - if (!found) - return -1; - - memnode_shift = compute_hash_shift(nodes, 8, NULL); - if (memnode_shift < 0) { - printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); - return -1; - } - printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); - - /* use the coreid bits from early_identify_cpu */ - bits = boot_cpu_data.x86_coreid_bits; - cores = (1< 0) { - printk(KERN_INFO "BSP APIC ID: %02x\n", - boot_cpu_physical_apicid); - apicid_base = boot_cpu_physical_apicid; - } - - for (i = 0; i < 8; i++) { - if (nodes[i].start == nodes[i].end) - continue; - - e820_register_active_regions(i, - nodes[i].start >> PAGE_SHIFT, - nodes[i].end >> PAGE_SHIFT); - for (j = apicid_base; j < cores + apicid_base; j++) - apicid_to_node[(i << bits) + j] = i; - setup_node_bootmem(i, nodes[i].start, nodes[i].end); - } - - numa_init_array(); - return 0; -} diff -uprN linux-2.6.32/arch/x86/mm/kmemcheck/error.c linux-2.6.32.ovz/arch/x86/mm/kmemcheck/error.c --- linux-2.6.32/arch/x86/mm/kmemcheck/error.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/kmemcheck/error.c 2015-03-10 13:23:22.000000000 -0700 @@ -186,7 +186,7 @@ void kmemcheck_error_save(enum kmemcheck e->trace.entries = e->trace_entries; e->trace.max_entries = ARRAY_SIZE(e->trace_entries); e->trace.skip = 0; - save_stack_trace_bp(&e->trace, regs->bp); + save_stack_trace_regs(regs, &e->trace); /* Round address down to nearest 16 bytes */ shadow_copy = kmemcheck_shadow_lookup(address diff -uprN linux-2.6.32/arch/x86/mm/Makefile linux-2.6.32.ovz/arch/x86/mm/Makefile --- linux-2.6.32/arch/x86/mm/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/Makefile 2015-03-10 13:22:22.000000000 -0700 @@ -22,7 +22,8 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mo obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o -obj-$(CONFIG_K8_NUMA) += k8topology_64.o +obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o +obj-$(CONFIG_TRACK_DIRTY_PAGES) += track.o obj-$(CONFIG_MEMTEST) += memtest.o diff -uprN linux-2.6.32/arch/x86/mm/mmap.c linux-2.6.32.ovz/arch/x86/mm/mmap.c --- linux-2.6.32/arch/x86/mm/mmap.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/mmap.c 2015-06-23 04:16:16.000000000 -0700 @@ -29,8 +29,13 @@ #include #include #include +#include #include +struct __read_mostly va_alignment va_align = { + .flags = -1, +}; + static unsigned int stack_maxrandom_size(void) { unsigned int max = 0; @@ -51,21 +56,6 @@ static unsigned int stack_maxrandom_size #define MIN_GAP (128*1024*1024UL + stack_maxrandom_size()) #define MAX_GAP (TASK_SIZE/6*5) -/* - * True on X86_32 or when emulating IA32 on X86_64 - */ -static int mmap_is_ia32(void) -{ -#ifdef CONFIG_X86_32 - return 1; -#endif -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) - return 1; -#endif - return 0; -} - static int mmap_is_legacy(void) { if (current->personality & ADDR_COMPAT_LAYOUT) @@ -87,9 +77,9 @@ static unsigned long mmap_rnd(void) */ if (current->flags & PF_RANDOMIZE) { if (mmap_is_ia32()) - rnd = (long)get_random_int() % (1<<8); + rnd = get_random_int() % (1<<8); else - rnd = (long)(get_random_int() % (1<<28)); + rnd = get_random_int() % (1<<28); } return rnd << PAGE_SHIFT; } @@ -106,6 +96,8 @@ static unsigned long mmap_base(void) return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd()); } +#define SHLIB_BASE 0x00110000 + /* * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64 * does, but not when emulating X86_32 @@ -124,13 +116,21 @@ static unsigned long mmap_legacy_base(vo */ void arch_pick_mmap_layout(struct mm_struct *mm) { - if (mmap_is_legacy()) { + mm->get_unmapped_exec_area = NULL; + if (!(2 & exec_shield) && mmap_is_legacy()) { mm->mmap_base = mmap_legacy_base(); mm->get_unmapped_area = arch_get_unmapped_area; mm->unmap_area = arch_unmap_area; } else { mm->mmap_base = mmap_base(); mm->get_unmapped_area = arch_get_unmapped_area_topdown; + if ((16 & exec_shield) && + !(current->personality & READ_IMPLIES_EXEC) + && mmap_is_ia32()) { + mm->get_unmapped_exec_area = arch_get_unmapped_exec_area; + mm->shlib_base = SHLIB_BASE + mmap_rnd(); + } mm->unmap_area = arch_unmap_area_topdown; } } +EXPORT_SYMBOL_GPL(arch_pick_mmap_layout); diff -uprN linux-2.6.32/arch/x86/mm/numa_64.c linux-2.6.32.ovz/arch/x86/mm/numa_64.c --- linux-2.6.32/arch/x86/mm/numa_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/numa_64.c 2015-03-10 13:24:13.000000000 -0700 @@ -18,7 +18,7 @@ #include #include #include -#include +#include struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -187,6 +187,7 @@ setup_node_bootmem(int nodeid, unsigned unsigned long bootmap_start, nodedata_phys; void *bootmap; int nid; + unsigned long cache_alias_offset; if (!end) return; @@ -198,17 +199,22 @@ setup_node_bootmem(int nodeid, unsigned if (end && (end - start) < NODE_MIN_SIZE) return; - start = roundup(start, ZONE_ALIGN); - printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end); start_pfn = start >> PAGE_SHIFT; last_pfn = end >> PAGE_SHIFT; - node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size, + /* + * Allocate an extra cacheline per node to reduce cacheline + * aliasing when scanning all node's node_data. + */ + cache_alias_offset = nodeid * SMP_CACHE_BYTES; + node_data[nodeid] = cache_alias_offset + + early_node_mem(nodeid, start, end, + pgdat_size + cache_alias_offset, SMP_CACHE_BYTES); - if (node_data[nodeid] == NULL) + if (node_data[nodeid] == (void *)cache_alias_offset) return; nodedata_phys = __pa(node_data[nodeid]); printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, @@ -216,8 +222,6 @@ setup_node_bootmem(int nodeid, unsigned memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; - NODE_DATA(nodeid)->node_start_pfn = start_pfn; - NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; /* * Find a place for the bootmem map @@ -546,8 +550,8 @@ void __init initmem_init(unsigned long s nodes_clear(node_online_map); #endif -#ifdef CONFIG_K8_NUMA - if (!numa_off && !k8_scan_nodes(start_pfn<> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; +#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) + /* + * Once the kernel maps the text as RO (kernel_set_to_readonly is set), + * kernel text mappings for the large page aligned text, rodata sections + * will be always read-only. For the kernel identity mappings covering + * the holes caused by this alignment can be anything that user asks. + + * + * This will preserve the large page mappings for kernel text/data + * at no extra cost. + */ + if (kernel_set_to_readonly && + within(address, (unsigned long)_text, + (unsigned long)__end_rodata_hpage_align)) { + unsigned int level; + + /* + * Don't enforce the !RW mapping for the kernel text mapping, + * if the current mapping is already using small page mapping. + * No need to work hard to preserve large page mappings in this + * case. + * + * This also fixes the Linux Xen paravirt guest boot failure + * (because of unexpected read-only mappings for kernel identity + * mappings). In this paravirt guest case, the kernel text + * mapping and the kernel identity mapping share the same + * page-table pages. Thus we can't really use different + * protections for the kernel text and identity mappings. Also, + * these shared mappings are made of small page mappings. + * Thus this don't enforce !RW mapping for small page kernel + * text mapping logic will help Linux Xen parvirt guest boot + * aswell. + */ + if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) + pgprot_val(forbidden) |= _PAGE_RW; + } +#endif + prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); return prot; @@ -354,7 +390,7 @@ static int try_preserve_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { - unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; + unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn; pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot; int i, do_split = 1; @@ -363,7 +399,7 @@ try_preserve_large_page(pte_t *kpte, uns if (cpa->force_split) return 1; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up already: @@ -407,6 +443,19 @@ try_preserve_large_page(pte_t *kpte, uns pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); /* + * Set the PSE and GLOBAL flags only if the PRESENT flag is + * set otherwise pmd_present/pmd_huge will return true even on + * a non present pmd. The canon_pgprot will clear _PAGE_GLOBAL + * for the ancient hardware that doesn't support it. + */ + if (pgprot_val(new_prot) & _PAGE_PRESENT) + pgprot_val(new_prot) |= _PAGE_PSE | _PAGE_GLOBAL; + else + pgprot_val(new_prot) &= ~(_PAGE_PSE | _PAGE_GLOBAL); + + new_prot = canon_pgprot(new_prot); + + /* * old_pte points to the large page base address. So we need * to add the offset of the virtual address: */ @@ -451,21 +500,21 @@ try_preserve_large_page(pte_t *kpte, uns * The address is aligned and the number of pages * covers the full page. */ - new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); + new_pte = pfn_pte(pte_pfn(old_pte), new_prot); __set_pmd_pte(kpte, address, new_pte); cpa->flags |= CPA_FLUSHTLB; do_split = 0; } out_unlock: - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); return do_split; } static int split_large_page(pte_t *kpte, unsigned long address) { - unsigned long flags, pfn, pfninc = 1; + unsigned long pfn, pfninc = 1; unsigned int i, level; pte_t *pbase, *tmp; pgprot_t ref_prot; @@ -479,7 +528,7 @@ static int split_large_page(pte_t *kpte, if (!base) return -ENOMEM; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); /* * Check for races, another CPU might have split this page * up for us already: @@ -502,16 +551,35 @@ static int split_large_page(pte_t *kpte, #ifdef CONFIG_X86_64 if (level == PG_LEVEL_1G) { pfninc = PMD_PAGE_SIZE >> PAGE_SHIFT; - pgprot_val(ref_prot) |= _PAGE_PSE; + /* + * Set the PSE flags only if the PRESENT flag is set + * otherwise pmd_present/pmd_huge will return true + * even on a non present pmd. + */ + if (pgprot_val(ref_prot) & _PAGE_PRESENT) + pgprot_val(ref_prot) |= _PAGE_PSE; + else + pgprot_val(ref_prot) &= ~_PAGE_PSE; } #endif /* + * Set the GLOBAL flags only if the PRESENT flag is set + * otherwise pmd/pte_present will return true even on a non + * present pmd/pte. The canon_pgprot will clear _PAGE_GLOBAL + * for the ancient hardware that doesn't support it. + */ + if (pgprot_val(ref_prot) & _PAGE_PRESENT) + pgprot_val(ref_prot) |= _PAGE_GLOBAL; + else + pgprot_val(ref_prot) &= ~_PAGE_GLOBAL; + + /* * Get the target pfn from the original entry: */ pfn = pte_pfn(*kpte); for (i = 0; i < PTRS_PER_PTE; i++, pfn += pfninc) - set_pte(&pbase[i], pfn_pte(pfn, ref_prot)); + set_pte(&pbase[i], pfn_pte(pfn, canon_pgprot(ref_prot))); if (address >= (unsigned long)__va(0) && address < (unsigned long)__va(max_low_pfn_mapped << PAGE_SHIFT)) @@ -551,7 +619,7 @@ out_unlock: */ if (base) __free_page(base); - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); return 0; } @@ -622,6 +690,18 @@ repeat: new_prot = static_protections(new_prot, address, pfn); /* + * Set the GLOBAL flags only if the PRESENT flag is + * set otherwise pte_present will return true even on + * a non present pte. The canon_pgprot will clear + * _PAGE_GLOBAL for the ancient hardware that doesn't + * support it. + */ + if (pgprot_val(new_prot) & _PAGE_PRESENT) + pgprot_val(new_prot) |= _PAGE_GLOBAL; + else + pgprot_val(new_prot) &= ~_PAGE_GLOBAL; + + /* * We need to keep the pfn from the existing PTE, * after all we're only going to change it's attributes * not the memory it points to @@ -960,7 +1040,8 @@ out_err: } EXPORT_SYMBOL(set_memory_uc); -int set_memory_array_uc(unsigned long *addr, int addrinarray) +int _set_memory_array(unsigned long *addr, int addrinarray, + unsigned long new_type) { int i, j; int ret; @@ -970,13 +1051,19 @@ int set_memory_array_uc(unsigned long *a */ for (i = 0; i < addrinarray; i++) { ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, - _PAGE_CACHE_UC_MINUS, NULL); + new_type, NULL); if (ret) goto out_free; } ret = change_page_attr_set(addr, addrinarray, __pgprot(_PAGE_CACHE_UC_MINUS), 1); + + if (!ret && new_type == _PAGE_CACHE_WC) + ret = change_page_attr_set_clr(addr, addrinarray, + __pgprot(_PAGE_CACHE_WC), + __pgprot(_PAGE_CACHE_MASK), + 0, CPA_ARRAY, NULL); if (ret) goto out_free; @@ -988,8 +1075,19 @@ out_free: return ret; } + +int set_memory_array_uc(unsigned long *addr, int addrinarray) +{ + return _set_memory_array(addr, addrinarray, _PAGE_CACHE_UC_MINUS); +} EXPORT_SYMBOL(set_memory_array_uc); +int set_memory_array_wc(unsigned long *addr, int addrinarray) +{ + return _set_memory_array(addr, addrinarray, _PAGE_CACHE_WC); +} +EXPORT_SYMBOL(set_memory_array_wc); + int _set_memory_wc(unsigned long addr, int numpages) { int ret; @@ -1110,26 +1208,34 @@ int set_pages_uc(struct page *page, int } EXPORT_SYMBOL(set_pages_uc); -int set_pages_array_uc(struct page **pages, int addrinarray) +static int _set_pages_array(struct page **pages, int addrinarray, + unsigned long new_type) { unsigned long start; unsigned long end; int i; int free_idx; + int ret; for (i = 0; i < addrinarray; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; end = start + PAGE_SIZE; - if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) + if (reserve_memtype(start, end, new_type, NULL)) goto err_out; } - if (cpa_set_pages_array(pages, addrinarray, - __pgprot(_PAGE_CACHE_UC_MINUS)) == 0) { - return 0; /* Success */ - } + ret = cpa_set_pages_array(pages, addrinarray, + __pgprot(_PAGE_CACHE_UC_MINUS)); + if (!ret && new_type == _PAGE_CACHE_WC) + ret = change_page_attr_set_clr(NULL, addrinarray, + __pgprot(_PAGE_CACHE_WC), + __pgprot(_PAGE_CACHE_MASK), + 0, CPA_PAGES_ARRAY, pages); + if (ret) + goto err_out; + return 0; /* Success */ err_out: free_idx = i; for (i = 0; i < free_idx; i++) { @@ -1141,8 +1247,19 @@ err_out: } return -EINVAL; } + +int set_pages_array_uc(struct page **pages, int addrinarray) +{ + return _set_pages_array(pages, addrinarray, _PAGE_CACHE_UC_MINUS); +} EXPORT_SYMBOL(set_pages_array_uc); +int set_pages_array_wc(struct page **pages, int addrinarray) +{ + return _set_pages_array(pages, addrinarray, _PAGE_CACHE_WC); +} +EXPORT_SYMBOL(set_pages_array_wc); + int set_pages_wb(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); diff -uprN linux-2.6.32/arch/x86/mm/pat.c linux-2.6.32.ovz/arch/x86/mm/pat.c --- linux-2.6.32/arch/x86/mm/pat.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/pat.c 2015-03-10 13:21:40.000000000 -0700 @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -263,7 +264,7 @@ chk_conflict(struct memtype *new, struct return -EBUSY; } -static int pat_pagerange_is_ram(unsigned long start, unsigned long end) +static int pat_pagerange_is_ram(resource_size_t start, resource_size_t end) { int ram_page = 0, not_rampage = 0; unsigned long page_nr; @@ -295,8 +296,6 @@ static int pat_pagerange_is_ram(unsigned * Here we do two pass: * - Find the memtype of all the pages in the range, look for any conflicts * - In case of no conflicts, set the new memtype for pages in the range - * - * Caller must hold memtype_lock for atomicity. */ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, unsigned long *new_type) @@ -388,7 +387,7 @@ int reserve_memtype(u64 start, u64 end, } /* Low ISA region is always mapped WB in page table. No need to track */ - if (is_ISA_range(start, end - 1)) { + if (x86_platform.is_untracked_pat_range(start, end)) { if (new_type) *new_type = _PAGE_CACHE_WB; return 0; @@ -408,9 +407,7 @@ int reserve_memtype(u64 start, u64 end, is_range_ram = pat_pagerange_is_ram(start, end); if (is_range_ram == 1) { - spin_lock(&memtype_lock); err = reserve_ram_pages_type(start, end, req_type, new_type); - spin_unlock(&memtype_lock); return err; } else if (is_range_ram < 0) { @@ -499,15 +496,13 @@ int free_memtype(u64 start, u64 end) return 0; /* Low ISA region is always mapped WB. No need to track */ - if (is_ISA_range(start, end - 1)) + if (x86_platform.is_untracked_pat_range(start, end)) return 0; is_range_ram = pat_pagerange_is_ram(start, end); if (is_range_ram == 1) { - spin_lock(&memtype_lock); err = free_ram_pages_type(start, end); - spin_unlock(&memtype_lock); return err; } else if (is_range_ram < 0) { @@ -582,15 +577,13 @@ static unsigned long lookup_memtype(u64 int rettype = _PAGE_CACHE_WB; struct memtype *entry; - if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) + if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE)) return rettype; if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { struct page *page; - spin_lock(&memtype_lock); page = pfn_to_page(paddr >> PAGE_SHIFT); rettype = get_page_memtype(page); - spin_unlock(&memtype_lock); /* * -1 from get_page_memtype() implies RAM page is in its * default state and not reserved, and hence of type WB diff -uprN linux-2.6.32/arch/x86/mm/pgtable.c linux-2.6.32.ovz/arch/x86/mm/pgtable.c --- linux-2.6.32/arch/x86/mm/pgtable.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/pgtable.c 2015-06-23 04:16:16.000000000 -0700 @@ -4,27 +4,51 @@ #include #include +#include + #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO +#define PGALLOC_KERN_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO + +#ifdef CONFIG_HIGHPTE +#define PGALLOC_USER_GFP __GFP_HIGHMEM +#else +#define PGALLOC_USER_GFP 0 +#endif + +gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP; pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)__get_free_page(PGALLOC_GFP); + return (pte_t *)__get_free_page(PGALLOC_KERN_GFP); } pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { struct page *pte; -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0); -#else - pte = alloc_pages(PGALLOC_GFP, 0); -#endif + pte = alloc_pages(__userpte_alloc_gfp, 0); if (pte) pgtable_page_ctor(pte); return pte; } +static int __init setup_userpte(char *arg) +{ + if (!arg) + return -EINVAL; + + /* + * "userpte=nohigh" disables allocation of user pagetables in + * high memory. + */ + if (strcmp(arg, "nohigh") == 0) + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + else + return -EINVAL; + return 0; +} +early_param("userpte", setup_userpte); + void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); @@ -65,7 +89,19 @@ static inline void pgd_list_del(pgd_t *p #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) -static void pgd_ctor(pgd_t *pgd) + +static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) +{ + BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); + virt_to_page(pgd)->index = (pgoff_t)mm; +} + +struct mm_struct *pgd_page_get_mm(struct page *page) +{ + return (struct mm_struct *)page->index; +} + +static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) { /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the @@ -83,20 +119,20 @@ static void pgd_ctor(pgd_t *pgd) } /* list required to sync kernel mapping updates */ - if (!SHARED_KERNEL_PMD) + if (!SHARED_KERNEL_PMD) { + pgd_set_mm(pgd, mm); pgd_list_add(pgd); + } } static void pgd_dtor(pgd_t *pgd) { - unsigned long flags; /* can be called from interrupt context */ - if (SHARED_KERNEL_PMD) return; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); pgd_list_del(pgd); - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } /* @@ -138,8 +174,7 @@ void pud_populate(struct mm_struct *mm, * section 8.1: in PAE mode we explicitly have to flush the * TLB via cr3 if the top-level pgd is changed... */ - if (mm == current->active_mm) - write_cr3(read_cr3()); + flush_tlb_mm(mm); } #else /* !CONFIG_X86_PAE */ @@ -197,6 +232,7 @@ static void pgd_mop_up_pmds(struct mm_st paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); + mm->nr_ptds--; } } } @@ -221,6 +257,8 @@ static void pgd_prepopulate_pmd(struct m sizeof(pmd_t) * PTRS_PER_PMD); pud_populate(mm, pud, pmd); + ub_page_table_charge(mm, 0); + mm->nr_ptds++; } } @@ -228,7 +266,9 @@ pgd_t *pgd_alloc(struct mm_struct *mm) { pgd_t *pgd; pmd_t *pmds[PREALLOCATED_PMDS]; - unsigned long flags; + + if (ub_page_table_precharge(mm, 1 + PREALLOCATED_PMDS)) + return NULL; pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); @@ -248,12 +288,15 @@ pgd_t *pgd_alloc(struct mm_struct *mm) * respect to anything walking the pgd_list, so that they * never see a partially populated pgd. */ - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); - pgd_ctor(pgd); + pgd_ctor(mm, pgd); pgd_prepopulate_pmd(mm, pgd, pmds); - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); + + ub_page_table_charge(mm, 0); + mm->nr_ptds++; return pgd; @@ -262,6 +305,7 @@ out_free_pmds: out_free_pgd: free_page((unsigned long)pgd); out: + ub_page_table_commit(mm); return NULL; } @@ -271,6 +315,7 @@ void pgd_free(struct mm_struct *mm, pgd_ pgd_dtor(pgd); paravirt_pgd_free(mm, pgd); free_page((unsigned long)pgd); + mm->nr_ptds--; } int ptep_set_access_flags(struct vm_area_struct *vma, @@ -288,6 +333,25 @@ int ptep_set_access_flags(struct vm_area return changed; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + int changed = !pmd_same(*pmdp, entry); + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + if (changed && dirty) { + *pmdp = entry; + pmd_update_defer(vma->vm_mm, address, pmdp); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } + + return changed; +} +#endif + int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -302,6 +366,24 @@ int ptep_test_and_clear_young(struct vm_ return ret; } +EXPORT_SYMBOL(ptep_test_and_clear_young); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + int ret = 0; + + if (pmd_young(*pmdp)) + ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, + (unsigned long *) &pmdp->pmd); + + if (ret) + pmd_update(vma->vm_mm, addr, pmdp); + + return ret; +} +#endif int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) @@ -315,6 +397,36 @@ int ptep_clear_flush_young(struct vm_are return young; } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int young; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + young = pmdp_test_and_clear_young(vma, address, pmdp); + if (young) + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + + return young; +} + +void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int set; + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + set = !test_and_set_bit(_PAGE_BIT_SPLITTING, + (unsigned long *)&pmdp->pmd); + if (set) { + pmd_update(vma->vm_mm, address, pmdp); + /* need tlb flush only to serialize against gup-fast */ + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } +} +#endif + /** * reserve_top_address - reserves a hole in the top of kernel address space * @reserve - size of hole to reserve diff -uprN linux-2.6.32/arch/x86/mm/setup_nx.c linux-2.6.32.ovz/arch/x86/mm/setup_nx.c --- linux-2.6.32/arch/x86/mm/setup_nx.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/setup_nx.c 2015-03-10 13:20:57.000000000 -0700 @@ -1,3 +1,4 @@ +#include #include #include #include @@ -27,6 +28,9 @@ static int __init noexec_setup(char *str } else if (!strncmp(str, "off", 3)) { disable_nx = 1; __supported_pte_mask &= ~_PAGE_NX; +#ifdef CONFIG_X86_32 + exec_shield = 0; +#endif } return 0; } diff -uprN linux-2.6.32/arch/x86/mm/srat_32.c linux-2.6.32.ovz/arch/x86/mm/srat_32.c --- linux-2.6.32/arch/x86/mm/srat_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/srat_32.c 2015-03-10 13:22:27.000000000 -0700 @@ -91,6 +91,7 @@ acpi_numa_processor_affinity_init(struct /* mark this node as "seen" in node bitmap */ BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo); + /* don't need to check apic_id here, because it is always 8 bits */ apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo; printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n", @@ -267,6 +268,8 @@ int __init get_memcfg_from_srat(void) e820_register_active_regions(chunk->nid, chunk->start_pfn, min(chunk->end_pfn, max_pfn)); } + /* for out of order entries in SRAT */ + sort_node_map(); for_each_online_node(nid) { unsigned long start = node_start_pfn[nid]; diff -uprN linux-2.6.32/arch/x86/mm/srat_64.c linux-2.6.32.ovz/arch/x86/mm/srat_64.c --- linux-2.6.32/arch/x86/mm/srat_64.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/srat_64.c 2015-03-10 13:23:54.000000000 -0700 @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -133,6 +134,10 @@ acpi_numa_x2apic_affinity_init(struct ac } apic_id = pa->apic_id; + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } apicid_to_node[apic_id] = node; node_set(node, cpu_nodes_parsed); acpi_numa = 1; @@ -167,6 +172,12 @@ acpi_numa_processor_affinity_init(struct apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; else apic_id = pa->apic_id; + + if (apic_id >= MAX_LOCAL_APIC) { + printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); + return; + } + apicid_to_node[apic_id] = node; node_set(node, cpu_nodes_parsed); acpi_numa = 1; @@ -229,9 +240,11 @@ update_nodes_add(int node, unsigned long printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); } - if (changed) + if (changed) { + node_set(node, cpu_nodes_parsed); printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); + } } /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ @@ -242,6 +255,7 @@ acpi_numa_memory_affinity_init(struct ac unsigned long start, end; int node, pxm; int i; + const char *dmi_product_name; if (srat_disabled()) return; @@ -293,7 +307,10 @@ acpi_numa_memory_affinity_init(struct ac e820_register_active_regions(node, start >> PAGE_SHIFT, end >> PAGE_SHIFT); - if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { + dmi_product_name = dmi_get_system_info(DMI_PRODUCT_NAME); + if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE && + (!dmi_match(DMI_SYS_VENDOR, "FUJITSU") || + !strstr(dmi_product_name, "PRIMEQUEST"))) { update_nodes_add(node, start, end); /* restore nodes[node] */ *nd = oldnode; @@ -319,7 +336,7 @@ static int __init nodes_cover_memory(con unsigned long s = nodes[i].start >> PAGE_SHIFT; unsigned long e = nodes[i].end >> PAGE_SHIFT; pxmram += e - s; - pxmram -= absent_pages_in_range(s, e); + pxmram -= __absent_pages_in_range(i, s, e); if ((long)pxmram < 0) pxmram = 0; } @@ -350,6 +367,8 @@ int __init acpi_scan_nodes(unsigned long for (i = 0; i < MAX_NUMNODES; i++) cutoff_node(i, start, end); + /* for out of order entries in SRAT */ + sort_node_map(); if (!nodes_cover_memory(nodes)) { bad_srat(); return -1; diff -uprN linux-2.6.32/arch/x86/mm/tlb.c linux-2.6.32.ovz/arch/x86/mm/tlb.c --- linux-2.6.32/arch/x86/mm/tlb.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/tlb.c 2015-06-23 04:16:16.000000000 -0700 @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -130,6 +131,12 @@ void smp_invalidate_interrupt(struct pt_ union smp_flush_state *f; cpu = smp_processor_id(); + +#ifdef CONFIG_X86_32 + if (current->active_mm) + load_user_cs_desc(cpu, current->active_mm); +#endif + /* * orig_rax contains the negated interrupt vector. * Use that to determine where the sender put the data. @@ -255,6 +262,7 @@ void flush_tlb_mm(struct mm_struct *mm) preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_mm); void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) { @@ -274,6 +282,7 @@ void flush_tlb_page(struct vm_area_struc preempt_enable(); } +EXPORT_SYMBOL(flush_tlb_page); static void do_flush_tlb_all(void *info) { diff -uprN linux-2.6.32/arch/x86/mm/track.c linux-2.6.32.ovz/arch/x86/mm/track.c --- linux-2.6.32/arch/x86/mm/track.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/mm/track.c 2015-03-10 13:21:12.000000000 -0700 @@ -0,0 +1,160 @@ +/* + * Low-level routines for marking dirty pages of a running system in a + * bitmap. Allows memory mirror or migration strategies to be implemented. + * + * Copyright (C) 2006, 2010 Stratus Technologies Bermuda Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * For memory-tracking purposes, see mm_track.h for details. + */ +struct mm_tracker mm_tracking_struct = {0, ATOMIC_INIT(0), 0, 0}; +EXPORT_SYMBOL_GPL(mm_tracking_struct); + +void do_mm_track_pte(void *val) +{ + pte_t *ptep = (pte_t *)val; + unsigned long pfn; + + if (!pte_present(*ptep)) + return; + + if (!(pte_val(*ptep) & _PAGE_DIRTY)) + return; + + pfn = pte_pfn(*ptep); + + if (pfn >= mm_tracking_struct.bitcnt) + return; + + if (!test_and_set_bit(pfn, mm_tracking_struct.vector)) + atomic_inc(&mm_tracking_struct.count); +} + +#define LARGE_PMD_SIZE (1 << PMD_SHIFT) + +void do_mm_track_pmd(void *val) +{ + int i; + pte_t *pte; + pmd_t *pmd = (pmd_t *)val; + + if (!pmd_present(*pmd)) + return; + + if (unlikely(pmd_large(*pmd))) { + unsigned long addr, end; + + if (!(pte_val(*(pte_t *)val) & _PAGE_DIRTY)) + return; + + addr = pte_pfn(*(pte_t *)val) << PAGE_SHIFT; + end = addr + LARGE_PMD_SIZE; + + while (addr < end) { + do_mm_track_phys((void *)addr); + addr += PAGE_SIZE; + } + return; + } + + pte = pte_offset_kernel(pmd, 0); + + for (i = 0; i < PTRS_PER_PTE; i++, pte++) + do_mm_track_pte(pte); +} + +static inline void track_as_pte(void *val) +{ + unsigned long pfn = pte_pfn(*(pte_t *)val); + if (pfn >= mm_tracking_struct.bitcnt) + return; + + if (!test_and_set_bit(pfn, mm_tracking_struct.vector)) + atomic_inc(&mm_tracking_struct.count); +} + +void do_mm_track_pud(void *val) +{ + track_as_pte(val); +} + +void do_mm_track_pgd(void *val) +{ + track_as_pte(val); +} + +void do_mm_track_phys(void *val) +{ + unsigned long pfn; + + pfn = (unsigned long)val >> PAGE_SHIFT; + + if (pfn >= mm_tracking_struct.bitcnt) + return; + + if (!test_and_set_bit(pfn, mm_tracking_struct.vector)) + atomic_inc(&mm_tracking_struct.count); +} +EXPORT_SYMBOL_GPL(do_mm_track_phys); + + +/* + * Allocate enough space for the bit vector in the + * mm_tracking_struct. + */ +int mm_track_init(long num_pages) +{ + mm_tracking_struct.vector = vmalloc((num_pages + 7)/8); + if (mm_tracking_struct.vector == NULL) { + printk(KERN_WARNING + "%s: failed to allocate bit vector\n", __func__); + return -ENOMEM; + } + + mm_tracking_struct.bitcnt = num_pages; + + return 0; +} +EXPORT_SYMBOL_GPL(mm_track_init); + +/* + * Turn off tracking, free the bit vector memory. This function should + * ONLY be called with interrupts disabled and all other CPUs quiesced + */ +void mm_track_exit(void) +{ + /* + * Inhibit the use of the tracking functions. + * This should have already been done, but just in case. + */ + mm_tracking_struct.active = 0; + mm_tracking_struct.bitcnt = 0; + + if (mm_tracking_struct.vector != NULL) + vfree(mm_tracking_struct.vector); +} +EXPORT_SYMBOL_GPL(mm_track_exit); + diff -uprN linux-2.6.32/arch/x86/oprofile/backtrace.c linux-2.6.32.ovz/arch/x86/oprofile/backtrace.c --- linux-2.6.32/arch/x86/oprofile/backtrace.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/oprofile/backtrace.c 2015-03-10 13:22:06.000000000 -0700 @@ -14,6 +14,7 @@ #include #include #include +#include static void backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) @@ -41,20 +42,19 @@ static void backtrace_address(void *data } static struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, - .stack = backtrace_stack, - .address = backtrace_address, + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, + .address = backtrace_address, + .walk_stack = print_context_stack, }; -struct frame_head { - struct frame_head *bp; - unsigned long ret; -} __attribute__((packed)); - -static struct frame_head *dump_user_backtrace(struct frame_head *head) +#ifdef CONFIG_COMPAT +static struct stack_frame_ia32 * +dump_user_backtrace_32(struct stack_frame_ia32 *head) { - struct frame_head bufhead[2]; + struct stack_frame_ia32 bufhead[2]; + struct stack_frame_ia32 *fp; /* Also check accessibility of one struct frame_head beyond */ if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) @@ -62,29 +62,78 @@ static struct frame_head *dump_user_back if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) return NULL; - oprofile_add_trace(bufhead[0].ret); + fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame); + + oprofile_add_trace(bufhead[0].return_address); + + /* frame pointers should strictly progress back up the stack + * (towards higher addresses) */ + if (head >= fp) + return NULL; + + return fp; +} + +static inline int +x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) +{ + struct stack_frame_ia32 *head; + + /* User process is 32-bit */ + if (!current || !test_thread_flag(TIF_IA32)) + return 0; + + head = (struct stack_frame_ia32 *) regs->bp; + while (depth-- && head) + head = dump_user_backtrace_32(head); + + return 1; +} + +#else +static inline int +x86_backtrace_32(struct pt_regs * const regs, unsigned int depth) +{ + return 0; +} +#endif /* CONFIG_COMPAT */ + +static struct stack_frame *dump_user_backtrace(struct stack_frame *head) +{ + struct stack_frame bufhead[2]; + + /* Also check accessibility of one struct stack_frame beyond */ + if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) + return NULL; + if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) + return NULL; + + oprofile_add_trace(bufhead[0].return_address); /* frame pointers should strictly progress back up the stack * (towards higher addresses) */ - if (head >= bufhead[0].bp) + if (head >= bufhead[0].next_frame) return NULL; - return bufhead[0].bp; + return bufhead[0].next_frame; } void x86_backtrace(struct pt_regs * const regs, unsigned int depth) { - struct frame_head *head = (struct frame_head *)frame_pointer(regs); + struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); if (!user_mode_vm(regs)) { unsigned long stack = kernel_stack_pointer(regs); if (depth) - dump_trace(NULL, regs, (unsigned long *)stack, 0, + dump_trace(NULL, regs, (unsigned long *)stack, &backtrace_ops, &depth); return; } + if (x86_backtrace_32(regs, depth)) + return; + while (depth-- && head) head = dump_user_backtrace(head); } diff -uprN linux-2.6.32/arch/x86/oprofile/init.c linux-2.6.32.ovz/arch/x86/oprofile/init.c --- linux-2.6.32/arch/x86/oprofile/init.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/oprofile/init.c 2015-03-10 13:23:54.000000000 -0700 @@ -21,6 +21,7 @@ extern int op_nmi_timer_init(struct opro extern void op_nmi_exit(void); extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); +static int nmi_timer; int __init oprofile_arch_init(struct oprofile_operations *ops) { @@ -31,8 +32,9 @@ int __init oprofile_arch_init(struct opr #ifdef CONFIG_X86_LOCAL_APIC ret = op_nmi_init(ops); #endif + nmi_timer = (ret != 0); #ifdef CONFIG_X86_IO_APIC - if (ret < 0) + if (nmi_timer) ret = op_nmi_timer_init(ops); #endif ops->backtrace = x86_backtrace; @@ -44,6 +46,7 @@ int __init oprofile_arch_init(struct opr void oprofile_arch_exit(void) { #ifdef CONFIG_X86_LOCAL_APIC - op_nmi_exit(); + if (!nmi_timer) + op_nmi_exit(); #endif } diff -uprN linux-2.6.32/arch/x86/oprofile/nmi_int.c linux-2.6.32.ovz/arch/x86/oprofile/nmi_int.c --- linux-2.6.32/arch/x86/oprofile/nmi_int.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/oprofile/nmi_int.c 2015-03-10 13:22:31.000000000 -0700 @@ -31,8 +31,9 @@ static struct op_x86_model_spec *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); -/* 0 == registered but off, 1 == registered and on */ -static int nmi_enabled = 0; +/* must be protected with get_online_cpus()/put_online_cpus(): */ +static int nmi_enabled; +static int ctr_running; struct op_counter_config counter_config[OP_MAX_COUNTER]; @@ -48,6 +49,10 @@ u64 op_x86_get_ctrl(struct op_x86_model_ val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0; val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0; val |= (counter_config->unit_mask & 0xFF) << 8; + counter_config->extra &= (ARCH_PERFMON_EVENTSEL_INV | + ARCH_PERFMON_EVENTSEL_EDGE | + ARCH_PERFMON_EVENTSEL_CMASK); + val |= counter_config->extra; event &= model->event_mask ? model->event_mask : 0xFF; val |= event & 0xFF; val |= (event & 0x0F00) << 24; @@ -61,12 +66,16 @@ static int profile_exceptions_notify(str { struct die_args *args = (struct die_args *)data; int ret = NOTIFY_DONE; - int cpu = smp_processor_id(); switch (val) { case DIE_NMI: case DIE_NMI_IPI: - model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)); + if (ctr_running) + model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs)); + else if (!nmi_enabled) + break; + else + model->stop(&__get_cpu_var(cpu_msrs)); ret = NOTIFY_STOP; break; default: @@ -95,24 +104,36 @@ static void nmi_cpu_save_registers(struc static void nmi_cpu_start(void *dummy) { struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); - model->start(msrs); + if (!msrs->controls) + WARN_ON_ONCE(1); + else + model->start(msrs); } static int nmi_start(void) { + get_online_cpus(); on_each_cpu(nmi_cpu_start, NULL, 1); + ctr_running = 1; + put_online_cpus(); return 0; } static void nmi_cpu_stop(void *dummy) { struct op_msrs const *msrs = &__get_cpu_var(cpu_msrs); - model->stop(msrs); + if (!msrs->controls) + WARN_ON_ONCE(1); + else + model->stop(msrs); } static void nmi_stop(void) { + get_online_cpus(); on_each_cpu(nmi_cpu_stop, NULL, 1); + ctr_running = 0; + put_online_cpus(); } #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX @@ -159,7 +180,7 @@ static int nmi_setup_mux(void) for_each_possible_cpu(i) { per_cpu(cpu_msrs, i).multiplex = - kmalloc(multiplex_size, GFP_KERNEL); + kzalloc(multiplex_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).multiplex) return 0; } @@ -179,7 +200,6 @@ static void nmi_cpu_setup_mux(int cpu, s if (counter_config[i].enabled) { multiplex[i].saved = -(u64)counter_config[i].count; } else { - multiplex[i].addr = 0; multiplex[i].saved = 0; } } @@ -189,25 +209,27 @@ static void nmi_cpu_setup_mux(int cpu, s static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) { + struct op_msr *counters = msrs->counters; struct op_msr *multiplex = msrs->multiplex; int i; for (i = 0; i < model->num_counters; ++i) { int virt = op_x86_phys_to_virt(i); - if (multiplex[virt].addr) - rdmsrl(multiplex[virt].addr, multiplex[virt].saved); + if (counters[i].addr) + rdmsrl(counters[i].addr, multiplex[virt].saved); } } static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) { + struct op_msr *counters = msrs->counters; struct op_msr *multiplex = msrs->multiplex; int i; for (i = 0; i < model->num_counters; ++i) { int virt = op_x86_phys_to_virt(i); - if (multiplex[virt].addr) - wrmsrl(multiplex[virt].addr, multiplex[virt].saved); + if (counters[i].addr) + wrmsrl(counters[i].addr, multiplex[virt].saved); } } @@ -222,7 +244,7 @@ static void nmi_cpu_switch(void *dummy) /* move to next set */ si += model->num_counters; - if ((si > model->num_virt_counters) || (counter_config[si].count == 0)) + if ((si >= model->num_virt_counters) || (counter_config[si].count == 0)) per_cpu(switch_index, cpu) = 0; else per_cpu(switch_index, cpu) = si; @@ -251,7 +273,10 @@ static int nmi_switch_event(void) if (nmi_multiplex_on() < 0) return -EINVAL; /* not necessary */ - on_each_cpu(nmi_cpu_switch, NULL, 1); + get_online_cpus(); + if (ctr_running) + on_each_cpu(nmi_cpu_switch, NULL, 1); + put_online_cpus(); return 0; } @@ -294,6 +319,7 @@ static void free_msrs(void) kfree(per_cpu(cpu_msrs, i).controls); per_cpu(cpu_msrs, i).controls = NULL; } + nmi_shutdown_mux(); } static int allocate_msrs(void) @@ -303,17 +329,24 @@ static int allocate_msrs(void) int i; for_each_possible_cpu(i) { - per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, + per_cpu(cpu_msrs, i).counters = kzalloc(counters_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).counters) - return 0; - per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, + goto fail; + per_cpu(cpu_msrs, i).controls = kzalloc(controls_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).controls) - return 0; + goto fail; } + if (!nmi_setup_mux()) + goto fail; + return 1; + +fail: + free_msrs(); + return 0; } static void nmi_cpu_setup(void *dummy) @@ -332,52 +365,9 @@ static void nmi_cpu_setup(void *dummy) static struct notifier_block profile_exceptions_nb = { .notifier_call = profile_exceptions_notify, .next = NULL, - .priority = 2 + .priority = NMI_LOCAL_LOW_PRIOR, }; -static int nmi_setup(void) -{ - int err = 0; - int cpu; - - if (!allocate_msrs()) - err = -ENOMEM; - else if (!nmi_setup_mux()) - err = -ENOMEM; - else - err = register_die_notifier(&profile_exceptions_nb); - - if (err) { - free_msrs(); - nmi_shutdown_mux(); - return err; - } - - /* We need to serialize save and setup for HT because the subset - * of msrs are distinct for save and setup operations - */ - - /* Assume saved/restored counters are the same on all CPUs */ - model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); - for_each_possible_cpu(cpu) { - if (!cpu) - continue; - - memcpy(per_cpu(cpu_msrs, cpu).counters, - per_cpu(cpu_msrs, 0).counters, - sizeof(struct op_msr) * model->num_counters); - - memcpy(per_cpu(cpu_msrs, cpu).controls, - per_cpu(cpu_msrs, 0).controls, - sizeof(struct op_msr) * model->num_controls); - - mux_clone(cpu); - } - on_each_cpu(nmi_cpu_setup, NULL, 1); - nmi_enabled = 1; - return 0; -} - static void nmi_cpu_restore_registers(struct op_msrs *msrs) { struct op_msr *counters = msrs->counters; @@ -411,20 +401,24 @@ static void nmi_cpu_shutdown(void *dummy apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); apic_write(APIC_LVTERR, v); nmi_cpu_restore_registers(msrs); + if (model->cpu_down) + model->cpu_down(); } -static void nmi_shutdown(void) +static void nmi_cpu_up(void *dummy) { - struct op_msrs *msrs; + if (nmi_enabled) + nmi_cpu_setup(dummy); + if (ctr_running) + nmi_cpu_start(dummy); +} - nmi_enabled = 0; - on_each_cpu(nmi_cpu_shutdown, NULL, 1); - unregister_die_notifier(&profile_exceptions_nb); - nmi_shutdown_mux(); - msrs = &get_cpu_var(cpu_msrs); - model->shutdown(msrs); - free_msrs(); - put_cpu_var(cpu_msrs); +static void nmi_cpu_down(void *dummy) +{ + if (ctr_running) + nmi_cpu_stop(dummy); + if (nmi_enabled) + nmi_cpu_shutdown(dummy); } static int nmi_create_files(struct super_block *sb, struct dentry *root) @@ -451,12 +445,12 @@ static int nmi_create_files(struct super oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); + oprofilefs_create_ulong(sb, dir, "extra", &counter_config[i].extra); } return 0; } -#ifdef CONFIG_SMP static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, void *data) { @@ -464,10 +458,10 @@ static int oprofile_cpu_notifier(struct switch (action) { case CPU_DOWN_FAILED: case CPU_ONLINE: - smp_call_function_single(cpu, nmi_cpu_start, NULL, 0); + smp_call_function_single(cpu, nmi_cpu_up, NULL, 0); break; case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1); + smp_call_function_single(cpu, nmi_cpu_down, NULL, 1); break; } return NOTIFY_DONE; @@ -476,7 +470,75 @@ static int oprofile_cpu_notifier(struct static struct notifier_block oprofile_cpu_nb = { .notifier_call = oprofile_cpu_notifier }; -#endif + +static int nmi_setup(void) +{ + int err = 0; + int cpu; + + if (!allocate_msrs()) + return -ENOMEM; + + /* We need to serialize save and setup for HT because the subset + * of msrs are distinct for save and setup operations + */ + + /* Assume saved/restored counters are the same on all CPUs */ + err = model->fill_in_addresses(&per_cpu(cpu_msrs, 0)); + if (err) + goto fail; + + for_each_possible_cpu(cpu) { + if (!cpu) + continue; + + memcpy(per_cpu(cpu_msrs, cpu).counters, + per_cpu(cpu_msrs, 0).counters, + sizeof(struct op_msr) * model->num_counters); + + memcpy(per_cpu(cpu_msrs, cpu).controls, + per_cpu(cpu_msrs, 0).controls, + sizeof(struct op_msr) * model->num_controls); + + mux_clone(cpu); + } + + nmi_enabled = 0; + ctr_running = 0; + barrier(); + err = register_die_notifier(&profile_exceptions_nb); + if (err) + goto fail; + + get_online_cpus(); + register_cpu_notifier(&oprofile_cpu_nb); + on_each_cpu(nmi_cpu_setup, NULL, 1); + nmi_enabled = 1; + put_online_cpus(); + + return 0; +fail: + free_msrs(); + return err; +} + +static void nmi_shutdown(void) +{ + struct op_msrs *msrs; + + get_online_cpus(); + unregister_cpu_notifier(&oprofile_cpu_nb); + on_each_cpu(nmi_cpu_shutdown, NULL, 1); + nmi_enabled = 0; + ctr_running = 0; + put_online_cpus(); + barrier(); + unregister_die_notifier(&profile_exceptions_nb); + msrs = &get_cpu_var(cpu_msrs); + model->shutdown(msrs); + free_msrs(); + put_cpu_var(cpu_msrs); +} #ifdef CONFIG_PM @@ -511,8 +573,13 @@ static int __init init_sysfs(void) int error; error = sysdev_class_register(&oprofile_sysclass); - if (!error) - error = sysdev_register(&device_oprofile); + if (error) + return error; + + error = sysdev_register(&device_oprofile); + if (error) + sysdev_class_unregister(&oprofile_sysclass); + return error; } @@ -523,8 +590,10 @@ static void exit_sysfs(void) } #else -#define init_sysfs() do { } while (0) -#define exit_sysfs() do { } while (0) + +static inline int init_sysfs(void) { return 0; } +static inline void exit_sysfs(void) { } + #endif /* CONFIG_PM */ static int __init p4_init(char **cpu_type) @@ -577,6 +646,18 @@ static int __init ppro_init(char **cpu_t if (force_arch_perfmon && cpu_has_arch_perfmon) return 0; + /* + * Documentation on identifying Intel processors by CPU family + * and model can be found in the Intel Software Developer's + * Manuals (SDM): + * + * http://www.intel.com/products/processor/manuals/ + * + * As of May 2010 the documentation for this was in the: + * "Intel 64 and IA-32 Architectures Software Developer's + * Manual Volume 3B: System Programming Guide", "Table B-1 + * CPUID Signature Values of DisplayFamily_DisplayModel". + */ switch (cpu_model) { case 0 ... 2: *cpu_type = "i386/ppro"; @@ -595,14 +676,19 @@ static int __init ppro_init(char **cpu_t case 14: *cpu_type = "i386/core"; break; - case 15: case 23: + case 0x0f: + case 0x16: + case 0x17: + case 0x1d: *cpu_type = "i386/core_2"; break; - case 26: + case 0x1a: + case 0x1e: + case 0x2e: spec = &op_arch_perfmon_spec; *cpu_type = "i386/core_i7"; break; - case 28: + case 0x1c: *cpu_type = "i386/atom"; break; default: @@ -614,9 +700,6 @@ static int __init ppro_init(char **cpu_t return 1; } -/* in order to get sysfs right */ -static int using_nmi; - int __init op_nmi_init(struct oprofile_operations *ops) { __u8 vendor = boot_cpu_data.x86_vendor; @@ -648,6 +731,15 @@ int __init op_nmi_init(struct oprofile_o case 0x11: cpu_type = "x86-64/family11h"; break; + case 0x12: + cpu_type = "x86-64/family12h"; + break; + case 0x14: + cpu_type = "x86-64/family14h"; + break; + case 0x15: + cpu_type = "x86-64/family15h"; + break; default: return -ENODEV; } @@ -685,9 +777,6 @@ int __init op_nmi_init(struct oprofile_o return -ENODEV; } -#ifdef CONFIG_SMP - register_cpu_notifier(&oprofile_cpu_nb); -#endif /* default values, can be overwritten by model */ ops->create_files = nmi_create_files; ops->setup = nmi_setup; @@ -706,20 +795,15 @@ int __init op_nmi_init(struct oprofile_o mux_init(ops); - init_sysfs(); - using_nmi = 1; + ret = init_sysfs(); + if (ret) + return ret; + printk(KERN_INFO "oprofile: using NMI interrupt.\n"); return 0; } void op_nmi_exit(void) { - if (using_nmi) { - exit_sysfs(); -#ifdef CONFIG_SMP - unregister_cpu_notifier(&oprofile_cpu_nb); -#endif - } - if (model->exit) - model->exit(); + exit_sysfs(); } diff -uprN linux-2.6.32/arch/x86/oprofile/nmi_timer_int.c linux-2.6.32.ovz/arch/x86/oprofile/nmi_timer_int.c --- linux-2.6.32/arch/x86/oprofile/nmi_timer_int.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/oprofile/nmi_timer_int.c 2015-03-10 13:22:31.000000000 -0700 @@ -38,7 +38,7 @@ static int profile_timer_exceptions_noti static struct notifier_block profile_timer_exceptions_nb = { .notifier_call = profile_timer_exceptions_notify, .next = NULL, - .priority = 0 + .priority = NMI_LOW_PRIOR, }; static int timer_start(void) diff -uprN linux-2.6.32/arch/x86/oprofile/op_counter.h linux-2.6.32.ovz/arch/x86/oprofile/op_counter.h --- linux-2.6.32/arch/x86/oprofile/op_counter.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/oprofile/op_counter.h 2015-03-10 13:22:30.000000000 -0700 @@ -22,6 +22,7 @@ struct op_counter_config { unsigned long kernel; unsigned long user; unsigned long unit_mask; + unsigned long extra; }; extern struct op_counter_config counter_config[]; diff -uprN linux-2.6.32/arch/x86/oprofile/op_model_amd.c linux-2.6.32.ovz/arch/x86/oprofile/op_model_amd.c --- linux-2.6.32/arch/x86/oprofile/op_model_amd.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/oprofile/op_model_amd.c 2015-03-10 13:24:14.000000000 -0700 @@ -22,18 +22,17 @@ #include #include #include +#include +#include +#include #include "op_x86_model.h" #include "op_counter.h" -#define NUM_COUNTERS 4 -#define NUM_CONTROLS 4 #ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX -#define NUM_VIRT_COUNTERS 32 -#define NUM_VIRT_CONTROLS 32 +#define NUM_VIRT_COUNTERS 32 #else -#define NUM_VIRT_COUNTERS NUM_COUNTERS -#define NUM_VIRT_CONTROLS NUM_CONTROLS +#define NUM_VIRT_COUNTERS 0 #endif #define OP_EVENT_MASK 0x0FFF @@ -41,152 +40,88 @@ #define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) -static unsigned long reset_value[NUM_VIRT_COUNTERS]; - -#ifdef CONFIG_OPROFILE_IBS - -/* IbsFetchCtl bits/masks */ -#define IBS_FETCH_RAND_EN (1ULL<<57) -#define IBS_FETCH_VAL (1ULL<<49) -#define IBS_FETCH_ENABLE (1ULL<<48) -#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL - -/*IbsOpCtl bits */ -#define IBS_OP_CNT_CTL (1ULL<<19) -#define IBS_OP_VAL (1ULL<<18) -#define IBS_OP_ENABLE (1ULL<<17) +static int num_counters; +static unsigned long reset_value[OP_MAX_COUNTER]; #define IBS_FETCH_SIZE 6 #define IBS_OP_SIZE 12 -static int has_ibs; /* AMD Family10h and later */ +static u32 ibs_caps; -struct op_ibs_config { +struct ibs_config { unsigned long op_enabled; unsigned long fetch_enabled; unsigned long max_cnt_fetch; unsigned long max_cnt_op; unsigned long rand_en; unsigned long dispatched_ops; + unsigned long branch_target; }; -static struct op_ibs_config ibs_config; - -#endif - -#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX - -static void op_mux_fill_in_addresses(struct op_msrs * const msrs) -{ - int i; - - for (i = 0; i < NUM_VIRT_COUNTERS; i++) { - int hw_counter = op_x86_virt_to_phys(i); - if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; - else - msrs->multiplex[i].addr = 0; - } -} - -static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs) -{ - u64 val; - int i; - - /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { - int virt = op_x86_phys_to_virt(i); - if (!counter_config[virt].enabled) - continue; - rdmsrl(msrs->controls[i].addr, val); - val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[virt]); - wrmsrl(msrs->controls[i].addr, val); - } -} - -#else - -static inline void op_mux_fill_in_addresses(struct op_msrs * const msrs) { } - -#endif - -/* functions for op_amd_spec */ - -static void op_amd_fill_in_addresses(struct op_msrs * const msrs) -{ - int i; - - for (i = 0; i < NUM_COUNTERS; i++) { - if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; - else - msrs->counters[i].addr = 0; - } - - for (i = 0; i < NUM_CONTROLS; i++) { - if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) - msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; - else - msrs->controls[i].addr = 0; - } - - op_mux_fill_in_addresses(msrs); -} - -static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, - struct op_msrs const * const msrs) -{ - u64 val; - int i; - - /* setup reset_value */ - for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { - if (counter_config[i].enabled) - reset_value[i] = counter_config[i].count; - else - reset_value[i] = 0; - } - - /* clear all counters */ - for (i = 0; i < NUM_CONTROLS; ++i) { - if (unlikely(!msrs->controls[i].addr)) - continue; - rdmsrl(msrs->controls[i].addr, val); - val &= model->reserved; - wrmsrl(msrs->controls[i].addr, val); - } - - /* avoid a false detection of ctr overflows in NMI handler */ - for (i = 0; i < NUM_COUNTERS; ++i) { - if (unlikely(!msrs->counters[i].addr)) - continue; - wrmsrl(msrs->counters[i].addr, -1LL); - } +struct ibs_state { + u64 ibs_op_ctl; + int branch_target; + unsigned long sample_size; +}; - /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { - int virt = op_x86_phys_to_virt(i); - if (!counter_config[virt].enabled) - continue; - if (!msrs->counters[i].addr) - continue; +static struct ibs_config ibs_config; +static struct ibs_state ibs_state; - /* setup counter registers */ - wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); +/* + * 16-bit Linear Feedback Shift Register (LFSR) + * + * 16 14 13 11 + * Feedback polynomial = X + X + X + X + 1 + */ +static unsigned int lfsr_random(void) +{ + static unsigned int lfsr_value = 0xF00D; + unsigned int bit; + + /* Compute next bit to shift in */ + bit = ((lfsr_value >> 0) ^ + (lfsr_value >> 2) ^ + (lfsr_value >> 3) ^ + (lfsr_value >> 5)) & 0x0001; + + /* Advance to next register value */ + lfsr_value = (lfsr_value >> 1) | (bit << 15); + + return lfsr_value; +} + +/* + * IBS software randomization + * + * The IBS periodic op counter is randomized in software. The lower 12 + * bits of the 20 bit counter are randomized. IbsOpCurCnt is + * initialized with a 12 bit random value. + */ +static inline u64 op_amd_randomize_ibs_op(u64 val) +{ + unsigned int random = lfsr_random(); + + if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) + /* + * Work around if the hw can not write to IbsOpCurCnt + * + * Randomize the lower 8 bits of the 16 bit + * IbsOpMaxCnt [15:0] value in the range of -128 to + * +127 by adding/subtracting an offset to the + * maximum count (IbsOpMaxCnt). + * + * To avoid over or underflows and protect upper bits + * starting at bit 16, the initial value for + * IbsOpMaxCnt must fit in the range from 0x0081 to + * 0xff80. + */ + val += (s8)(random >> 4); + else + val |= (u64)(random & IBS_RANDOM_MASK) << 32; - /* setup control registers */ - rdmsrl(msrs->controls[i].addr, val); - val &= model->reserved; - val |= op_x86_get_ctrl(model, &counter_config[virt]); - wrmsrl(msrs->controls[i].addr, val); - } + return val; } -#ifdef CONFIG_OPROFILE_IBS - static inline void op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) @@ -194,7 +129,7 @@ op_amd_handle_ibs(struct pt_regs * const u64 val, ctl; struct op_entry entry; - if (!has_ibs) + if (!ibs_caps) return; if (ibs_config.fetch_enabled) { @@ -210,7 +145,7 @@ op_amd_handle_ibs(struct pt_regs * const oprofile_write_commit(&entry); /* reenable the IRQ */ - ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK); + ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT); ctl |= IBS_FETCH_ENABLE; wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); } @@ -220,8 +155,8 @@ op_amd_handle_ibs(struct pt_regs * const rdmsrl(MSR_AMD64_IBSOPCTL, ctl); if (ctl & IBS_OP_VAL) { rdmsrl(MSR_AMD64_IBSOPRIP, val); - oprofile_write_reserve(&entry, regs, val, - IBS_OP_CODE, IBS_OP_SIZE); + oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE, + ibs_state.sample_size); oprofile_add_data64(&entry, val); rdmsrl(MSR_AMD64_IBSOPDATA, val); oprofile_add_data64(&entry, val); @@ -233,11 +168,14 @@ op_amd_handle_ibs(struct pt_regs * const oprofile_add_data64(&entry, val); rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); oprofile_add_data64(&entry, val); + if (ibs_state.branch_target) { + rdmsrl(MSR_AMD64_IBSBRTARGET, val); + oprofile_add_data(&entry, (unsigned long)val); + } oprofile_write_commit(&entry); /* reenable the IRQ */ - ctl &= ~IBS_OP_VAL & 0xFFFFFFFF; - ctl |= IBS_OP_ENABLE; + ctl = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl); wrmsrl(MSR_AMD64_IBSOPCTL, ctl); } } @@ -246,48 +184,274 @@ op_amd_handle_ibs(struct pt_regs * const static inline void op_amd_start_ibs(void) { u64 val; - if (has_ibs && ibs_config.fetch_enabled) { - val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + + if (!ibs_caps) + return; + + memset(&ibs_state, 0, sizeof(ibs_state)); + + /* + * Note: Since the max count settings may out of range we + * write back the actual used values so that userland can read + * it. + */ + + if (ibs_config.fetch_enabled) { + val = ibs_config.max_cnt_fetch >> 4; + val = min(val, IBS_FETCH_MAX_CNT); + ibs_config.max_cnt_fetch = val << 4; val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; val |= IBS_FETCH_ENABLE; wrmsrl(MSR_AMD64_IBSFETCHCTL, val); } - if (has_ibs && ibs_config.op_enabled) { - val = (ibs_config.max_cnt_op >> 4) & 0xFFFF; + if (ibs_config.op_enabled) { + val = ibs_config.max_cnt_op >> 4; + if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) { + /* + * IbsOpCurCnt not supported. See + * op_amd_randomize_ibs_op() for details. + */ + val = clamp(val, 0x0081ULL, 0xFF80ULL); + ibs_config.max_cnt_op = val << 4; + } else { + /* + * The start value is randomized with a + * positive offset, we need to compensate it + * with the half of the randomized range. Also + * avoid underflows. + */ + val += IBS_RANDOM_MAXCNT_OFFSET; + if (ibs_caps & IBS_CAPS_OPCNTEXT) + val = min(val, IBS_OP_MAX_CNT_EXT); + else + val = min(val, IBS_OP_MAX_CNT); + ibs_config.max_cnt_op = + (val - IBS_RANDOM_MAXCNT_OFFSET) << 4; + } + val = ((val & ~IBS_OP_MAX_CNT) << 4) | (val & IBS_OP_MAX_CNT); val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; val |= IBS_OP_ENABLE; + ibs_state.ibs_op_ctl = val; + ibs_state.sample_size = IBS_OP_SIZE; + if (ibs_config.branch_target) { + ibs_state.branch_target = 1; + ibs_state.sample_size++; + } + val = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl); wrmsrl(MSR_AMD64_IBSOPCTL, val); } } static void op_amd_stop_ibs(void) { - if (has_ibs && ibs_config.fetch_enabled) + if (!ibs_caps) + return; + + if (ibs_config.fetch_enabled) /* clear max count and enable */ wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); - if (has_ibs && ibs_config.op_enabled) + if (ibs_config.op_enabled) /* clear max count and enable */ wrmsrl(MSR_AMD64_IBSOPCTL, 0); } -#else +static inline int eilvt_is_available(int offset) +{ + /* check if we may assign a vector */ + return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1); +} + +static inline int ibs_eilvt_valid(void) +{ + int offset; + u64 val; + + rdmsrl(MSR_AMD64_IBSCTL, val); + offset = val & IBSCTL_LVT_OFFSET_MASK; + + if (!(val & IBSCTL_LVT_OFFSET_VALID)) { + pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + return 0; + } + + if (!eilvt_is_available(offset)) { + pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n", + smp_processor_id(), offset, MSR_AMD64_IBSCTL, val); + return 0; + } + + return 1; +} + +static inline int get_ibs_offset(void) +{ + u64 val; -static inline void op_amd_handle_ibs(struct pt_regs * const regs, - struct op_msrs const * const msrs) { } -static inline void op_amd_start_ibs(void) { } -static inline void op_amd_stop_ibs(void) { } + rdmsrl(MSR_AMD64_IBSCTL, val); + if (!(val & IBSCTL_LVT_OFFSET_VALID)) + return -EINVAL; + + return val & IBSCTL_LVT_OFFSET_MASK; +} + +static void setup_APIC_ibs(void) +{ + int offset; + + offset = get_ibs_offset(); + if (offset < 0) + goto failed; + + if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0)) + return; +failed: + pr_warn("oprofile: IBS APIC setup failed on cpu #%d\n", + smp_processor_id()); +} + +static void clear_APIC_ibs(void) +{ + int offset; + + offset = get_ibs_offset(); + if (offset >= 0) + setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1); +} + +#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX + +static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + /* enable active counters */ + for (i = 0; i < num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (!reset_value[virt]) + continue; + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); + } +} #endif +/* functions for op_amd_spec */ + +static void op_amd_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0; i < num_counters; ++i) { + if (!msrs->counters[i].addr) + continue; + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } +} + +static int op_amd_fill_in_addresses(struct op_msrs * const msrs) +{ + int i; + + for (i = 0; i < num_counters; i++) { + if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) + goto fail; + if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + goto fail; + } + /* both registers must be reserved */ + if (num_counters == AMD64_NUM_COUNTERS_CORE) { + msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1); + msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1); + } else { + msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; + } + continue; + fail: + if (!counter_config[i].enabled) + continue; + op_x86_warn_reserved(i); + op_amd_shutdown(msrs); + return -EBUSY; + } + + return 0; +} + +static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, + struct op_msrs const * const msrs) +{ + u64 val; + int i; + + /* setup reset_value */ + for (i = 0; i < OP_MAX_COUNTER; ++i) { + if (counter_config[i].enabled + && msrs->counters[op_x86_virt_to_phys(i)].addr) + reset_value[i] = counter_config[i].count; + else + reset_value[i] = 0; + } + + /* clear all counters */ + for (i = 0; i < num_counters; ++i) { + if (!msrs->controls[i].addr) + continue; + rdmsrl(msrs->controls[i].addr, val); + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) + op_x86_warn_in_use(i); + val &= model->reserved; + wrmsrl(msrs->controls[i].addr, val); + /* + * avoid a false detection of ctr overflows in NMI + * handler + */ + wrmsrl(msrs->counters[i].addr, -1LL); + } + + /* enable active counters */ + for (i = 0; i < num_counters; ++i) { + int virt = op_x86_phys_to_virt(i); + if (!reset_value[virt]) + continue; + + /* setup counter registers */ + wrmsrl(msrs->counters[i].addr, -(u64)reset_value[virt]); + + /* setup control registers */ + rdmsrl(msrs->controls[i].addr, val); + val &= model->reserved; + val |= op_x86_get_ctrl(model, &counter_config[virt]); + wrmsrl(msrs->controls[i].addr, val); + } + + if (ibs_caps) + setup_APIC_ibs(); +} + +static void op_amd_cpu_shutdown(void) +{ + if (ibs_caps) + clear_APIC_ibs(); +} + static int op_amd_check_ctrs(struct pt_regs * const regs, struct op_msrs const * const msrs) { u64 val; int i; - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { int virt = op_x86_phys_to_virt(i); if (!reset_value[virt]) continue; @@ -310,11 +474,11 @@ static void op_amd_start(struct op_msrs u64 val; int i; - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } @@ -330,56 +494,23 @@ static void op_amd_stop(struct op_msrs c * Subtle: stop on all counters to avoid race with setting our * pm callback */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (!reset_value[op_x86_phys_to_virt(i)]) continue; rdmsrl(msrs->controls[i].addr, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } op_amd_stop_ibs(); } -static void op_amd_shutdown(struct op_msrs const * const msrs) +static int setup_ibs_ctl(int ibs_eilvt_off) { - int i; - - for (i = 0; i < NUM_COUNTERS; ++i) { - if (msrs->counters[i].addr) - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - } - for (i = 0; i < NUM_CONTROLS; ++i) { - if (msrs->controls[i].addr) - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } -} - -#ifdef CONFIG_OPROFILE_IBS - -static u8 ibs_eilvt_off; - -static inline void apic_init_ibs_nmi_per_cpu(void *arg) -{ - ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); -} - -static inline void apic_clear_ibs_nmi_per_cpu(void *arg) -{ - setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); -} - -static int init_ibs_nmi(void) -{ -#define IBSCTL_LVTOFFSETVAL (1 << 8) -#define IBSCTL 0x1cc struct pci_dev *cpu_cfg; int nodes; u32 value = 0; - /* per CPU setup */ - on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1); - nodes = 0; cpu_cfg = NULL; do { @@ -390,63 +521,79 @@ static int init_ibs_nmi(void) break; ++nodes; pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off - | IBSCTL_LVTOFFSETVAL); + | IBSCTL_LVT_OFFSET_VALID); pci_read_config_dword(cpu_cfg, IBSCTL, &value); - if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { + if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) { pci_dev_put(cpu_cfg); printk(KERN_DEBUG "Failed to setup IBS LVT offset, " - "IBSCTL = 0x%08x", value); - return 1; + "IBSCTL = 0x%08x\n", value); + return -EINVAL; } } while (1); if (!nodes) { - printk(KERN_DEBUG "No CPU node configured for IBS"); - return 1; + printk(KERN_DEBUG "No CPU node configured for IBS\n"); + return -ENODEV; } -#ifdef CONFIG_NUMA - /* Sanity check */ - /* Works only for 64bit with proper numa implementation. */ - if (nodes != num_possible_nodes()) { - printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, " - "found: %d, expected %d", - nodes, num_possible_nodes()); - return 1; - } -#endif return 0; } -/* uninitialize the APIC for the IBS interrupts if needed */ -static void clear_ibs_nmi(void) +static int force_ibs_eilvt_setup(void) { - if (has_ibs) - on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); + int i; + int ret; + + /* find the next free available EILVT entry */ + for (i = 1; i < 4; i++) { + if (!eilvt_is_available(i)) + continue; + ret = setup_ibs_ctl(i); + if (ret) + return ret; + pr_err(FW_BUG "using offset %d for IBS interrupts\n", i); + return 0; + } + + printk(KERN_DEBUG "No EILVT entry available\n"); + + return -EBUSY; } -/* initialize the APIC for the IBS interrupts if available */ -static void ibs_init(void) +static int __init_ibs_nmi(void) { - has_ibs = boot_cpu_has(X86_FEATURE_IBS); + int ret; - if (!has_ibs) - return; + if (ibs_eilvt_valid()) + return 0; - if (init_ibs_nmi()) { - has_ibs = 0; - return; - } + ret = force_ibs_eilvt_setup(); + if (ret) + return ret; + + if (!ibs_eilvt_valid()) + return -EFAULT; - printk(KERN_INFO "oprofile: AMD IBS detected\n"); + pr_err(FW_BUG "workaround enabled for IBS LVT offset\n"); + + return 0; } -static void ibs_exit(void) +/* initialize the APIC for the IBS interrupts if available */ +static void init_ibs(void) { - if (!has_ibs) + ibs_caps = get_ibs_caps(); + + if (!ibs_caps) return; - clear_ibs_nmi(); + if (__init_ibs_nmi()) { + ibs_caps = 0; + return; + } + + printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", + (unsigned)ibs_caps); } static int (*create_arch_files)(struct super_block *sb, struct dentry *root); @@ -463,73 +610,72 @@ static int setup_ibs_files(struct super_ if (ret) return ret; - if (!has_ibs) + if (!ibs_caps) return ret; /* model specific files */ /* setup some reasonable defaults */ + memset(&ibs_config, 0, sizeof(ibs_config)); ibs_config.max_cnt_fetch = 250000; - ibs_config.fetch_enabled = 0; ibs_config.max_cnt_op = 250000; - ibs_config.op_enabled = 0; - ibs_config.dispatched_ops = 1; - dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); - oprofilefs_create_ulong(sb, dir, "enable", - &ibs_config.fetch_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", - &ibs_config.max_cnt_fetch); - oprofilefs_create_ulong(sb, dir, "rand_enable", - &ibs_config.rand_en); - - dir = oprofilefs_mkdir(sb, root, "ibs_op"); - oprofilefs_create_ulong(sb, dir, "enable", - &ibs_config.op_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", - &ibs_config.max_cnt_op); - oprofilefs_create_ulong(sb, dir, "dispatched_ops", - &ibs_config.dispatched_ops); + if (ibs_caps & IBS_CAPS_FETCHSAM) { + dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.fetch_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_fetch); + oprofilefs_create_ulong(sb, dir, "rand_enable", + &ibs_config.rand_en); + } + + if (ibs_caps & IBS_CAPS_OPSAM) { + dir = oprofilefs_mkdir(sb, root, "ibs_op"); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.op_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_op); + if (ibs_caps & IBS_CAPS_OPCNT) + oprofilefs_create_ulong(sb, dir, "dispatched_ops", + &ibs_config.dispatched_ops); + if (ibs_caps & IBS_CAPS_BRNTRGT) + oprofilefs_create_ulong(sb, dir, "branch_target", + &ibs_config.branch_target); + } return 0; } +struct op_x86_model_spec op_amd_spec; + static int op_amd_init(struct oprofile_operations *ops) { - ibs_init(); + init_ibs(); create_arch_files = ops->create_files; ops->create_files = setup_ibs_files; - return 0; -} - -static void op_amd_exit(void) -{ - ibs_exit(); -} -#else + if (boot_cpu_data.x86 == 0x15) { + num_counters = AMD64_NUM_COUNTERS_CORE; + } else { + num_counters = AMD64_NUM_COUNTERS; + } -/* no IBS support */ + op_amd_spec.num_counters = num_counters; + op_amd_spec.num_controls = num_counters; + op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS); -static int op_amd_init(struct oprofile_operations *ops) -{ return 0; } -static void op_amd_exit(void) {} - -#endif /* CONFIG_OPROFILE_IBS */ - struct op_x86_model_spec op_amd_spec = { - .num_counters = NUM_COUNTERS, - .num_controls = NUM_CONTROLS, - .num_virt_counters = NUM_VIRT_COUNTERS, + /* num_counters/num_controls filled in at runtime */ .reserved = MSR_AMD_EVENTSEL_RESERVED, .event_mask = OP_EVENT_MASK, .init = op_amd_init, - .exit = op_amd_exit, .fill_in_addresses = &op_amd_fill_in_addresses, .setup_ctrs = &op_amd_setup_ctrs, + .cpu_down = &op_amd_cpu_shutdown, .check_ctrs = &op_amd_check_ctrs, .start = &op_amd_start, .stop = &op_amd_stop, diff -uprN linux-2.6.32/arch/x86/oprofile/op_model_p4.c linux-2.6.32.ovz/arch/x86/oprofile/op_model_p4.c --- linux-2.6.32/arch/x86/oprofile/op_model_p4.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/oprofile/op_model_p4.c 2015-03-10 13:22:31.000000000 -0700 @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include @@ -385,8 +385,26 @@ static unsigned int get_stagger(void) static unsigned long reset_value[NUM_COUNTERS_NON_HT]; +static void p4_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0; i < num_counters; ++i) { + if (msrs->counters[i].addr) + release_perfctr_nmi(msrs->counters[i].addr); + } + /* + * some of the control registers are specially reserved in + * conjunction with the counter registers (hence the starting offset). + * This saves a few bits. + */ + for (i = num_counters; i < num_controls; ++i) { + if (msrs->controls[i].addr) + release_evntsel_nmi(msrs->controls[i].addr); + } +} -static void p4_fill_in_addresses(struct op_msrs * const msrs) +static int p4_fill_in_addresses(struct op_msrs * const msrs) { unsigned int i; unsigned int addr, cccraddr, stag; @@ -394,12 +412,6 @@ static void p4_fill_in_addresses(struct setup_num_counters(); stag = get_stagger(); - /* initialize some registers */ - for (i = 0; i < num_counters; ++i) - msrs->counters[i].addr = 0; - for (i = 0; i < num_controls; ++i) - msrs->controls[i].addr = 0; - /* the counter & cccr registers we pay attention to */ for (i = 0; i < num_counters; ++i) { addr = p4_counters[VIRT_CTR(stag, i)].counter_address; @@ -474,6 +486,18 @@ static void p4_fill_in_addresses(struct msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; } } + + for (i = 0; i < num_counters; ++i) { + if (!counter_config[i].enabled) + continue; + if (msrs->controls[i].addr) + continue; + op_x86_warn_reserved(i); + p4_shutdown(msrs); + return -EBUSY; + } + + return 0; } @@ -674,26 +698,6 @@ static void p4_stop(struct op_msrs const } } -static void p4_shutdown(struct op_msrs const * const msrs) -{ - int i; - - for (i = 0; i < num_counters; ++i) { - if (msrs->counters[i].addr) - release_perfctr_nmi(msrs->counters[i].addr); - } - /* - * some of the control registers are specially reserved in - * conjunction with the counter registers (hence the starting offset). - * This saves a few bits. - */ - for (i = num_counters; i < num_controls; ++i) { - if (msrs->controls[i].addr) - release_evntsel_nmi(msrs->controls[i].addr); - } -} - - #ifdef CONFIG_SMP struct op_x86_model_spec op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, diff -uprN linux-2.6.32/arch/x86/oprofile/op_model_ppro.c linux-2.6.32.ovz/arch/x86/oprofile/op_model_ppro.c --- linux-2.6.32/arch/x86/oprofile/op_model_ppro.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/oprofile/op_model_ppro.c 2015-03-10 13:21:53.000000000 -0700 @@ -30,23 +30,46 @@ static int counter_width = 32; static u64 *reset_value; -static void ppro_fill_in_addresses(struct op_msrs * const msrs) +static void ppro_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0; i < num_counters; i++) { - if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) - msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; - else - msrs->counters[i].addr = 0; + for (i = 0; i < num_counters; ++i) { + if (!msrs->counters[i].addr) + continue; + release_perfctr_nmi(MSR_P6_PERFCTR0 + i); + release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); + } + if (reset_value) { + kfree(reset_value); + reset_value = NULL; } +} + +static int ppro_fill_in_addresses(struct op_msrs * const msrs) +{ + int i; for (i = 0; i < num_counters; i++) { - if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) - msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; - else - msrs->controls[i].addr = 0; + if (!reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) + goto fail; + if (!reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) { + release_perfctr_nmi(MSR_P6_PERFCTR0 + i); + goto fail; + } + /* both registers must be reserved */ + msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; + msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; + continue; + fail: + if (!counter_config[i].enabled) + continue; + op_x86_warn_reserved(i); + ppro_shutdown(msrs); + return -EBUSY; } + + return 0; } @@ -57,7 +80,7 @@ static void ppro_setup_ctrs(struct op_x8 int i; if (!reset_value) { - reset_value = kmalloc(sizeof(reset_value[0]) * num_counters, + reset_value = kzalloc(sizeof(reset_value[0]) * num_counters, GFP_ATOMIC); if (!reset_value) return; @@ -82,17 +105,17 @@ static void ppro_setup_ctrs(struct op_x8 /* clear all counters */ for (i = 0; i < num_counters; ++i) { - if (unlikely(!msrs->controls[i].addr)) + if (!msrs->controls[i].addr) continue; rdmsrl(msrs->controls[i].addr, val); + if (val & ARCH_PERFMON_EVENTSEL_ENABLE) + op_x86_warn_in_use(i); val &= model->reserved; wrmsrl(msrs->controls[i].addr, val); - } - - /* avoid a false detection of ctr overflows in NMI handler */ - for (i = 0; i < num_counters; ++i) { - if (unlikely(!msrs->counters[i].addr)) - continue; + /* + * avoid a false detection of ctr overflows in NMI * + * handler + */ wrmsrl(msrs->counters[i].addr, -1LL); } @@ -161,7 +184,7 @@ static void ppro_start(struct op_msrs co for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { rdmsrl(msrs->controls[i].addr, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + val |= ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } } @@ -179,30 +202,11 @@ static void ppro_stop(struct op_msrs con if (!reset_value[i]) continue; rdmsrl(msrs->controls[i].addr, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; wrmsrl(msrs->controls[i].addr, val); } } -static void ppro_shutdown(struct op_msrs const * const msrs) -{ - int i; - - for (i = 0; i < num_counters; ++i) { - if (msrs->counters[i].addr) - release_perfctr_nmi(MSR_P6_PERFCTR0 + i); - } - for (i = 0; i < num_counters; ++i) { - if (msrs->controls[i].addr) - release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); - } - if (reset_value) { - kfree(reset_value); - reset_value = NULL; - } -} - - struct op_x86_model_spec op_ppro_spec = { .num_counters = 2, .num_controls = 2, @@ -234,11 +238,11 @@ static void arch_perfmon_setup_counters( if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15) { eax.split.version_id = 2; - eax.split.num_events = 2; + eax.split.num_counters = 2; eax.split.bit_width = 40; } - num_counters = eax.split.num_events; + num_counters = eax.split.num_counters; op_arch_perfmon_spec.num_counters = num_counters; op_arch_perfmon_spec.num_controls = num_counters; diff -uprN linux-2.6.32/arch/x86/oprofile/op_x86_model.h linux-2.6.32.ovz/arch/x86/oprofile/op_x86_model.h --- linux-2.6.32/arch/x86/oprofile/op_x86_model.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/oprofile/op_x86_model.h 2015-03-10 13:21:53.000000000 -0700 @@ -40,10 +40,10 @@ struct op_x86_model_spec { u64 reserved; u16 event_mask; int (*init)(struct oprofile_operations *ops); - void (*exit)(void); - void (*fill_in_addresses)(struct op_msrs * const msrs); + int (*fill_in_addresses)(struct op_msrs * const msrs); void (*setup_ctrs)(struct op_x86_model_spec const *model, struct op_msrs const * const msrs); + void (*cpu_down)(void); int (*check_ctrs)(struct pt_regs * const regs, struct op_msrs const * const msrs); void (*start)(struct op_msrs const * const msrs); @@ -57,6 +57,26 @@ struct op_x86_model_spec { struct op_counter_config; +static inline void op_x86_warn_in_use(int counter) +{ + /* + * The warning indicates an already running counter. If + * oprofile doesn't collect data, then try using a different + * performance counter on your platform to monitor the desired + * event. Delete counter #%d from the desired event by editing + * the /usr/share/oprofile/%s//events file. If the event + * cannot be monitored by any other counter, contact your + * hardware or BIOS vendor. + */ + pr_warning("oprofile: counter #%d on cpu #%d may already be used\n", + counter, smp_processor_id()); +} + +static inline void op_x86_warn_reserved(int counter) +{ + pr_warning("oprofile: counter #%d is already reserved\n", counter); +} + extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, struct op_counter_config *counter_config); extern int op_x86_phys_to_virt(int phys); diff -uprN linux-2.6.32/arch/x86/pci/acpi.c linux-2.6.32.ovz/arch/x86/pci/acpi.c --- linux-2.6.32/arch/x86/pci/acpi.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/pci/acpi.c 2015-03-10 13:24:08.000000000 -0700 @@ -7,26 +7,111 @@ #include struct pci_root_info { - char *name; + struct acpi_device *bridge; + char name[16]; unsigned int res_num; struct resource *res; - struct pci_bus *bus; - int busnum; + resource_size_t *res_offset; + struct pci_sysdata sd; +}; + +static bool pci_use_crs = true; + +static int __init set_use_crs(const struct dmi_system_id *id) +{ + pci_use_crs = true; + return 0; +} + +static const struct dmi_system_id pci_use_crs_table[] __initconst = { + /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */ + { + .callback = set_use_crs, + .ident = "IBM System x3800", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "IBM"), + DMI_MATCH(DMI_PRODUCT_NAME, "x3800"), + }, + }, + /* https://bugzilla.kernel.org/show_bug.cgi?id=16007 */ + /* 2006 AMD HT/VIA system with two host bridges */ + { + .callback = set_use_crs, + .ident = "ASRock ALiveSATA2-GLAN", + .matches = { + DMI_MATCH(DMI_PRODUCT_NAME, "ALiveSATA2-GLAN"), + }, + }, + {} }; +void __init pci_acpi_crs_quirks(void) +{ + int year; + + if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008) + pci_use_crs = false; + + dmi_check_system(pci_use_crs_table); + + /* + * If the user specifies "pci=use_crs" or "pci=nocrs" explicitly, that + * takes precedence over anything we figured out above. + */ + if (pci_probe & PCI_ROOT_NO_CRS) + pci_use_crs = false; + else if (pci_probe & PCI_USE__CRS) + pci_use_crs = true; + + printk(KERN_INFO "PCI: %s host bridge windows from ACPI; " + "if necessary, use \"pci=%s\" and report a bug\n", + pci_use_crs ? "Using" : "Ignoring", + pci_use_crs ? "nocrs" : "use_crs"); +} + static acpi_status resource_to_addr(struct acpi_resource *resource, struct acpi_resource_address64 *addr) { acpi_status status; - - status = acpi_resource_to_address64(resource, addr); - if (ACPI_SUCCESS(status) && - (addr->resource_type == ACPI_MEMORY_RANGE || - addr->resource_type == ACPI_IO_RANGE) && - addr->address_length > 0 && - addr->producer_consumer == ACPI_PRODUCER) { + struct acpi_resource_memory24 *memory24; + struct acpi_resource_memory32 *memory32; + struct acpi_resource_fixed_memory32 *fixed_memory32; + + memset(addr, 0, sizeof(*addr)); + switch (resource->type) { + case ACPI_RESOURCE_TYPE_MEMORY24: + memory24 = &resource->data.memory24; + addr->resource_type = ACPI_MEMORY_RANGE; + addr->minimum = memory24->minimum; + addr->address_length = memory24->address_length; + addr->maximum = addr->minimum + addr->address_length - 1; + return AE_OK; + case ACPI_RESOURCE_TYPE_MEMORY32: + memory32 = &resource->data.memory32; + addr->resource_type = ACPI_MEMORY_RANGE; + addr->minimum = memory32->minimum; + addr->address_length = memory32->address_length; + addr->maximum = addr->minimum + addr->address_length - 1; return AE_OK; + case ACPI_RESOURCE_TYPE_FIXED_MEMORY32: + fixed_memory32 = &resource->data.fixed_memory32; + addr->resource_type = ACPI_MEMORY_RANGE; + addr->minimum = fixed_memory32->address; + addr->address_length = fixed_memory32->address_length; + addr->maximum = addr->minimum + addr->address_length - 1; + return AE_OK; + case ACPI_RESOURCE_TYPE_ADDRESS16: + case ACPI_RESOURCE_TYPE_ADDRESS32: + case ACPI_RESOURCE_TYPE_ADDRESS64: + status = acpi_resource_to_address64(resource, addr); + if (ACPI_SUCCESS(status) && + (addr->resource_type == ACPI_MEMORY_RANGE || + addr->resource_type == ACPI_IO_RANGE) && + addr->address_length > 0) { + return AE_OK; + } + break; } return AE_ERROR; } @@ -44,20 +129,6 @@ count_resource(struct acpi_resource *acp return AE_OK; } -static int -bus_has_transparent_bridge(struct pci_bus *bus) -{ - struct pci_dev *dev; - - list_for_each_entry(dev, &bus->devices, bus_list) { - u16 class = dev->class >> 8; - - if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent) - return true; - } - return false; -} - static acpi_status setup_resource(struct acpi_resource *acpi_res, void *data) { @@ -66,37 +137,36 @@ setup_resource(struct acpi_resource *acp struct acpi_resource_address64 addr; acpi_status status; unsigned long flags; - struct resource *root; - int max_root_bus_resources = PCI_BUS_NUM_RESOURCES; - u64 start, end; - - if (bus_has_transparent_bridge(info->bus)) - max_root_bus_resources -= 3; + u64 start, orig_end, end; status = resource_to_addr(acpi_res, &addr); if (!ACPI_SUCCESS(status)) return AE_OK; if (addr.resource_type == ACPI_MEMORY_RANGE) { - root = &iomem_resource; flags = IORESOURCE_MEM; if (addr.info.mem.caching == ACPI_PREFETCHABLE_MEMORY) flags |= IORESOURCE_PREFETCH; } else if (addr.resource_type == ACPI_IO_RANGE) { - root = &ioport_resource; flags = IORESOURCE_IO; } else return AE_OK; start = addr.minimum + addr.translation_offset; - end = start + addr.address_length - 1; - if (info->res_num >= max_root_bus_resources) { - printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx " - "from %s for %s due to _CRS returning more than " - "%d resource descriptors\n", (unsigned long) start, - (unsigned long) end, root->name, info->name, - max_root_bus_resources); + orig_end = end = addr.maximum + addr.translation_offset; + + /* Exclude non-addressable range or non-addressable portion of range */ + end = min(end, (u64)iomem_resource.end); + if (end <= start) { + dev_info(&info->bridge->dev, + "host bridge window [%#llx-%#llx] " + "(ignored, not CPU addressable)\n", start, orig_end); return AE_OK; + } else if (orig_end != end) { + dev_info(&info->bridge->dev, + "host bridge window [%#llx-%#llx] " + "([%#llx-%#llx] ignored, not CPU addressable)\n", + start, orig_end, end + 1, orig_end); } res = &info->res[info->res_num]; @@ -104,94 +174,207 @@ setup_resource(struct acpi_resource *acp res->flags = flags; res->start = start; res->end = end; - res->child = NULL; + info->res_offset[info->res_num] = addr.translation_offset; - if (insert_resource(root, res)) { - printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx " - "from %s for %s\n", (unsigned long) res->start, - (unsigned long) res->end, root->name, info->name); - } else { - info->bus->resource[info->res_num] = res; - info->res_num++; + if (!pci_use_crs) { + dev_printk(KERN_DEBUG, &info->bridge->dev, + "host bridge window %pR (ignored)\n", res); + return AE_OK; } + + info->res_num++; + if (addr.translation_offset) + dev_info(&info->bridge->dev, "host bridge window %pR " + "(PCI address [%#llx-%#llx])\n", + res, res->start - addr.translation_offset, + res->end - addr.translation_offset); + else + dev_info(&info->bridge->dev, "host bridge window %pR\n", res); + return AE_OK; } +static bool resource_contains(struct resource *res, resource_size_t point) +{ + if (res->start <= point && point <= res->end) + return true; + return false; +} + +static void coalesce_windows(struct pci_root_info *info, int type) +{ + int i, j; + struct resource *res1, *res2; + + for (i = 0; i < info->res_num; i++) { + res1 = &info->res[i]; + if (!(res1->flags & type)) + continue; + + for (j = i + 1; j < info->res_num; j++) { + res2 = &info->res[j]; + if (!(res2->flags & type)) + continue; + + /* + * I don't like throwing away windows because then + * our resources no longer match the ACPI _CRS, but + * the kernel resource tree doesn't allow overlaps. + */ + if (resource_contains(res1, res2->start) || + resource_contains(res1, res2->end) || + resource_contains(res2, res1->start) || + resource_contains(res2, res1->end)) { + res1->start = min(res1->start, res2->start); + res1->end = max(res1->end, res2->end); + dev_info(&info->bridge->dev, + "host bridge window expanded to %pR; %pR ignored\n", + res1, res2); + res2->flags = 0; + } + } + } +} + +static void add_resources(struct pci_root_info *info, + struct list_head *resources) +{ + int i; + struct resource *res, *root, *conflict; + + coalesce_windows(info, IORESOURCE_MEM); + coalesce_windows(info, IORESOURCE_IO); + + for (i = 0; i < info->res_num; i++) { + res = &info->res[i]; + + if (res->flags & IORESOURCE_MEM) + root = &iomem_resource; + else if (res->flags & IORESOURCE_IO) + root = &ioport_resource; + else + continue; + + conflict = insert_resource_conflict(root, res); + if (conflict) + dev_info(&info->bridge->dev, + "ignoring host bridge window %pR (conflicts with %s %pR)\n", + res, conflict->name, conflict); + else + pci_add_resource_offset(resources, res, + info->res_offset[i]); + } +} + +static void free_pci_root_info_res(struct pci_root_info *info) +{ + kfree(info->res); + info->res = NULL; + kfree(info->res_offset); + info->res_offset = NULL; + info->res_num = 0; +} + +static void __release_pci_root_info(struct pci_root_info *info) +{ + int i; + struct resource *res; + + for (i = 0; i < info->res_num; i++) { + res = &info->res[i]; + + if (!res->parent) + continue; + + if (!(res->flags & (IORESOURCE_MEM | IORESOURCE_IO))) + continue; + + release_resource(res); + } + + free_pci_root_info_res(info); + + kfree(info); +} +static void release_pci_root_info(struct pci_host_bridge *bridge) +{ + struct pci_root_info *info = bridge->release_data; + + __release_pci_root_info(info); +} + static void -get_current_resources(struct acpi_device *device, int busnum, - int domain, struct pci_bus *bus) +probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device, + int busnum, int domain) { - struct pci_root_info info; size_t size; - info.bus = bus; - info.res_num = 0; + sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum); + info->bridge = device; + + info->res_num = 0; acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, - &info); - if (!info.res_num) + info); + if (!info->res_num) return; - size = sizeof(*info.res) * info.res_num; - info.res = kmalloc(size, GFP_KERNEL); - if (!info.res) - goto res_alloc_fail; - - info.name = kmalloc(16, GFP_KERNEL); - if (!info.name) - goto name_alloc_fail; - sprintf(info.name, "PCI Bus %04x:%02x", domain, busnum); - - info.res_num = 0; - acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, - &info); + size = sizeof(*info->res) * info->res_num; + info->res = kzalloc(size, GFP_KERNEL); + if (!info->res) { + info->res_num = 0; + return; + } - return; + size = sizeof(*info->res_offset) * info->res_num; + info->res_num = 0; + info->res_offset = kzalloc(size, GFP_KERNEL); + if (!info->res_offset) { + kfree(info->res); + info->res = NULL; + return; + } -name_alloc_fail: - kfree(info.res); -res_alloc_fail: - return; + acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, + info); } -struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int domain, int busnum) +struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) { + struct acpi_device *device = root->device; + struct pci_root_info *info = NULL; + int domain = root->segment; + int busnum = root->secondary.start; + LIST_HEAD(resources); struct pci_bus *bus; struct pci_sysdata *sd; int node; -#ifdef CONFIG_ACPI_NUMA - int pxm; -#endif if (domain && !pci_domains_supported) { - printk(KERN_WARNING "PCI: Multiple domains not supported " - "(dom %d, bus %d)\n", domain, busnum); + printk(KERN_WARNING "pci_bus %04x:%02x: " + "ignored (multiple domains not supported)\n", + domain, busnum); return NULL; } - node = -1; -#ifdef CONFIG_ACPI_NUMA - pxm = acpi_get_pxm(device->handle); - if (pxm >= 0) - node = pxm_to_node(pxm); - if (node != -1) - set_mp_bus_to_node(busnum, node); - else -#endif - node = get_mp_bus_to_node(busnum); + node = acpi_get_node(device->handle); + if (node == NUMA_NO_NODE) { + node = x86_pci_root_bus_node(busnum); + if (node != 0 && node != NUMA_NO_NODE) + dev_info(&device->dev, FW_BUG "no _PXM; falling back to node %d from hardware (may be inconsistent with ACPI node numbers)\n", + node); + } - if (node != -1 && !node_online(node)) - node = -1; + if (node != NUMA_NO_NODE && !node_online(node)) + node = NUMA_NO_NODE; - /* Allocate per-root-bus (not per bus) arch-specific data. - * TODO: leak; this memory is never freed. - * It's arguable whether it's worth the trouble to care. - */ - sd = kzalloc(sizeof(*sd), GFP_KERNEL); - if (!sd) { - printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + printk(KERN_WARNING "pci_bus %04x:%02x: " + "ignored (out of memory)\n", domain, busnum); return NULL; } + sd = &info->sd; sd->domain = domain; sd->node = node; /* @@ -205,30 +388,52 @@ struct pci_bus * __devinit pci_acpi_scan * be replaced by sd. */ memcpy(bus->sysdata, sd, sizeof(*sd)); - kfree(sd); + kfree(info); } else { - bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd); + probe_pci_root_info(info, device, busnum, domain); + + /* + * _CRS with no apertures is normal, so only fall back to + * defaults or native bridge info if we're ignoring _CRS. + */ + if (pci_use_crs) + add_resources(info, &resources); + else { + free_pci_root_info_res(info); + x86_pci_root_bus_resources(busnum, &resources); + } + + bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd, + &resources); if (bus) { - if (pci_probe & PCI_USE__CRS) - get_current_resources(device, busnum, domain, - bus); - bus->subordinate = pci_scan_child_bus(bus); + pci_scan_child_bus(bus); + bus->subordinate = root->secondary.end; + pci_set_host_bridge_release( + to_pci_host_bridge(bus->bridge), + release_pci_root_info, info); + } else { + pci_free_resource_list(&resources); + __release_pci_root_info(info); } } - if (!bus) - kfree(sd); + /* After the PCI-E bus has been walked and all devices discovered, + * configure any settings of the fabric that might be necessary. + */ + if (bus) { + struct pci_bus *child; + list_for_each_entry(child, &bus->children, node) { + struct pci_dev *self = child->self; + if (!self) + continue; - if (bus && node != -1) { -#ifdef CONFIG_ACPI_NUMA - if (pxm >= 0) - dev_printk(KERN_DEBUG, &bus->dev, - "on NUMA node %d (pxm %d)\n", node, pxm); -#else - dev_printk(KERN_DEBUG, &bus->dev, "on NUMA node %d\n", node); -#endif + pcie_bus_configure_settings(child, rh_get_mpss(self)); + } } + if (bus && node != NUMA_NO_NODE) + dev_printk(KERN_DEBUG, &bus->dev, "on NUMA node %d\n", node); + return bus; } diff -uprN linux-2.6.32/arch/x86/pci/amd_bus.c linux-2.6.32.ovz/arch/x86/pci/amd_bus.c --- linux-2.6.32/arch/x86/pci/amd_bus.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/pci/amd_bus.c 2015-03-10 13:24:08.000000000 -0700 @@ -2,266 +2,101 @@ #include #include #include +#include + +#include #include -#ifdef CONFIG_X86_64 #include -#include -#include -#endif - -/* - * This discovers the pcibus <-> node mapping on AMD K8. - * also get peer root bus resource for io,mmio - */ - -#ifdef CONFIG_X86_64 - -/* - * sub bus (transparent) will use entres from 3 to store extra from root, - * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES? - */ -#define RES_NUM 16 -struct pci_root_info { - char name[12]; - unsigned int res_num; - struct resource res[RES_NUM]; - int bus_min; - int bus_max; - int node; - int link; -}; - -/* 4 at this time, it may become to 32 */ -#define PCI_ROOT_NR 4 -static int pci_root_num; -static struct pci_root_info pci_root_info[PCI_ROOT_NR]; - -void x86_pci_root_bus_res_quirks(struct pci_bus *b) -{ - int i; - int j; - struct pci_root_info *info; - - /* don't go for it if _CRS is used already */ - if (b->resource[0] != &ioport_resource || - b->resource[1] != &iomem_resource) - return; - - /* if only one root bus, don't need to anything */ - if (pci_root_num < 2) - return; - - for (i = 0; i < pci_root_num; i++) { - if (pci_root_info[i].bus_min == b->number) - break; - } - - if (i == pci_root_num) - return; - - printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n", - b->number); - - info = &pci_root_info[i]; - for (j = 0; j < info->res_num; j++) { - struct resource *res; - struct resource *root; - - res = &info->res[j]; - b->resource[j] = res; - if (res->flags & IORESOURCE_IO) - root = &ioport_resource; - else - root = &iomem_resource; - insert_resource(root, res); - } -} - -#define RANGE_NUM 16 - -struct res_range { - size_t start; - size_t end; -}; - -static void __init update_range(struct res_range *range, size_t start, - size_t end) -{ - int i; - int j; - - for (j = 0; j < RANGE_NUM; j++) { - if (!range[j].end) - continue; - - if (start <= range[j].start && end >= range[j].end) { - range[j].start = 0; - range[j].end = 0; - continue; - } - - if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) { - range[j].start = end + 1; - continue; - } - - - if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) { - range[j].end = start - 1; - continue; - } - - if (start > range[j].start && end < range[j].end) { - /* find the new spare */ - for (i = 0; i < RANGE_NUM; i++) { - if (range[i].end == 0) - break; - } - if (i < RANGE_NUM) { - range[i].end = range[j].end; - range[i].start = end + 1; - } else { - printk(KERN_ERR "run of slot in ranges\n"); - } - range[j].end = start - 1; - continue; - } - } -} - -static void __init update_res(struct pci_root_info *info, size_t start, - size_t end, unsigned long flags, int merge) -{ - int i; - struct resource *res; - - if (!merge) - goto addit; - /* try to merge it with old one */ - for (i = 0; i < info->res_num; i++) { - size_t final_start, final_end; - size_t common_start, common_end; +#include "bus_numa.h" - res = &info->res[i]; - if (res->flags != flags) - continue; - - common_start = max((size_t)res->start, start); - common_end = min((size_t)res->end, end); - if (common_start > common_end + 1) - continue; +#define AMD_NB_F0_NODE_ID 0x60 +#define AMD_NB_F0_UNIT_ID 0x64 +#define AMD_NB_F1_CONFIG_MAP_REG 0xe0 - final_start = min((size_t)res->start, start); - final_end = max((size_t)res->end, end); +#define RANGE_NUM 16 +#define AMD_NB_F1_CONFIG_MAP_RANGES 4 - res->start = final_start; - res->end = final_end; - return; - } - -addit: - - /* need to add that */ - if (info->res_num >= RES_NUM) - return; - - res = &info->res[info->res_num]; - res->name = info->name; - res->flags = flags; - res->start = start; - res->end = end; - res->child = NULL; - info->res_num++; -} - -struct pci_hostbridge_probe { +struct amd_hostbridge { u32 bus; u32 slot; - u32 vendor; u32 device; }; -static struct pci_hostbridge_probe pci_probes[] __initdata = { - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1100 }, - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, - { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, - { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 }, +/* + * IMPORTANT NOTE: + * hb_probes[] and early_root_info_init() is in maintenance mode. + * It only supports K8, Fam10h, Fam11h, and Fam15h_00h-0fh . + * Future processor will rely on information in ACPI. + */ +static struct amd_hostbridge hb_probes[] __initdata = { + { 0, 0x18, 0x1100 }, /* K8 */ + { 0, 0x18, 0x1200 }, /* Family10h */ + { 0xff, 0, 0x1200 }, /* Family10h */ + { 0, 0x18, 0x1300 }, /* Family11h */ + { 0, 0x18, 0x1600 }, /* Family15h */ }; -static u64 __initdata fam10h_mmconf_start; -static u64 __initdata fam10h_mmconf_end; -static void __init get_pci_mmcfg_amd_fam10h_range(void) +static struct pci_root_info __init *find_pci_root_info(int node, int link) { - u32 address; - u64 base, msr; - unsigned segn_busn_bits; - - /* assume all cpus from fam10h have mmconf */ - if (boot_cpu_data.x86 < 0x10) - return; - - address = MSR_FAM10H_MMIO_CONF_BASE; - rdmsrl(address, msr); - - /* mmconfig is not enable */ - if (!(msr & FAM10H_MMIO_CONF_ENABLE)) - return; - - base = msr & (FAM10H_MMIO_CONF_BASE_MASK<> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) & - FAM10H_MMIO_CONF_BUSRANGE_MASK; + /* find the position */ + list_for_each_entry(info, &pci_root_infos, list) + if (info->node == node && info->link == link) + return info; - fam10h_mmconf_start = base; - fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1; + return NULL; } /** - * early_fill_mp_bus_to_node() + * early_root_info_init() * called before pcibios_scan_root and pci_scan_bus - * fills the mp_bus_to_cpumask array based according to the LDT Bus Number - * Registers found in the K8 northbridge + * fills the mp_bus_to_cpumask array based according + * to the LDT Bus Number Registers found in the northbridge. */ -static int __init early_fill_mp_bus_info(void) +static int __init early_root_info_init(void) { int i; - int j; unsigned bus; unsigned slot; - int found; int node; int link; int def_node; int def_link; struct pci_root_info *info; u32 reg; - struct resource *res; - size_t start; - size_t end; - struct res_range range[RANGE_NUM]; + u64 start; + u64 end; + struct range range[RANGE_NUM]; u64 val; u32 address; + bool found; + struct resource fam10h_mmconf_res, *fam10h_mmconf; + u64 fam10h_mmconf_start; + u64 fam10h_mmconf_end; if (!early_pci_allowed()) return -1; - found = 0; - for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { + found = false; + for (i = 0; i < ARRAY_SIZE(hb_probes); i++) { u32 id; u16 device; u16 vendor; - bus = pci_probes[i].bus; - slot = pci_probes[i].slot; + bus = hb_probes[i].bus; + slot = hb_probes[i].slot; id = read_pci_config(bus, slot, 0, PCI_VENDOR_ID); - vendor = id & 0xffff; device = (id>>16) & 0xffff; - if (pci_probes[i].vendor == vendor && - pci_probes[i].device == device) { - found = 1; + + if (vendor != PCI_VENDOR_ID_AMD) + continue; + + if (hb_probes[i].device == device) { + found = true; break; } } @@ -269,11 +104,16 @@ static int __init early_fill_mp_bus_info if (!found) return 0; - pci_root_num = 0; - for (i = 0; i < 4; i++) { + /* + * We should learn topology and routing information from _PXM and + * _CRS methods in the ACPI namespace. We extract node numbers + * here to work around BIOSes that don't supply _PXM. + */ + for (i = 0; i < AMD_NB_F1_CONFIG_MAP_RANGES; i++) { int min_bus; int max_bus; - reg = read_pci_config(bus, slot, 1, 0xe0 + (i << 2)); + reg = read_pci_config(bus, slot, 1, + AMD_NB_F1_CONFIG_MAP_REG + (i << 2)); /* Check if that register is enabled for bus range */ if ((reg & 7) != 3) @@ -282,25 +122,26 @@ static int __init early_fill_mp_bus_info min_bus = (reg >> 16) & 0xff; max_bus = (reg >> 24) & 0xff; node = (reg >> 4) & 0x07; -#ifdef CONFIG_NUMA - for (j = min_bus; j <= max_bus; j++) - set_mp_bus_to_node(j, node); -#endif link = (reg >> 8) & 0x03; - info = &pci_root_info[pci_root_num]; - info->bus_min = min_bus; - info->bus_max = max_bus; - info->node = node; - info->link = link; - sprintf(info->name, "PCI Bus #%02x", min_bus); - pci_root_num++; + info = alloc_pci_root_info(min_bus, max_bus, node, link); } + /* + * The following code extracts routing information for use on old + * systems where Linux doesn't automatically use host bridge _CRS + * methods (or when the user specifies "pci=nocrs"). + * + * We only do this through Fam11h, because _CRS should be enough on + * newer systems. + */ + if (boot_cpu_data.x86 > 0x11) + return 0; + /* get the default node and link for left over res */ - reg = read_pci_config(bus, slot, 0, 0x60); + reg = read_pci_config(bus, slot, 0, AMD_NB_F0_NODE_ID); def_node = (reg >> 8) & 0x07; - reg = read_pci_config(bus, slot, 0, 0x64); + reg = read_pci_config(bus, slot, 0, AMD_NB_F0_UNIT_ID); def_link = (reg >> 8) & 0x03; memset(range, 0, sizeof(range)); @@ -317,34 +158,23 @@ static int __init early_fill_mp_bus_info link = (reg >> 4) & 0x03; end = (reg & 0xfff000) | 0xfff; - /* find the position */ - for (j = 0; j < pci_root_num; j++) { - info = &pci_root_info[j]; - if (info->node == node && info->link == link) - break; - } - if (j == pci_root_num) + info = find_pci_root_info(node, link); + if (!info) continue; /* not found */ - info = &pci_root_info[j]; printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n", - node, link, (u64)start, (u64)end); + node, link, start, end); /* kernel only handle 16 bit only */ if (end > 0xffff) end = 0xffff; update_res(info, start, end, IORESOURCE_IO, 1); - update_range(range, start, end); + subtract_range(range, RANGE_NUM, start, end); } /* add left over io port range to def node/link, [0, 0xffff] */ /* find the position */ - for (j = 0; j < pci_root_num; j++) { - info = &pci_root_info[j]; - if (info->node == def_node && info->link == def_link) - break; - } - if (j < pci_root_num) { - info = &pci_root_info[j]; + info = find_pci_root_info(def_node, def_link); + if (info) { for (i = 0; i < RANGE_NUM; i++) { if (!range[i].end) continue; @@ -356,22 +186,27 @@ static int __init early_fill_mp_bus_info memset(range, 0, sizeof(range)); /* 0xfd00000000-0xffffffffff for HT */ - range[0].end = (0xfdULL<<32) - 1; + range[0].end = cap_resource((0xfdULL<<32) - 1); /* need to take out [0, TOM) for RAM*/ address = MSR_K8_TOP_MEM1; rdmsrl(address, val); end = (val & 0xffffff800000ULL); - printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20); + printk(KERN_INFO "TOM: %016llx aka %lldM\n", end, end>>20); if (end < (1ULL<<32)) - update_range(range, 0, end - 1); + subtract_range(range, RANGE_NUM, 0, end - 1); /* get mmconfig */ - get_pci_mmcfg_amd_fam10h_range(); + fam10h_mmconf = amd_get_mmconfig_range(&fam10h_mmconf_res); /* need to take out mmconf range */ - if (fam10h_mmconf_end) { - printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); - update_range(range, fam10h_mmconf_start, fam10h_mmconf_end); + if (fam10h_mmconf) { + printk(KERN_DEBUG "Fam 10h mmconf %pR\n", fam10h_mmconf); + fam10h_mmconf_start = fam10h_mmconf->start; + fam10h_mmconf_end = fam10h_mmconf->end; + subtract_range(range, RANGE_NUM, fam10h_mmconf_start, fam10h_mmconf_end); + } else { + fam10h_mmconf_start = 0; + fam10h_mmconf_end = 0; } /* mmio resource */ @@ -389,19 +224,13 @@ static int __init early_fill_mp_bus_info end <<= 8; end |= 0xffff; - /* find the position */ - for (j = 0; j < pci_root_num; j++) { - info = &pci_root_info[j]; - if (info->node == node && info->link == link) - break; - } - if (j == pci_root_num) - continue; /* not found */ + info = find_pci_root_info(node, link); - info = &pci_root_info[j]; + if (!info) + continue; printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]", - node, link, (u64)start, (u64)end); + node, link, start, end); /* * some sick allocation would have range overlap with fam10h * mmconf range, so need to update start and end. @@ -426,14 +255,14 @@ static int __init early_fill_mp_bus_info /* we got a hole */ endx = fam10h_mmconf_start - 1; update_res(info, start, endx, IORESOURCE_MEM, 0); - update_range(range, start, endx); - printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx); + subtract_range(range, RANGE_NUM, start, endx); + printk(KERN_CONT " ==> [%llx, %llx]", start, endx); start = fam10h_mmconf_end + 1; changed = 1; } if (changed) { if (start <= end) { - printk(KERN_CONT " %s [%llx, %llx]", endx?"and":"==>", (u64)start, (u64)end); + printk(KERN_CONT " %s [%llx, %llx]", endx ? "and" : "==>", start, end); } else { printk(KERN_CONT "%s\n", endx?"":" ==> none"); continue; @@ -441,8 +270,9 @@ static int __init early_fill_mp_bus_info } } - update_res(info, start, end, IORESOURCE_MEM, 1); - update_range(range, start, end); + update_res(info, cap_resource(start), cap_resource(end), + IORESOURCE_MEM, 1); + subtract_range(range, RANGE_NUM, start, end); printk(KERN_CONT "\n"); } @@ -456,63 +286,44 @@ static int __init early_fill_mp_bus_info address = MSR_K8_TOP_MEM2; rdmsrl(address, val); end = (val & 0xffffff800000ULL); - printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20); - update_range(range, 1ULL<<32, end - 1); + printk(KERN_INFO "TOM2: %016llx aka %lldM\n", end, end>>20); + subtract_range(range, RANGE_NUM, 1ULL<<32, end - 1); } /* * add left over mmio range to def node/link ? * that is tricky, just record range in from start_min to 4G */ - for (j = 0; j < pci_root_num; j++) { - info = &pci_root_info[j]; - if (info->node == def_node && info->link == def_link) - break; - } - if (j < pci_root_num) { - info = &pci_root_info[j]; - + info = find_pci_root_info(def_node, def_link); + if (info) { for (i = 0; i < RANGE_NUM; i++) { if (!range[i].end) continue; - update_res(info, range[i].start, range[i].end, + update_res(info, cap_resource(range[i].start), + cap_resource(range[i].end), IORESOURCE_MEM, 1); } } - for (i = 0; i < pci_root_num; i++) { - int res_num; + list_for_each_entry(info, &pci_root_infos, list) { int busnum; + struct pci_root_res *root_res; - info = &pci_root_info[i]; - res_num = info->res_num; - busnum = info->bus_min; - printk(KERN_DEBUG "bus: [%02x,%02x] on node %x link %x\n", - info->bus_min, info->bus_max, info->node, info->link); - for (j = 0; j < res_num; j++) { - res = &info->res[j]; - printk(KERN_DEBUG "bus: %02x index %x %s: [%llx, %llx]\n", - busnum, j, - (res->flags & IORESOURCE_IO)?"io port":"mmio", - res->start, res->end); - } + busnum = info->busn.start; + printk(KERN_DEBUG "bus: %pR on node %x link %x\n", + &info->busn, info->node, info->link); + list_for_each_entry(root_res, &info->resources, list) + printk(KERN_DEBUG "bus: %02x %pR\n", + busnum, &root_res->res); } return 0; } -#else /* !CONFIG_X86_64 */ - -static int __init early_fill_mp_bus_info(void) { return 0; } - -#endif /* !CONFIG_X86_64 */ - -/* common 32/64 bit code */ - #define ENABLE_CF8_EXT_CFG (1ULL << 46) -static void enable_pci_io_ecs(void *unused) +static void __cpuinit enable_pci_io_ecs(void *unused) { u64 reg; rdmsrl(MSR_AMD64_NB_CFG, reg); @@ -541,14 +352,45 @@ static struct notifier_block __cpuinitda .notifier_call = amd_cpu_notify, }; +static void __init pci_enable_pci_io_ecs(void) +{ +#ifdef CONFIG_AMD_NB + unsigned int i, n; + + for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) { + u8 bus = amd_nb_bus_dev_ranges[i].bus; + u8 slot = amd_nb_bus_dev_ranges[i].dev_base; + u8 limit = amd_nb_bus_dev_ranges[i].dev_limit; + + for (; slot < limit; ++slot) { + u32 val = read_pci_config(bus, slot, 3, 0); + + if (!early_is_amd_nb(val)) + continue; + + val = read_pci_config(bus, slot, 3, 0x8c); + if (!(val & (ENABLE_CF8_EXT_CFG >> 32))) { + val |= ENABLE_CF8_EXT_CFG >> 32; + write_pci_config(bus, slot, 3, 0x8c, val); + } + ++n; + } + } +#endif +} + static int __init pci_io_ecs_init(void) { int cpu; /* assume all cpus from fam10h have IO ECS */ - if (boot_cpu_data.x86 < 0x10) + if (boot_cpu_data.x86 < 0x10) return 0; + /* Try the PCI method first. */ + if (early_pci_allowed()) + pci_enable_pci_io_ecs(); + register_cpu_notifier(&amd_cpu_notifier); for_each_online_cpu(cpu) amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE, @@ -563,8 +405,10 @@ static int __init amd_postcore_init(void if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return 0; - early_fill_mp_bus_info(); - pci_io_ecs_init(); + early_root_info_init(); + + if (boot_cpu_data.x86 <= 0x16) + pci_io_ecs_init(); return 0; } diff -uprN linux-2.6.32/arch/x86/pci/bus_numa.c linux-2.6.32.ovz/arch/x86/pci/bus_numa.c --- linux-2.6.32/arch/x86/pci/bus_numa.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/pci/bus_numa.c 2015-03-10 13:24:08.000000000 -0700 @@ -0,0 +1,154 @@ +#include +#include +#include + +#include "bus_numa.h" + +LIST_HEAD(pci_root_infos); + +static struct pci_root_info *x86_find_pci_root_info(int bus) +{ + struct pci_root_info *info; + + list_for_each_entry(info, &pci_root_infos, list) + if (info->busn.start == bus) + return info; + + return NULL; +} + +int x86_pci_root_bus_node(int bus) +{ + struct pci_root_info *info = x86_find_pci_root_info(bus); + + if (!info) + return NUMA_NO_NODE; + + return info->node; +} + +void x86_pci_root_bus_resources(int bus, struct list_head *resources) +{ + struct pci_root_info *info = x86_find_pci_root_info(bus); + struct pci_root_res *root_res; + struct pci_host_bridge_window *window; + bool found = false; + + if (!info) + goto default_resources; + + printk(KERN_DEBUG "PCI: root bus %02x: hardware-probed resources\n", + bus); + + /* already added by acpi ? */ + list_for_each_entry(window, resources, list) + if (window->res->flags & IORESOURCE_BUS) { + found = true; + break; + } + + if (!found) + pci_add_resource(resources, &info->busn); + + list_for_each_entry(root_res, &info->resources, list) { + struct resource *res; + struct resource *root; + + res = &root_res->res; + pci_add_resource(resources, res); + if (res->flags & IORESOURCE_IO) + root = &ioport_resource; + else + root = &iomem_resource; + insert_resource(root, res); + } + return; + +default_resources: + /* + * We don't have any host bridge aperture information from the + * "native host bridge drivers," e.g., amd_bus or broadcom_bus, + * so fall back to the defaults historically used by pci_create_bus(). + */ + printk(KERN_DEBUG "PCI: root bus %02x: using default resources\n", bus); + pci_add_resource(resources, &ioport_resource); + pci_add_resource(resources, &iomem_resource); +} + +struct pci_root_info __init *alloc_pci_root_info(int bus_min, int bus_max, + int node, int link) +{ + struct pci_root_info *info; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + + if (!info) + return info; + + sprintf(info->name, "PCI Bus #%02x", bus_min); + + INIT_LIST_HEAD(&info->resources); + info->busn.name = info->name; + info->busn.start = bus_min; + info->busn.end = bus_max; + info->busn.flags = IORESOURCE_BUS; + info->node = node; + info->link = link; + + list_add_tail(&info->list, &pci_root_infos); + + return info; +} + +void __devinit update_res(struct pci_root_info *info, resource_size_t start, + resource_size_t end, unsigned long flags, int merge) +{ + struct resource *res; + struct pci_root_res *root_res; + + if (start > end) + return; + + if (start == MAX_RESOURCE) + return; + + if (!merge) + goto addit; + + /* try to merge it with old one */ + list_for_each_entry(root_res, &info->resources, list) { + resource_size_t final_start, final_end; + resource_size_t common_start, common_end; + + res = &root_res->res; + if (res->flags != flags) + continue; + + common_start = max(res->start, start); + common_end = min(res->end, end); + if (common_start > common_end + 1) + continue; + + final_start = min(res->start, start); + final_end = max(res->end, end); + + res->start = final_start; + res->end = final_end; + return; + } + +addit: + + /* need to add that */ + root_res = kzalloc(sizeof(*root_res), GFP_KERNEL); + if (!root_res) + return; + + res = &root_res->res; + res->name = info->name; + res->flags = flags; + res->start = start; + res->end = end; + + list_add_tail(&root_res->list, &info->resources); +} diff -uprN linux-2.6.32/arch/x86/pci/bus_numa.h linux-2.6.32.ovz/arch/x86/pci/bus_numa.h --- linux-2.6.32/arch/x86/pci/bus_numa.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/pci/bus_numa.h 2015-03-10 13:24:08.000000000 -0700 @@ -0,0 +1,27 @@ +#ifndef __BUS_NUMA_H +#define __BUS_NUMA_H +/* + * sub bus (transparent) will use entres from 3 to store extra from + * root, so need to make sure we have enough slot there, Should we + * increase PCI_BUS_NUM_RESOURCES? + */ +struct pci_root_res { + struct list_head list; + struct resource res; +}; + +struct pci_root_info { + struct list_head list; + char name[12]; + struct list_head resources; + struct resource busn; + int node; + int link; +}; + +extern struct list_head pci_root_infos; +struct pci_root_info *alloc_pci_root_info(int bus_min, int bus_max, + int node, int link); +extern void update_res(struct pci_root_info *info, resource_size_t start, + resource_size_t end, unsigned long flags, int merge); +#endif diff -uprN linux-2.6.32/arch/x86/pci/common.c linux-2.6.32.ovz/arch/x86/pci/common.c --- linux-2.6.32/arch/x86/pci/common.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/pci/common.c 2015-03-10 13:24:09.000000000 -0700 @@ -21,6 +21,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS unsigned int pci_early_dump_regs; static int pci_bf_sort; +static int smbios_type_b1_flag; int pci_routeirq; int noioapicquirk; #ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS @@ -30,7 +31,6 @@ int noioapicreroute = 1; #endif int pcibios_last_bus = -1; unsigned long pirq_table_addr; -struct pci_bus *pci_root_bus; struct pci_raw_ops *raw_pci_ops; struct pci_raw_ops *raw_pci_ext_ops; @@ -130,6 +130,23 @@ void __init dmi_check_skip_isa_align(voi static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) { struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE]; + struct resource *bar_r; + int bar; + + if (pci_probe & PCI_NOASSIGN_BARS) { + /* + * If the BIOS did not assign the BAR, zero out the + * resource so the kernel doesn't attmept to assign + * it later on in pci_assign_unassigned_resources + */ + for (bar = 0; bar <= PCI_STD_RESOURCE_END; bar++) { + bar_r = &dev->resource[bar]; + if (bar_r->start == 0 && bar_r->end != 0) { + bar_r->flags = 0; + bar_r->end = 0; + } + } + } if (pci_probe & PCI_NOASSIGN_ROMS) { if (rom_r->parent) @@ -151,9 +168,6 @@ void __devinit pcibios_fixup_bus(struct { struct pci_dev *dev; - /* root bus? */ - if (!b->parent) - x86_pci_root_bus_res_quirks(b); pci_read_bridge_bases(b); list_for_each_entry(dev, &b->devices, bus_list) pcibios_fixup_device_resources(dev); @@ -173,6 +187,39 @@ static int __devinit set_bf_sort(const s return 0; } +static void __devinit read_dmi_type_b1(const struct dmi_header *dm, + void *private_data) +{ + u8 *d = (u8 *)dm + 4; + + if (dm->type != 0xB1) + return; + switch (((*(u32 *)d) >> 9) & 0x03) { + case 0x00: + printk(KERN_INFO "dmi type 0xB1 record - unknown flag\n"); + break; + case 0x01: /* set pci=bfsort */ + smbios_type_b1_flag = 1; + break; + case 0x02: /* do not set pci=bfsort */ + smbios_type_b1_flag = 2; + break; + default: + break; + } +} + +static int __devinit find_sort_method(const struct dmi_system_id *d) +{ + dmi_walk(read_dmi_type_b1, NULL); + + if (smbios_type_b1_flag == 1) { + set_bf_sort(d); + return 0; + } + return -1; +} + /* * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus) */ @@ -241,6 +288,13 @@ static const struct dmi_system_id __devi }, }, { + .callback = find_sort_method, + .ident = "Dell System", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"), + }, + }, + { .callback = set_bf_sort, .ident = "HP ProLiant BL20p G3", .matches = { @@ -378,38 +432,28 @@ void __init dmi_check_pciprobe(void) dmi_check_system(pciprobe_dmi_table); } -struct pci_bus * __devinit pcibios_scan_root(int busnum) +void __devinit pcibios_scan_root(int busnum) { - struct pci_bus *bus = NULL; + struct pci_bus *bus; struct pci_sysdata *sd; + LIST_HEAD(resources); - while ((bus = pci_find_next_bus(bus)) != NULL) { - if (bus->number == busnum) { - /* Already scanned */ - return bus; - } - } - - /* Allocate per-root-bus (not per bus) arch-specific data. - * TODO: leak; this memory is never freed. - * It's arguable whether it's worth the trouble to care. - */ sd = kzalloc(sizeof(*sd), GFP_KERNEL); if (!sd) { - printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); - return NULL; + printk(KERN_ERR "PCI: OOM, skipping PCI bus %02x\n", busnum); + return; } - - sd->node = get_mp_bus_to_node(busnum); - + sd->node = x86_pci_root_bus_node(busnum); + x86_pci_root_bus_resources(busnum, &resources); printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum); - bus = pci_scan_bus_parented(NULL, busnum, &pci_root_ops, sd); - if (!bus) + bus = pci_scan_root_bus(NULL, busnum, &pci_root_ops, sd, &resources); + if (!bus) { + pci_free_resource_list(&resources); kfree(sd); - - return bus; + } } + extern u8 pci_cache_line_size; int __init pcibios_init(void) @@ -432,6 +476,22 @@ int __init pcibios_init(void) else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL) pci_cache_line_size = 128 >> 2; /* P4 */ + if (c->x86_clflush_size != (pci_cache_line_size <<2)) + printk(KERN_DEBUG "PCI: old code would have set cacheline " + "size to %d bytes, but clflush_size = %d\n", + pci_cache_line_size << 2, + c->x86_clflush_size); + + /* Once we know this logic works, all the above code can be deleted. */ + if (c->x86_clflush_size > 0) { + pci_cache_line_size = c->x86_clflush_size >> 2; + printk(KERN_DEBUG "PCI: pci_cache_line_size set to %d bytes\n", + pci_cache_line_size << 2); + } else { + pci_cache_line_size = 32 >> 2; + printk(KERN_DEBUG "PCI: Unknown cacheline size. Setting to 32 bytes\n"); + } + pcibios_resource_survey(); if (pci_bf_sort >= pci_force_bf) @@ -512,12 +572,18 @@ char * __devinit pcibios_setup(char *st } else if (!strcmp(str, "norom")) { pci_probe |= PCI_NOASSIGN_ROMS; return NULL; + } else if (!strcmp(str, "nobar")) { + pci_probe |= PCI_NOASSIGN_BARS; + return NULL; } else if (!strcmp(str, "assign-busses")) { pci_probe |= PCI_ASSIGN_ALL_BUSSES; return NULL; } else if (!strcmp(str, "use_crs")) { pci_probe |= PCI_USE__CRS; return NULL; + } else if (!strcmp(str, "nocrs")) { + pci_probe |= PCI_ROOT_NO_CRS; + return NULL; } else if (!strcmp(str, "earlydump")) { pci_early_dump_regs = 1; return NULL; @@ -572,100 +638,3 @@ int pci_ext_cfg_avail(struct pci_dev *de else return 0; } - -struct pci_bus * __devinit pci_scan_bus_on_node(int busno, struct pci_ops *ops, int node) -{ - struct pci_bus *bus = NULL; - struct pci_sysdata *sd; - - /* - * Allocate per-root-bus (not per bus) arch-specific data. - * TODO: leak; this memory is never freed. - * It's arguable whether it's worth the trouble to care. - */ - sd = kzalloc(sizeof(*sd), GFP_KERNEL); - if (!sd) { - printk(KERN_ERR "PCI: OOM, skipping PCI bus %02x\n", busno); - return NULL; - } - sd->node = node; - bus = pci_scan_bus(busno, ops, sd); - if (!bus) - kfree(sd); - - return bus; -} - -struct pci_bus * __devinit pci_scan_bus_with_sysdata(int busno) -{ - return pci_scan_bus_on_node(busno, &pci_root_ops, -1); -} - -/* - * NUMA info for PCI busses - * - * Early arch code is responsible for filling in reasonable values here. - * A node id of "-1" means "use current node". In other words, if a bus - * has a -1 node id, it's not tightly coupled to any particular chunk - * of memory (as is the case on some Nehalem systems). - */ -#ifdef CONFIG_NUMA - -#define BUS_NR 256 - -#ifdef CONFIG_X86_64 - -static int mp_bus_to_node[BUS_NR] = { - [0 ... BUS_NR - 1] = -1 -}; - -void set_mp_bus_to_node(int busnum, int node) -{ - if (busnum >= 0 && busnum < BUS_NR) - mp_bus_to_node[busnum] = node; -} - -int get_mp_bus_to_node(int busnum) -{ - int node = -1; - - if (busnum < 0 || busnum > (BUS_NR - 1)) - return node; - - node = mp_bus_to_node[busnum]; - - /* - * let numa_node_id to decide it later in dma_alloc_pages - * if there is no ram on that node - */ - if (node != -1 && !node_online(node)) - node = -1; - - return node; -} - -#else /* CONFIG_X86_32 */ - -static int mp_bus_to_node[BUS_NR] = { - [0 ... BUS_NR - 1] = -1 -}; - -void set_mp_bus_to_node(int busnum, int node) -{ - if (busnum >= 0 && busnum < BUS_NR) - mp_bus_to_node[busnum] = (unsigned char) node; -} - -int get_mp_bus_to_node(int busnum) -{ - int node; - - if (busnum < 0 || busnum > (BUS_NR - 1)) - return 0; - node = mp_bus_to_node[busnum]; - return node; -} - -#endif /* CONFIG_X86_32 */ - -#endif /* CONFIG_NUMA */ diff -uprN linux-2.6.32/arch/x86/pci/fixup.c linux-2.6.32.ovz/arch/x86/pci/fixup.c --- linux-2.6.32/arch/x86/pci/fixup.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/pci/fixup.c 2015-03-10 13:24:08.000000000 -0700 @@ -7,6 +7,7 @@ #include #include #include +#include static void __devinit pci_fixup_i450nx(struct pci_dev *d) { @@ -25,9 +26,9 @@ static void __devinit pci_fixup_i450nx(s dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); if (busno) - pci_scan_bus_with_sysdata(busno); /* Bus A */ + pcibios_scan_root(busno); /* Bus A */ if (suba < subb) - pci_scan_bus_with_sysdata(suba+1); /* Bus B */ + pcibios_scan_root(suba+1); /* Bus B */ } pcibios_last_bus = -1; } @@ -42,7 +43,7 @@ static void __devinit pci_fixup_i450gx(s u8 busno; pci_read_config_byte(d, 0x4a, &busno); dev_info(&d->dev, "i440KX/GX host bridge; secondary bus %02x\n", busno); - pci_scan_bus_with_sysdata(busno); + pcibios_scan_root(busno); pcibios_last_bus = -1; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82454GX, pci_fixup_i450gx); @@ -164,11 +165,11 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_V */ static void __devinit pci_fixup_transparent_bridge(struct pci_dev *dev) { - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && - (dev->device & 0xff00) == 0x2400) + if ((dev->device & 0xff00) == 0x2400) dev->transparent = 1; } -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixup_transparent_bridge); +DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, 8, pci_fixup_transparent_bridge); /* * Fixup for C1 Halt Disconnect problem on nForce2 systems. @@ -322,9 +323,6 @@ static void __devinit pci_fixup_video(st struct pci_bus *bus; u16 config; - if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA) - return; - /* Is VGA routed to us? */ bus = pdev->bus; while (bus) { @@ -351,9 +349,12 @@ static void __devinit pci_fixup_video(st if (config & (PCI_COMMAND_IO | PCI_COMMAND_MEMORY)) { pdev->resource[PCI_ROM_RESOURCE].flags |= IORESOURCE_ROM_SHADOW; dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n"); + if (!vga_default_device()) + vga_set_default_device(pdev); } } -DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video); +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID, + PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video); static const struct dmi_system_id __devinitconst msi_k8t_dmi_table[] = { diff -uprN linux-2.6.32/arch/x86/pci/i386.c linux-2.6.32.ovz/arch/x86/pci/i386.c --- linux-2.6.32/arch/x86/pci/i386.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/pci/i386.c 2015-03-10 13:24:05.000000000 -0700 @@ -129,7 +129,9 @@ static void __init pcibios_allocate_bus_ continue; if (!r->start || pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); + dev_info(&dev->dev, + "can't reserve window %pR\n", + r); /* * Something is wrong with the region. * Invalidate the resource to prevent @@ -144,16 +146,29 @@ static void __init pcibios_allocate_bus_ } } +struct pci_check_idx_range { + int start; + int end; +}; + static void __init pcibios_allocate_resources(int pass) { struct pci_dev *dev = NULL; - int idx, disabled; + int idx, disabled, i; u16 command; struct resource *r; + struct pci_check_idx_range idx_range[] = { + { PCI_STD_RESOURCES, PCI_STD_RESOURCE_END }, +#ifdef CONFIG_PCI_IOV + { PCI_IOV_RESOURCES, PCI_IOV_RESOURCE_END }, +#endif + }; + for_each_pci_dev(dev) { pci_read_config_word(dev, PCI_COMMAND, &command); - for (idx = 0; idx < PCI_ROM_RESOURCE; idx++) { + for (i = 0; i < ARRAY_SIZE(idx_range); i++) + for (idx = idx_range[i].start; idx <= idx_range[i].end; idx++) { r = &dev->resource[idx]; if (r->parent) /* Already allocated */ continue; @@ -164,13 +179,15 @@ static void __init pcibios_allocate_reso else disabled = !(command & PCI_COMMAND_MEMORY); if (pass == disabled) { - dev_dbg(&dev->dev, "resource %#08llx-%#08llx (f=%lx, d=%d, p=%d)\n", - (unsigned long long) r->start, - (unsigned long long) r->end, - r->flags, disabled, pass); + dev_dbg(&dev->dev, + "BAR %d: reserving %pr (d=%d, p=%d)\n", + idx, r, disabled, pass); if (pci_claim_resource(dev, idx) < 0) { - dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx); + struct pci_dev_rh1 *rh1_pci_dev = dev->rh_reserved1; + dev_info(&dev->dev, + "can't reserve %pR\n", r); /* We'll assign a new address later */ + rh1_pci_dev->fw_addr[idx] = r->start; r->end -= r->start; r->start = 0; } @@ -182,7 +199,7 @@ static void __init pcibios_allocate_reso /* Turn the ROM off, leave the resource region, * but keep it unregistered. */ u32 reg; - dev_dbg(&dev->dev, "disabling ROM\n"); + dev_dbg(&dev->dev, "disabling ROM %pR\n", r); r->flags &= ~IORESOURCE_ROM_ENABLE; pci_read_config_dword(dev, dev->rom_base_reg, ®); @@ -242,10 +259,6 @@ void __init pcibios_resource_survey(void */ fs_initcall(pcibios_assign_resources); -void __weak x86_pci_root_bus_res_quirks(struct pci_bus *b) -{ -} - /* * If we set up a device for bus mastering, we need to check the latency * timer as certain crappy BIOSes forget to set it properly. @@ -282,6 +295,15 @@ int pci_mmap_page_range(struct pci_dev * return -EINVAL; prot = pgprot_val(vma->vm_page_prot); + + /* + * Return error if pat is not enabled and write_combine is requested. + * Caller can followup with UC MINUS request and add a WC mtrr if there + * is a free mtrr slot. + */ + if (!pat_enabled && write_combine) + return -EINVAL; + if (pat_enabled && write_combine) prot |= _PAGE_CACHE_WC; else if (pat_enabled || boot_cpu_data.x86 > 3) diff -uprN linux-2.6.32/arch/x86/pci/irq.c linux-2.6.32.ovz/arch/x86/pci/irq.c --- linux-2.6.32/arch/x86/pci/irq.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/pci/irq.c 2015-03-10 13:24:08.000000000 -0700 @@ -137,13 +137,9 @@ static void __init pirq_peer_trick(void) busmap[e->bus] = 1; } for (i = 1; i < 256; i++) { - int node; if (!busmap[i] || pci_find_bus(0, i)) continue; - node = get_mp_bus_to_node(i); - if (pci_scan_bus_on_node(i, &pci_root_ops, node)) - printk(KERN_INFO "PCI: Discovered primary peer " - "bus %02x [IRQ]\n", i); + pcibios_scan_root(i); } pcibios_last_bus = -1; } @@ -585,19 +581,27 @@ static __init int intel_router_probe(str case PCI_DEVICE_ID_INTEL_ICH9_3: case PCI_DEVICE_ID_INTEL_ICH9_4: case PCI_DEVICE_ID_INTEL_ICH9_5: - case PCI_DEVICE_ID_INTEL_TOLAPAI_0: + case PCI_DEVICE_ID_INTEL_EP80579_0: case PCI_DEVICE_ID_INTEL_ICH10_0: case PCI_DEVICE_ID_INTEL_ICH10_1: case PCI_DEVICE_ID_INTEL_ICH10_2: case PCI_DEVICE_ID_INTEL_ICH10_3: + case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_0: + case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_1: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; } - if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) && - (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) { + if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN && + device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX) + || (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN && + device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX) + || (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN && + device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX) + || (device >= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MIN && + device <= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MAX)) { r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; diff -uprN linux-2.6.32/arch/x86/pci/legacy.c linux-2.6.32.ovz/arch/x86/pci/legacy.c --- linux-2.6.32/arch/x86/pci/legacy.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/pci/legacy.c 2015-03-10 13:24:08.000000000 -0700 @@ -11,28 +11,14 @@ */ static void __devinit pcibios_fixup_peer_bridges(void) { - int n, devfn; - long node; + int n; if (pcibios_last_bus <= 0 || pcibios_last_bus > 0xff) return; DBG("PCI: Peer bridge fixup\n"); - for (n=0; n <= pcibios_last_bus; n++) { - u32 l; - if (pci_find_bus(0, n)) - continue; - node = get_mp_bus_to_node(n); - for (devfn = 0; devfn < 256; devfn += 8) { - if (!raw_pci_read(0, n, devfn, PCI_VENDOR_ID, 2, &l) && - l != 0x0000 && l != 0xffff) { - DBG("Found device at %02x:%02x [%04x]\n", n, devfn, l); - printk(KERN_INFO "PCI: Discovered peer bus %02x\n", n); - pci_scan_bus_on_node(n, &pci_root_ops, node); - break; - } - } - } + for (n=0; n <= pcibios_last_bus; n++) + pcibios_scan_specific_bus(n); } static int __init pci_legacy_init(void) @@ -46,13 +32,30 @@ static int __init pci_legacy_init(void) return 0; printk("PCI: Probing PCI hardware\n"); - pci_root_bus = pcibios_scan_root(0); - if (pci_root_bus) - pci_bus_add_devices(pci_root_bus); - + pcibios_scan_root(0); return 0; } +void __devinit pcibios_scan_specific_bus(int busn) +{ + int devfn; + u32 l; + + if (pci_find_bus(0, busn)) + return; + + for (devfn = 0; devfn < 256; devfn += 8) { + if (!raw_pci_read(0, busn, devfn, PCI_VENDOR_ID, 2, &l) && + l != 0x0000 && l != 0xffff) { + DBG("Found device at %02x:%02x [%04x]\n", busn, devfn, l); + printk(KERN_INFO "PCI: Discovered peer bus %02x\n", busn); + pcibios_scan_root(busn); + return; + } + } +} +EXPORT_SYMBOL_GPL(pcibios_scan_specific_bus); + int __init pci_subsys_init(void) { #ifdef CONFIG_X86_NUMAQ diff -uprN linux-2.6.32/arch/x86/pci/Makefile linux-2.6.32.ovz/arch/x86/pci/Makefile --- linux-2.6.32/arch/x86/pci/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/pci/Makefile 2015-03-10 13:24:08.000000000 -0700 @@ -14,4 +14,10 @@ obj-$(CONFIG_X86_VISWS) += visws.o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o obj-y += common.o early.o -obj-y += amd_bus.o +obj-y += bus_numa.o + +obj-$(CONFIG_AMD_NB) += amd_bus.o + +ifeq ($(CONFIG_PCI_DEBUG),y) +EXTRA_CFLAGS += -DDEBUG +endif diff -uprN linux-2.6.32/arch/x86/pci/mmconfig-shared.c linux-2.6.32.ovz/arch/x86/pci/mmconfig-shared.c --- linux-2.6.32/arch/x86/pci/mmconfig-shared.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/pci/mmconfig-shared.c 2015-03-10 13:24:02.000000000 -0700 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -527,18 +528,31 @@ reject: static int __initdata known_bridge; -static int acpi_mcfg_64bit_base_addr __initdata = FALSE; - /* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ struct acpi_mcfg_allocation *pci_mmcfg_config; int pci_mmcfg_config_num; -static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) +static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg, + struct acpi_mcfg_allocation *cfg) { - if (!strcmp(mcfg->header.oem_id, "SGI")) - acpi_mcfg_64bit_base_addr = TRUE; + int year; - return 0; + if (cfg->address < 0xFFFFFFFF) + return 0; + + if (!strncmp(mcfg->header.oem_id, "SGI", 3)) + return 0; + + if (mcfg->header.revision >= 1) { + if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && + year >= 2010) + return 0; + } + + printk(KERN_ERR PREFIX "MCFG region for %04x:%02x-%02x at %#llx " + "is above 4GB, ignored\n", cfg->pci_segment, + cfg->start_bus_number, cfg->end_bus_number, cfg->address); + return -EINVAL; } static int __init pci_parse_mcfg(struct acpi_table_header *header) @@ -574,13 +588,8 @@ static int __init pci_parse_mcfg(struct memcpy(pci_mmcfg_config, &mcfg[1], config_size); - acpi_mcfg_oem_check(mcfg); - for (i = 0; i < pci_mmcfg_config_num; ++i) { - if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) && - !acpi_mcfg_64bit_base_addr) { - printk(KERN_ERR PREFIX - "MMCONFIG not in low 4GB of memory\n"); + if (acpi_mcfg_check_entry(mcfg, &pci_mmcfg_config[i])) { kfree(pci_mmcfg_config); pci_mmcfg_config_num = 0; return -ENODEV; diff -uprN linux-2.6.32/arch/x86/pci/numaq_32.c linux-2.6.32.ovz/arch/x86/pci/numaq_32.c --- linux-2.6.32/arch/x86/pci/numaq_32.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/pci/numaq_32.c 2015-03-10 13:24:08.000000000 -0700 @@ -137,11 +137,11 @@ static void __devinit pci_fixup_i450nx(s pxb, busno, suba, subb); if (busno) { /* Bus A */ - pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno)); + pcibios_scan_root(QUADLOCAL2BUS(quad, busno)); } if (suba < subb) { /* Bus B */ - pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, suba+1)); + pcibios_scan_root(QUADLOCAL2BUS(quad, suba+1)); } } pcibios_last_bus = -1; @@ -160,16 +160,14 @@ int __init pci_numaq_init(void) if (pcibios_scanned++) return 0; - pci_root_bus = pcibios_scan_root(0); - if (pci_root_bus) - pci_bus_add_devices(pci_root_bus); + pcibios_scan_root(0); if (num_online_nodes() > 1) for_each_online_node(quad) { if (quad == 0) continue; printk("Scanning PCI bus %d for quad %d\n", QUADLOCAL2BUS(quad,0), quad); - pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, 0)); + pcibios_scan_root(QUADLOCAL2BUS(quad, 0)); } return 0; } diff -uprN linux-2.6.32/arch/x86/pci/visws.c linux-2.6.32.ovz/arch/x86/pci/visws.c --- linux-2.6.32/arch/x86/pci/visws.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/pci/visws.c 2015-03-10 13:24:08.000000000 -0700 @@ -86,8 +86,8 @@ int __init pci_visws_init(void) "bridge B (PIIX4) bus: %u\n", pci_bus1, pci_bus0); raw_pci_ops = &pci_direct_conf1; - pci_scan_bus_with_sysdata(pci_bus0); - pci_scan_bus_with_sysdata(pci_bus1); + pcibios_scan_root(pci_bus0); + pcibios_scan_root(pci_bus1); pci_fixup_irqs(pci_common_swizzle, visws_map_irq); pcibios_resource_survey(); return 0; diff -uprN linux-2.6.32/arch/x86/power/cpu.c linux-2.6.32.ovz/arch/x86/power/cpu.c --- linux-2.6.32/arch/x86/power/cpu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/power/cpu.c 2015-03-10 13:24:08.000000000 -0700 @@ -10,6 +10,7 @@ #include #include +#include #include #include @@ -110,6 +111,7 @@ static void __save_processor_state(struc void save_processor_state(void) { __save_processor_state(&saved_context); + x86_platform.save_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(save_processor_state); @@ -242,7 +244,9 @@ static void __restore_processor_state(st fix_processor_context(); do_fpu_end(); + x86_platform.restore_sched_clock_state(); mtrr_bp_restore(); + perf_restore_debug_store(); } /* Needed by apm.c */ diff -uprN linux-2.6.32/arch/x86/power/hibernate_asm_32.S linux-2.6.32.ovz/arch/x86/power/hibernate_asm_32.S --- linux-2.6.32/arch/x86/power/hibernate_asm_32.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/power/hibernate_asm_32.S 2015-03-10 13:21:24.000000000 -0700 @@ -27,10 +27,17 @@ ENTRY(swsusp_arch_suspend) ret ENTRY(restore_image) + movl mmu_cr4_features, %ecx movl resume_pg_dir, %eax subl $__PAGE_OFFSET, %eax movl %eax, %cr3 + jecxz 1f # cr4 Pentium and higher, skip if zero + andl $~(X86_CR4_PGE), %ecx + movl %ecx, %cr4; # turn off PGE + movl %cr3, %eax; # flush TLB + movl %eax, %cr3 +1: movl restore_pblist, %edx .p2align 4,,7 @@ -54,16 +61,8 @@ done: movl $swapper_pg_dir, %eax subl $__PAGE_OFFSET, %eax movl %eax, %cr3 - /* Flush TLB, including "global" things (vmalloc) */ movl mmu_cr4_features, %ecx jecxz 1f # cr4 Pentium and higher, skip if zero - movl %ecx, %edx - andl $~(X86_CR4_PGE), %edx - movl %edx, %cr4; # turn off PGE -1: - movl %cr3, %eax; # flush TLB - movl %eax, %cr3 - jecxz 1f # cr4 Pentium and higher, skip if zero movl %ecx, %cr4; # turn PGE back on 1: diff -uprN linux-2.6.32/arch/x86/tools/chkobjdump.awk linux-2.6.32.ovz/arch/x86/tools/chkobjdump.awk --- linux-2.6.32/arch/x86/tools/chkobjdump.awk 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/tools/chkobjdump.awk 2015-03-10 13:21:31.000000000 -0700 @@ -0,0 +1,33 @@ +# GNU objdump version checker +# +# Usage: +# objdump -v | awk -f chkobjdump.awk +BEGIN { + # objdump version 2.19 or later is OK for the test. + od_ver = 2; + od_sver = 19; +} + +/^GNU objdump/ { + verstr = "" + for (i = 3; i <= NF; i++) + if (match($(i), "^[0-9]")) { + verstr = $(i); + break; + } + if (verstr == "") { + printf("Warning: Failed to find objdump version number.\n"); + exit 0; + } + split(verstr, ver, "."); + if (ver[1] > od_ver || + (ver[1] == od_ver && ver[2] >= od_sver)) { + exit 1; + } else { + printf("Warning: objdump version %s is older than %d.%d\n", + verstr, od_ver, od_sver); + print("Warning: Skipping posttest."); + # Logic is inverted, because we just skip test without error. + exit 0; + } +} diff -uprN linux-2.6.32/arch/x86/tools/distill.awk linux-2.6.32.ovz/arch/x86/tools/distill.awk --- linux-2.6.32/arch/x86/tools/distill.awk 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/tools/distill.awk 2015-03-10 13:21:31.000000000 -0700 @@ -0,0 +1,47 @@ +#!/bin/awk -f +# Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len +# Distills the disassembly as follows: +# - Removes all lines except the disassembled instructions. +# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes +# into a single line. +# - Remove bad(or prefix only) instructions + +BEGIN { + prev_addr = "" + prev_hex = "" + prev_mnemonic = "" + bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))" + fwait_expr = "^9b " + fwait_str="9b\tfwait" +} + +/^ *[0-9a-f]+ <[^>]*>:/ { + # Symbol entry + printf("%s%s\n", $2, $1) +} + +/^ *[0-9a-f]+:/ { + if (split($0, field, "\t") < 3) { + # This is a continuation of the same insn. + prev_hex = prev_hex field[2] + } else { + # Skip bad instructions + if (match(prev_mnemonic, bad_expr)) + prev_addr = "" + # Split fwait from other f* instructions + if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") { + printf "%s\t%s\n", prev_addr, fwait_str + sub(fwait_expr, "", prev_hex) + } + if (prev_addr != "") + printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic + prev_addr = field[1] + prev_hex = field[2] + prev_mnemonic = field[3] + } +} + +END { + if (prev_addr != "") + printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic +} diff -uprN linux-2.6.32/arch/x86/tools/gen-insn-attr-x86.awk linux-2.6.32.ovz/arch/x86/tools/gen-insn-attr-x86.awk --- linux-2.6.32/arch/x86/tools/gen-insn-attr-x86.awk 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/tools/gen-insn-attr-x86.awk 2015-03-10 13:21:31.000000000 -0700 @@ -0,0 +1,378 @@ +#!/bin/awk -f +# gen-insn-attr-x86.awk: Instruction attribute table generator +# Written by Masami Hiramatsu +# +# Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c + +# Awk implementation sanity check +function check_awk_implement() { + if (sprintf("%x", 0) != "0") + return "Your awk has a printf-format problem." + return "" +} + +# Clear working vars +function clear_vars() { + delete table + delete lptable2 + delete lptable1 + delete lptable3 + eid = -1 # escape id + gid = -1 # group id + aid = -1 # AVX id + tname = "" +} + +BEGIN { + # Implementation error checking + awkchecked = check_awk_implement() + if (awkchecked != "") { + print "Error: " awkchecked > "/dev/stderr" + print "Please try to use gawk." > "/dev/stderr" + exit 1 + } + + # Setup generating tables + print "/* x86 opcode map generated from x86-opcode-map.txt */" + print "/* Do not change this code. */\n" + ggid = 1 + geid = 1 + gaid = 0 + delete etable + delete gtable + delete atable + + opnd_expr = "^[A-Za-z/]" + ext_expr = "^\\(" + sep_expr = "^\\|$" + group_expr = "^Grp[0-9A-Za-z]+" + + imm_expr = "^[IJAO][a-z]" + imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" + imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)" + imm_flag["Id"] = "INAT_MAKE_IMM(INAT_IMM_DWORD)" + imm_flag["Iq"] = "INAT_MAKE_IMM(INAT_IMM_QWORD)" + imm_flag["Ap"] = "INAT_MAKE_IMM(INAT_IMM_PTR)" + imm_flag["Iz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)" + imm_flag["Jz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)" + imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)" + imm_flag["Ob"] = "INAT_MOFFSET" + imm_flag["Ov"] = "INAT_MOFFSET" + + modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" + force64_expr = "\\([df]64\\)" + rex_expr = "^REX(\\.[XRWB]+)*" + fpu_expr = "^ESC" # TODO + + lprefix1_expr = "\\(66\\)" + lprefix2_expr = "\\(F3\\)" + lprefix3_expr = "\\(F2\\)" + max_lprefix = 4 + + vexok_expr = "\\(VEX\\)" + vexonly_expr = "\\(oVEX\\)" + + prefix_expr = "\\(Prefix\\)" + prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" + prefix_num["REPNE"] = "INAT_PFX_REPNE" + prefix_num["REP/REPE"] = "INAT_PFX_REPE" + prefix_num["LOCK"] = "INAT_PFX_LOCK" + prefix_num["SEG=CS"] = "INAT_PFX_CS" + prefix_num["SEG=DS"] = "INAT_PFX_DS" + prefix_num["SEG=ES"] = "INAT_PFX_ES" + prefix_num["SEG=FS"] = "INAT_PFX_FS" + prefix_num["SEG=GS"] = "INAT_PFX_GS" + prefix_num["SEG=SS"] = "INAT_PFX_SS" + prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ" + prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2" + prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3" + + clear_vars() +} + +function semantic_error(msg) { + print "Semantic error at " NR ": " msg > "/dev/stderr" + exit 1 +} + +function debug(msg) { + print "DEBUG: " msg +} + +function array_size(arr, i,c) { + c = 0 + for (i in arr) + c++ + return c +} + +/^Table:/ { + print "/* " $0 " */" + if (tname != "") + semantic_error("Hit Table: before EndTable:."); +} + +/^Referrer:/ { + if (NF != 1) { + # escape opcode table + ref = "" + for (i = 2; i <= NF; i++) + ref = ref $i + eid = escape[ref] + tname = sprintf("inat_escape_table_%d", eid) + } +} + +/^AVXcode:/ { + if (NF != 1) { + # AVX/escape opcode table + aid = $2 + if (gaid <= aid) + gaid = aid + 1 + if (tname == "") # AVX only opcode table + tname = sprintf("inat_avx_table_%d", $2) + } + if (aid == -1 && eid == -1) # primary opcode table + tname = "inat_primary_table" +} + +/^GrpTable:/ { + print "/* " $0 " */" + if (!($2 in group)) + semantic_error("No group: " $2 ) + gid = group[$2] + tname = "inat_group_table_" gid +} + +function print_table(tbl,name,fmt,n) +{ + print "const insn_attr_t " name " = {" + for (i = 0; i < n; i++) { + id = sprintf(fmt, i) + if (tbl[id]) + print " [" id "] = " tbl[id] "," + } + print "};" +} + +/^EndTable/ { + if (gid != -1) { + # print group tables + if (array_size(table) != 0) { + print_table(table, tname "[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,0] = tname + } + if (array_size(lptable1) != 0) { + print_table(lptable1, tname "_1[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,1] = tname "_1" + } + if (array_size(lptable2) != 0) { + print_table(lptable2, tname "_2[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,2] = tname "_2" + } + if (array_size(lptable3) != 0) { + print_table(lptable3, tname "_3[INAT_GROUP_TABLE_SIZE]", + "0x%x", 8) + gtable[gid,3] = tname "_3" + } + } else { + # print primary/escaped tables + if (array_size(table) != 0) { + print_table(table, tname "[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,0] = tname + if (aid >= 0) + atable[aid,0] = tname + } + if (array_size(lptable1) != 0) { + print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,1] = tname "_1" + if (aid >= 0) + atable[aid,1] = tname "_1" + } + if (array_size(lptable2) != 0) { + print_table(lptable2,tname "_2[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,2] = tname "_2" + if (aid >= 0) + atable[aid,2] = tname "_2" + } + if (array_size(lptable3) != 0) { + print_table(lptable3,tname "_3[INAT_OPCODE_TABLE_SIZE]", + "0x%02x", 256) + etable[eid,3] = tname "_3" + if (aid >= 0) + atable[aid,3] = tname "_3" + } + } + print "" + clear_vars() +} + +function add_flags(old,new) { + if (old && new) + return old " | " new + else if (old) + return old + else + return new +} + +# convert operands to flags. +function convert_operands(count,opnd, i,j,imm,mod) +{ + imm = null + mod = null + for (j = 1; j <= count; j++) { + i = opnd[j] + if (match(i, imm_expr) == 1) { + if (!imm_flag[i]) + semantic_error("Unknown imm opnd: " i) + if (imm) { + if (i != "Ib") + semantic_error("Second IMM error") + imm = add_flags(imm, "INAT_SCNDIMM") + } else + imm = imm_flag[i] + } else if (match(i, modrm_expr)) + mod = "INAT_MODRM" + } + return add_flags(imm, mod) +} + +/^[0-9a-f]+\:/ { + if (NR == 1) + next + # get index + idx = "0x" substr($1, 1, index($1,":") - 1) + if (idx in table) + semantic_error("Redefine " idx " in " tname) + + # check if escaped opcode + if ("escape" == $2) { + if ($3 != "#") + semantic_error("No escaped name") + ref = "" + for (i = 4; i <= NF; i++) + ref = ref $i + if (ref in escape) + semantic_error("Redefine escape (" ref ")") + escape[ref] = geid + geid++ + table[idx] = "INAT_MAKE_ESCAPE(" escape[ref] ")" + next + } + + variant = null + # converts + i = 2 + while (i <= NF) { + opcode = $(i++) + delete opnds + ext = null + flags = null + opnd = null + # parse one opcode + if (match($i, opnd_expr)) { + opnd = $i + count = split($(i++), opnds, ",") + flags = convert_operands(count, opnds) + } + if (match($i, ext_expr)) + ext = $(i++) + if (match($i, sep_expr)) + i++ + else if (i < NF) + semantic_error($i " is not a separator") + + # check if group opcode + if (match(opcode, group_expr)) { + if (!(opcode in group)) { + group[opcode] = ggid + ggid++ + } + flags = add_flags(flags, "INAT_MAKE_GROUP(" group[opcode] ")") + } + # check force(or default) 64bit + if (match(ext, force64_expr)) + flags = add_flags(flags, "INAT_FORCE64") + + # check REX prefix + if (match(opcode, rex_expr)) + flags = add_flags(flags, "INAT_MAKE_PREFIX(INAT_PFX_REX)") + + # check coprocessor escape : TODO + if (match(opcode, fpu_expr)) + flags = add_flags(flags, "INAT_MODRM") + + # check VEX only code + if (match(ext, vexonly_expr)) + flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") + + # check VEX only code + if (match(ext, vexok_expr)) + flags = add_flags(flags, "INAT_VEXOK") + + # check prefixes + if (match(ext, prefix_expr)) { + if (!prefix_num[opcode]) + semantic_error("Unknown prefix: " opcode) + flags = add_flags(flags, "INAT_MAKE_PREFIX(" prefix_num[opcode] ")") + } + if (length(flags) == 0) + continue + # check if last prefix + if (match(ext, lprefix1_expr)) { + lptable1[idx] = add_flags(lptable1[idx],flags) + variant = "INAT_VARIANT" + } else if (match(ext, lprefix2_expr)) { + lptable2[idx] = add_flags(lptable2[idx],flags) + variant = "INAT_VARIANT" + } else if (match(ext, lprefix3_expr)) { + lptable3[idx] = add_flags(lptable3[idx],flags) + variant = "INAT_VARIANT" + } else { + table[idx] = add_flags(table[idx],flags) + } + } + if (variant) + table[idx] = add_flags(table[idx],variant) +} + +END { + if (awkchecked != "") + exit 1 + # print escape opcode map's array + print "/* Escape opcode map array */" + print "const insn_attr_t const *inat_escape_tables[INAT_ESC_MAX + 1]" \ + "[INAT_LSTPFX_MAX + 1] = {" + for (i = 0; i < geid; i++) + for (j = 0; j < max_lprefix; j++) + if (etable[i,j]) + print " ["i"]["j"] = "etable[i,j]"," + print "};\n" + # print group opcode map's array + print "/* Group opcode map array */" + print "const insn_attr_t const *inat_group_tables[INAT_GRP_MAX + 1]"\ + "[INAT_LSTPFX_MAX + 1] = {" + for (i = 0; i < ggid; i++) + for (j = 0; j < max_lprefix; j++) + if (gtable[i,j]) + print " ["i"]["j"] = "gtable[i,j]"," + print "};\n" + # print AVX opcode map's array + print "/* AVX opcode map array */" + print "const insn_attr_t const *inat_avx_tables[X86_VEX_M_MAX + 1]"\ + "[INAT_LSTPFX_MAX + 1] = {" + for (i = 0; i < gaid; i++) + for (j = 0; j < max_lprefix; j++) + if (atable[i,j]) + print " ["i"]["j"] = "atable[i,j]"," + print "};" +} + diff -uprN linux-2.6.32/arch/x86/tools/Makefile linux-2.6.32.ovz/arch/x86/tools/Makefile --- linux-2.6.32/arch/x86/tools/Makefile 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/tools/Makefile 2015-03-10 13:21:31.000000000 -0700 @@ -0,0 +1,31 @@ +PHONY += posttest + +ifeq ($(KBUILD_VERBOSE),1) + posttest_verbose = -v +else + posttest_verbose = +endif + +ifeq ($(CONFIG_64BIT),y) + posttest_64bit = -y +else + posttest_64bit = -n +endif + +distill_awk = $(srctree)/arch/x86/tools/distill.awk +chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk + +quiet_cmd_posttest = TEST $@ + cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose) + +posttest: $(obj)/test_get_len vmlinux + $(call cmd,posttest) + +hostprogs-y := test_get_len + +# -I needed for generated C source and C source which in the kernel tree. +HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/ + +# Dependencies are also needed. +$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c + diff -uprN linux-2.6.32/arch/x86/tools/test_get_len.c linux-2.6.32.ovz/arch/x86/tools/test_get_len.c --- linux-2.6.32/arch/x86/tools/test_get_len.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/tools/test_get_len.c 2015-03-10 13:21:31.000000000 -0700 @@ -0,0 +1,173 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2009 + */ + +#include +#include +#include +#include +#include + +#define unlikely(cond) (cond) + +#include +#include +#include + +/* + * Test of instruction analysis in general and insn_get_length() in + * particular. See if insn_get_length() and the disassembler agree + * on the length of each instruction in an elf disassembly. + * + * Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len + */ + +const char *prog; +static int verbose; +static int x86_64; + +static void usage(void) +{ + fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |" + " %s [-y|-n] [-v]\n", prog); + fprintf(stderr, "\t-y 64bit mode\n"); + fprintf(stderr, "\t-n 32bit mode\n"); + fprintf(stderr, "\t-v verbose mode\n"); + exit(1); +} + +static void malformed_line(const char *line, int line_nr) +{ + fprintf(stderr, "%s: malformed line %d:\n%s", prog, line_nr, line); + exit(3); +} + +static void dump_field(FILE *fp, const char *name, const char *indent, + struct insn_field *field) +{ + fprintf(fp, "%s.%s = {\n", indent, name); + fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n", + indent, field->value, field->bytes[0], field->bytes[1], + field->bytes[2], field->bytes[3]); + fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent, + field->got, field->nbytes); +} + +static void dump_insn(FILE *fp, struct insn *insn) +{ + fprintf(fp, "Instruction = {\n"); + dump_field(fp, "prefixes", "\t", &insn->prefixes); + dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix); + dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix); + dump_field(fp, "opcode", "\t", &insn->opcode); + dump_field(fp, "modrm", "\t", &insn->modrm); + dump_field(fp, "sib", "\t", &insn->sib); + dump_field(fp, "displacement", "\t", &insn->displacement); + dump_field(fp, "immediate1", "\t", &insn->immediate1); + dump_field(fp, "immediate2", "\t", &insn->immediate2); + fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n", + insn->attr, insn->opnd_bytes, insn->addr_bytes); + fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n", + insn->length, insn->x86_64, insn->kaddr); +} + +static void parse_args(int argc, char **argv) +{ + int c; + prog = argv[0]; + while ((c = getopt(argc, argv, "ynv")) != -1) { + switch (c) { + case 'y': + x86_64 = 1; + break; + case 'n': + x86_64 = 0; + break; + case 'v': + verbose = 1; + break; + default: + usage(); + } + } +} + +#define BUFSIZE 256 + +int main(int argc, char **argv) +{ + char line[BUFSIZE], sym[BUFSIZE] = ""; + unsigned char insn_buf[16]; + struct insn insn; + int insns = 0; + int warnings = 0; + + parse_args(argc, argv); + + while (fgets(line, BUFSIZE, stdin)) { + char copy[BUFSIZE], *s, *tab1, *tab2; + int nb = 0; + unsigned int b; + + if (line[0] == '<') { + /* Symbol line */ + strcpy(sym, line); + continue; + } + + insns++; + memset(insn_buf, 0, 16); + strcpy(copy, line); + tab1 = strchr(copy, '\t'); + if (!tab1) + malformed_line(line, insns); + s = tab1 + 1; + s += strspn(s, " "); + tab2 = strchr(s, '\t'); + if (!tab2) + malformed_line(line, insns); + *tab2 = '\0'; /* Characters beyond tab2 aren't examined */ + while (s < tab2) { + if (sscanf(s, "%x", &b) == 1) { + insn_buf[nb++] = (unsigned char) b; + s += 3; + } else + break; + } + /* Decode an instruction */ + insn_init(&insn, insn_buf, x86_64); + insn_get_length(&insn); + if (insn.length != nb) { + warnings++; + fprintf(stderr, "Warning: %s found difference at %s\n", + prog, sym); + fprintf(stderr, "Warning: %s", line); + fprintf(stderr, "Warning: objdump says %d bytes, but " + "insn_get_length() says %d\n", nb, + insn.length); + if (verbose) + dump_insn(stderr, &insn); + } + } + if (warnings) + fprintf(stderr, "Warning: decoded and checked %d" + " instructions with %d warnings\n", insns, warnings); + else + fprintf(stderr, "Succeed: decoded and checked %d" + " instructions\n", insns); + return 0; +} diff -uprN linux-2.6.32/arch/x86/vdso/Makefile linux-2.6.32.ovz/arch/x86/vdso/Makefile --- linux-2.6.32/arch/x86/vdso/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/vdso/Makefile 2015-06-23 04:16:16.000000000 -0700 @@ -6,7 +6,7 @@ VDSO64-$(CONFIG_X86_64) := y VDSO32-$(CONFIG_X86_32) := y VDSO32-$(CONFIG_COMPAT) := y -vdso-install-$(VDSO64-y) += vdso.so +vdso-install-$(VDSO64-y) += vdso-rhel5.so vdso.so vdso-install-$(VDSO32-y) += $(vdso32-images) @@ -14,24 +14,31 @@ vdso-install-$(VDSO32-y) += $(vdso32-ima vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vvar.o # files to link into kernel -obj-$(VDSO64-y) += vma.o vdso.o +obj-$(VDSO64-y) += vma.o vdso.o vdso-rhel5.o obj-$(VDSO32-y) += vdso32.o vdso32-setup.o vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) $(obj)/vdso.o: $(obj)/vdso.so +$(obj)/vdso-rhel5.o: $(obj)/vdso-rhel5.so -targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) +targets += vdso-rhel5.so vdso.so vdso-rhel5.so.dbg vdso.so.dbg vdso.lds vdso-rhel5.lds $(vobjs-y) export CPPFLAGS_vdso.lds += -P -C -VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -Wl,-soname=linux-vdso.so.1 \ +VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 +VDSO_LDFLAGS_vdso-rhel5.lds = ${VDSO_LDFLAGS_vdso.lds} + $(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so +$(obj)/vdso-rhel5.o: $(src)/vdso-rhel5.S $(obj)/vdso-rhel5.so + $(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE $(call if_changed,vdso) +$(obj)/vdso-rhel5.so.dbg: $(src)/vdso-rhel5.lds $(vobjs) FORCE + $(call if_changed,vdso) $(obj)/%.so: OBJCOPYFLAGS := -S $(obj)/%.so: $(obj)/%.so.dbg FORCE @@ -44,6 +51,7 @@ $(vobjs): KBUILD_CFLAGS += $(CFL) targets += vdso-syms.lds obj-$(VDSO64-y) += vdso-syms.lds +obj-$(VDSO64-y) += vdso-rhel5-syms.lds # # Match symbols in the DSO that look like VDSO*; produce a file of constants. @@ -69,7 +77,7 @@ vdso32.so-$(VDSO32-y) += sysenter vdso32-images = $(vdso32.so-y:%=vdso32-%.so) CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) -VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1 +VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-soname=linux-gate.so.1 # This makes sure the $(obj) subdirectory exists even though vdso32/ # is not a kbuild sub-make subdirectory. @@ -120,7 +128,8 @@ $(obj)/vdso32-syms.lds: $(vdso32.so-y:%= quiet_cmd_vdso = VDSO $@ cmd_vdso = $(CC) -nostdlib -o $@ \ $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ - -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) + -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) \ + $(if $(AFTER_LINK),; $(AFTER_LINK)) VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) GCOV_PROFILE := n diff -uprN linux-2.6.32/arch/x86/vdso/vclock_gettime.c linux-2.6.32.ovz/arch/x86/vdso/vclock_gettime.c --- linux-2.6.32/arch/x86/vdso/vclock_gettime.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/vdso/vclock_gettime.c 2015-06-23 04:16:16.000000000 -0700 @@ -34,10 +34,10 @@ notrace static long vdso_fallback_gettim return ret; } -notrace static inline long vgetns(void) +notrace static inline unsigned long vgetns(void) { - long v; - cycles_t (*vread)(void); + cycle_t v; + cycle_t (*vread)(void); vread = gtod->clock.vread; v = (vread() - gtod->clock.cycle_last) & gtod->clock.mask; return (v * gtod->clock.mult) >> gtod->clock.shift; @@ -111,7 +111,7 @@ notrace static noinline int do_monotonic return 0; } -notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +notrace noinline int ___vdso_clock_gettime(clockid_t clock, struct timespec *ts) { if (likely(gtod->sysctl_enabled)) switch (clock) { @@ -120,20 +120,29 @@ notrace int __vdso_clock_gettime(clockid return do_realtime(ts); break; case CLOCK_MONOTONIC: - if (likely(gtod->clock.vread)) + if (likely(gtod->clock.vread && + gtod->gettime_monotonic_enabled)) return do_monotonic(ts); break; case CLOCK_REALTIME_COARSE: return do_realtime_coarse(ts); case CLOCK_MONOTONIC_COARSE: - return do_monotonic_coarse(ts); + if (likely(gtod->gettime_monotonic_enabled)) + return do_monotonic_coarse(ts); } return vdso_fallback_gettime(clock, ts); } + +int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) __attribute__((section("CLOCKGETTIME"))); +notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +{ + return ___vdso_clock_gettime(clock, ts); +} + int clock_gettime(clockid_t, struct timespec *) __attribute__((weak, alias("__vdso_clock_gettime"))); -notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +notrace int ___vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { long ret; if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { @@ -155,5 +164,12 @@ notrace int __vdso_gettimeofday(struct t "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); return ret; } + +int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) __attribute__((section("GETTIMEOFDAY"))); +notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + return ___vdso_gettimeofday(tv, tz); +} + int gettimeofday(struct timeval *, struct timezone *) __attribute__((weak, alias("__vdso_gettimeofday"))); diff -uprN linux-2.6.32/arch/x86/vdso/vdso32/int80.S linux-2.6.32.ovz/arch/x86/vdso/vdso32/int80.S --- linux-2.6.32/arch/x86/vdso/vdso32/int80.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/vdso/vdso32/int80.S 2015-06-23 04:16:16.000000000 -0700 @@ -1,15 +1,15 @@ /* * Code for the vDSO. This version uses the old int $0x80 method. * - * First get the common code for the sigreturn entry points. - * This must come first. + * NOTE: + * 1) __kernel_vsyscall _must_ be first in this page. + * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S + * for details. */ -#include "sigreturn.S" .text .globl __kernel_vsyscall .type __kernel_vsyscall,@function - ALIGN __kernel_vsyscall: .LSTART_vsyscall: int $0x80 @@ -54,3 +54,8 @@ VDSO32_vsyscall_eh_frame_size = 0x40 .section .data,"aw",@progbits .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0 .previous + +/* + * Get the common code for the sigreturn entry points. + */ +#include "sigreturn.S" diff -uprN linux-2.6.32/arch/x86/vdso/vdso32/note.S linux-2.6.32.ovz/arch/x86/vdso/vdso32/note.S --- linux-2.6.32/arch/x86/vdso/vdso32/note.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/vdso/vdso32/note.S 2015-06-23 04:16:16.000000000 -0700 @@ -9,7 +9,9 @@ /* Ideally this would use UTS_NAME, but using a quoted string here doesn't work. Remember to change this when changing the kernel's name. */ + .globl vdso_linux_version_code ELFNOTE_START(Linux, 0, "a") +vdso_linux_version_code: .long LINUX_VERSION_CODE ELFNOTE_END diff -uprN linux-2.6.32/arch/x86/vdso/vdso32/sigreturn.S linux-2.6.32.ovz/arch/x86/vdso/vdso32/sigreturn.S --- linux-2.6.32/arch/x86/vdso/vdso32/sigreturn.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/vdso/vdso32/sigreturn.S 2015-06-23 04:16:16.000000000 -0700 @@ -15,6 +15,7 @@ #endif .text + .org __kernel_vsyscall+0x100,0x90 .globl __kernel_sigreturn .type __kernel_sigreturn,@function ALIGN @@ -27,6 +28,7 @@ __kernel_sigreturn: nop .size __kernel_sigreturn,.-.LSTART_sigreturn + .org __kernel_vsyscall+0x200,0x90 .globl __kernel_rt_sigreturn .type __kernel_rt_sigreturn,@function ALIGN diff -uprN linux-2.6.32/arch/x86/vdso/vdso32/syscall.S linux-2.6.32.ovz/arch/x86/vdso/vdso32/syscall.S --- linux-2.6.32/arch/x86/vdso/vdso32/syscall.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/vdso/vdso32/syscall.S 2015-06-23 04:16:16.000000000 -0700 @@ -1,11 +1,12 @@ /* * Code for the vDSO. This version uses the syscall instruction. * - * First get the common code for the sigreturn entry points. - * This must come first. + * NOTE: + * 1) __kernel_vsyscall _must_ be first in this page. + * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S + * for details. */ #define SYSCALL_ENTER_KERNEL syscall -#include "sigreturn.S" #include @@ -75,3 +76,7 @@ VDSO32_vsyscall_eh_frame_size = 0x40 .section .data,"aw",@progbits .space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0 .previous +/* + * Get the common code for the sigreturn entry points. + */ +#include "sigreturn.S" diff -uprN linux-2.6.32/arch/x86/vdso/vdso32/sysenter.S linux-2.6.32.ovz/arch/x86/vdso/vdso32/sysenter.S --- linux-2.6.32/arch/x86/vdso/vdso32/sysenter.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/vdso/vdso32/sysenter.S 2015-06-23 04:16:16.000000000 -0700 @@ -1,10 +1,11 @@ /* * Code for the vDSO. This version uses the sysenter instruction. * - * First get the common code for the sigreturn entry points. - * This must come first. + * NOTE: + * 1) __kernel_vsyscall _must_ be first in this page. + * 2) there are alignment constraints on this stub, see vsyscall-sigreturn.S + * for details. */ -#include "sigreturn.S" /* * The caller puts arg2 in %ecx, which gets pushed. The kernel will use @@ -27,7 +28,6 @@ .text .globl __kernel_vsyscall .type __kernel_vsyscall,@function - ALIGN __kernel_vsyscall: .LSTART_vsyscall: push %ecx @@ -38,9 +38,12 @@ __kernel_vsyscall: .Lenter_kernel: movl %esp,%ebp sysenter +__kernel_vsyscall_end: /* 7: align return point with nop's to make disassembly easier */ - .space 7,0x90 + /* For CPT: VSYSCALL32_SYSEXIT value must 0x420 for backward + compatibility with another kernel */ + .space __kernel_vsyscall + 0x20 - __kernel_vsyscall_end - 2 ,0x90 /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ jmp .Lenter_kernel @@ -114,3 +117,8 @@ VDSO32_SYSENTER_RETURN: /* Symbol used b * to verify it matches the other versions. */ VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI) + +/* + * Get the common code for the sigreturn entry points. + */ +#include "sigreturn.S" diff -uprN linux-2.6.32/arch/x86/vdso/vdso32/vdso32.lds.S linux-2.6.32.ovz/arch/x86/vdso/vdso32/vdso32.lds.S --- linux-2.6.32/arch/x86/vdso/vdso32/vdso32.lds.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/vdso/vdso32/vdso32.lds.S 2015-06-23 04:16:16.000000000 -0700 @@ -35,3 +35,4 @@ VDSO32_PRELINK = VDSO_PRELINK; VDSO32_vsyscall = __kernel_vsyscall; VDSO32_sigreturn = __kernel_sigreturn; VDSO32_rt_sigreturn = __kernel_rt_sigreturn; +VDSO32_linux_version_code = vdso_linux_version_code; diff -uprN linux-2.6.32/arch/x86/vdso/vdso32-setup.c linux-2.6.32.ovz/arch/x86/vdso/vdso32-setup.c --- linux-2.6.32/arch/x86/vdso/vdso32-setup.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/vdso/vdso32-setup.c 2015-06-23 04:16:16.000000000 -0700 @@ -16,6 +16,11 @@ #include #include #include +#include +#include +#include + +#include #include #include @@ -193,7 +198,8 @@ static __init void relocate_vdso(Elf32_E } } -static struct page *vdso32_pages[1]; +struct page *vdso32_pages[1]; +EXPORT_SYMBOL(vdso32_pages); #ifdef CONFIG_X86_64 @@ -250,13 +256,7 @@ static int __init gate_vma_init(void) gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; gate_vma.vm_page_prot = __P101; - /* - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. - */ - gate_vma.vm_flags |= VM_ALWAYSDUMP; + return 0; } @@ -309,16 +309,139 @@ int __init sysenter_setup(void) return 0; } +EXPORT_SYMBOL_GPL(VDSO32_SYSENTER_RETURN); +EXPORT_SYMBOL_GPL(VDSO32_PRELINK); + +static DEFINE_MUTEX(vdso32_mutex); + +static struct page **uts_prep_vdso_pages_locked(int map) +{ + struct uts_namespace *uts_ns = current->nsproxy->uts_ns; + struct mm_struct *mm = current->mm; + struct ve_struct *ve = get_exec_env(); + struct page **pages = vdso32_pages; + int n1, n2, n3, new_version; + struct page **new_pages, **p; + void *addr; + + /* + * Simply reuse vDSO pages if we can. + */ + if (uts_ns == &init_uts_ns) + return vdso32_pages; + + /* + * Dirty lockless hack. Strictly speaking + * we need to return @p here if it's non-nil, + * but since there only one trasition possible + * { =0 ; !=0 } we simply return @uts_ns->vdso32.pages + */ + p = ACCESS_ONCE(uts_ns->vdso32.pages); + smp_read_barrier_depends(); + if (p) + return uts_ns->vdso32.pages; + + up_write(&mm->mmap_sem); + + if (sscanf(uts_ns->name.release, "%d.%d.%d", &n1, &n2, &n3) == 3) { + /* + * If there were no changes on version simply reuse + * preallocated one. + */ + new_version = KERNEL_VERSION(n1, n2, n3); + if (new_version == LINUX_VERSION_CODE) + goto out; +#ifdef CONFIG_X86_32 + else { + /* + * Native x86-32 mode requires vDSO runtime + * relocations applied which is not supported + * in the old vanilla kernels, moreover even + * being ported we would break compatibility + * with rhel5 vdso which has addresses hardcoded. + * Thus simply warn about this problem and + * continue execution without virtualization. + * After all i686 is pretty outdated nowadays. + */ + pr_warn_once("x86-32 vDSO virtualization is not supported."); + goto out; + } +#endif + } else { + /* + * If admin is passed malformed string here + * lets warn him once but continue working + * not using vDSO virtualization at all. It's + * better than walk out with error. + */ + pr_warn_once("Wrong release uts name format detected." + " Ignoring vDSO virtualization.\n"); + goto out; + } + + mutex_lock(&vdso32_mutex); + if (uts_ns->vdso32.pages) { + pages = uts_ns->vdso32.pages; + goto out_unlock; + } + + uts_ns->vdso32.nr_pages = 1; + uts_ns->vdso32.size = PAGE_SIZE; + uts_ns->vdso32.version_off= (unsigned long)VDSO32_SYMBOL(0, linux_version_code); + new_pages = kmalloc(sizeof(struct page *), GFP_KERNEL); + if (!new_pages) { + pr_err("Can't allocate vDSO pages array for VE %d\n", ve->veid); + pages = ERR_PTR(-ENOMEM); + goto out_unlock; + } + + new_pages[0] = alloc_page(GFP_KERNEL); + if (!new_pages[0]) { + pr_err("Can't allocate page for VE %d\n", ve->veid); + kfree(new_pages); + pages = ERR_PTR(-ENOMEM); + goto out_unlock; + } + + copy_page(page_address(new_pages[0]), page_address(vdso32_pages[0])); + + addr = page_address(new_pages[0]); + *((int *)(addr + uts_ns->vdso32.version_off)) = new_version; + smp_wmb(); + + pages = uts_ns->vdso32.pages = new_pages; + + pr_debug("vDSO version transition %d -> %d for VE %d\n", + LINUX_VERSION_CODE, new_version, ve->veid); + +out_unlock: + mutex_unlock(&vdso32_mutex); +out: + down_write(&mm->mmap_sem); + return pages; +} + /* Setup a VMA at program startup for the vsyscall page */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp, + unsigned long map_address) { struct mm_struct *mm = current->mm; - unsigned long addr; + unsigned long addr = map_address; int ret = 0; bool compat; + unsigned long flags; - if (vdso_enabled == VDSO_DISABLED) + if (vdso_enabled == VDSO_DISABLED && map_address == 0) { + current->mm->context.vdso = NULL; return 0; + } + + flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC | VM_MAYWRITE | + mm->def_flags; + + ret = -ENOMEM; + if (ub_memory_charge(mm, PAGE_SIZE, flags, NULL, UB_SOFT)) + goto err_charge; down_write(&mm->mmap_sem); @@ -328,33 +451,31 @@ int arch_setup_additional_pages(struct l map_compat_vdso(compat); - if (compat) - addr = VDSO_HIGH_BASE; - else { - addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (!compat || map_address) { + addr = get_unmapped_area_prot(NULL, addr, PAGE_SIZE, 0, 0, 1); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; } - } + } else + addr = VDSO_HIGH_BASE; current->mm->context.vdso = (void *)addr; - if (compat_uses_vma || !compat) { + if (compat_uses_vma || !compat || map_address) { + struct page **pages = uts_prep_vdso_pages_locked(compat); + if (IS_ERR(pages)) { + ret = PTR_ERR(pages); + goto up_fail; + } + /* * MAYWRITE to allow gdb to COW and set breakpoints - * - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully - * interpretable later without matching up the same - * kernel and hardware config to see what PC values - * meant. */ ret = install_special_mapping(mm, addr, PAGE_SIZE, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, - vdso32_pages); + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + pages); if (ret) goto up_fail; @@ -368,9 +489,13 @@ int arch_setup_additional_pages(struct l current->mm->context.vdso = NULL; up_write(&mm->mmap_sem); + if (ret < 0) + ub_memory_uncharge(mm, PAGE_SIZE, flags, NULL); +err_charge: return ret; } +EXPORT_SYMBOL(arch_setup_additional_pages); #ifdef CONFIG_X86_64 @@ -418,24 +543,25 @@ const char *arch_vma_name(struct vm_area return NULL; } -struct vm_area_struct *get_gate_vma(struct task_struct *tsk) +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { - struct mm_struct *mm = tsk->mm; - - /* Check to see if this task was created in compat vdso mode */ + /* + * Check to see if the corresponding task was created in compat vdso + * mode. + */ if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) return &gate_vma; return NULL; } -int in_gate_area(struct task_struct *task, unsigned long addr) +int in_gate_area(struct mm_struct *mm, unsigned long addr) { - const struct vm_area_struct *vma = get_gate_vma(task); + const struct vm_area_struct *vma = get_gate_vma(mm); return vma && addr >= vma->vm_start && addr < vma->vm_end; } -int in_gate_area_no_task(unsigned long addr) +int in_gate_area_no_mm(unsigned long addr) { return 0; } diff -uprN linux-2.6.32/arch/x86/vdso/vdso-layout.lds.S linux-2.6.32.ovz/arch/x86/vdso/vdso-layout.lds.S --- linux-2.6.32/arch/x86/vdso/vdso-layout.lds.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/vdso/vdso-layout.lds.S 2015-06-23 04:16:16.000000000 -0700 @@ -21,6 +21,12 @@ SECTIONS .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr .eh_frame : { KEEP (*(.eh_frame)) } :text +#ifdef VDSO_RHEL5 + .gettimeofday 0xffffffffff7006f0 : { *(GETTIMEOFDAY) } :text =0x90909090 + .clockgettime 0xffffffffff700780 : { *(CLOCKGETTIME) } :text =0x90909090 + .getcpu 0xffffffffff7007c0 : { *(GETCPU) } :text =0x90909090 +#endif + .dynamic : { *(.dynamic) } :text :dynamic .rodata : { *(.rodata*) } :text @@ -43,6 +49,15 @@ SECTIONS */ . = ALIGN(0x100); +#ifndef VDSO_RHEL5 + /* + * All global symbols have fixed adresses to be able migrate between + * different versions. + */ + .gettimeofday 0xffffffffff700800 : { *(GETTIMEOFDAY) } :text =0x90909090 + .clockgettime 0xffffffffff700820 : { *(CLOCKGETTIME) } :text =0x90909090 + .getcpu 0xffffffffff700840 : { *(GETCPU) } :text =0x90909090 +#endif .text : { *(.text*) } :text =0x90909090 } diff -uprN linux-2.6.32/arch/x86/vdso/vdso.lds.S linux-2.6.32.ovz/arch/x86/vdso/vdso.lds.S --- linux-2.6.32/arch/x86/vdso/vdso.lds.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/vdso/vdso.lds.S 2015-06-23 04:16:16.000000000 -0700 @@ -34,4 +34,5 @@ VDSO64_PRELINK = VDSO_PRELINK; */ #define VEXTERN(x) VDSO64_ ## x = vdso_ ## x; #include "vextern.h" +VEXTERN(linux_version_code) #undef VEXTERN diff -uprN linux-2.6.32/arch/x86/vdso/vdso-note.S linux-2.6.32.ovz/arch/x86/vdso/vdso-note.S --- linux-2.6.32/arch/x86/vdso/vdso-note.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/vdso/vdso-note.S 2015-06-23 04:16:16.000000000 -0700 @@ -7,6 +7,8 @@ #include #include + .globl vdso_linux_version_code ELFNOTE_START(Linux, 0, "a") +vdso_linux_version_code: .long LINUX_VERSION_CODE ELFNOTE_END diff -uprN linux-2.6.32/arch/x86/vdso/vdso-rhel5.lds.S linux-2.6.32.ovz/arch/x86/vdso/vdso-rhel5.lds.S --- linux-2.6.32/arch/x86/vdso/vdso-rhel5.lds.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/vdso/vdso-rhel5.lds.S 2015-06-23 04:16:16.000000000 -0700 @@ -0,0 +1,38 @@ +/* + * Linker script for 64-bit vDSO. + * We #include the file to define the layout details. + * Here we only choose the prelinked virtual address. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. We can define local symbols here called VDSO* to make their + * values visible using the asm-x86/vdso.h macros from the kernel proper. + */ + +#define VDSO_PRELINK 0xffffffffff700000 +#define VDSO_RHEL5 +#include "vdso-layout.lds.S" + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION { + LINUX_2.6 { + global: + clock_gettime; + __vdso_clock_gettime; + gettimeofday; + __vdso_gettimeofday; + getcpu; + __vdso_getcpu; + local: *; + }; +} + +VDSO64_PRELINK = VDSO_PRELINK; + +/* + * Define VDSO64_x for each VEXTERN(x), for use via VDSO64_SYMBOL. + */ +#define VEXTERN(x) VDSO64_rhel5_ ## x = vdso_ ## x; +#include "vextern.h" +#undef VEXTERN diff -uprN linux-2.6.32/arch/x86/vdso/vdso-rhel5.S linux-2.6.32.ovz/arch/x86/vdso/vdso-rhel5.S --- linux-2.6.32/arch/x86/vdso/vdso-rhel5.S 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/vdso/vdso-rhel5.S 2015-06-23 04:16:16.000000000 -0700 @@ -0,0 +1,10 @@ +#include + +__INITDATA + + .globl vdso_rhel5_start, vdso_rhel5_end +vdso_rhel5_start: + .incbin "arch/x86/vdso/vdso-rhel5.so" +vdso_rhel5_end: + +__FINIT diff -uprN linux-2.6.32/arch/x86/vdso/vgetcpu.c linux-2.6.32.ovz/arch/x86/vdso/vgetcpu.c --- linux-2.6.32/arch/x86/vdso/vgetcpu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/vdso/vgetcpu.c 2015-06-23 04:16:16.000000000 -0700 @@ -13,8 +13,8 @@ #include #include "vextern.h" -notrace long -__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +notrace noinline long +___vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { unsigned int p; @@ -32,5 +32,13 @@ __vdso_getcpu(unsigned *cpu, unsigned *n return 0; } +int long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) + __attribute__((section("GETCPU"))); +notrace long +__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +{ + return ___vdso_getcpu(cpu, node, unused); +} + long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) __attribute__((weak, alias("__vdso_getcpu"))); diff -uprN linux-2.6.32/arch/x86/vdso/vma.c linux-2.6.32.ovz/arch/x86/vdso/vma.c --- linux-2.6.32/arch/x86/vdso/vma.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/vdso/vma.c 2015-06-23 04:16:16.000000000 -0700 @@ -4,6 +4,7 @@ * Subject to the GPL, v.2 */ #include +#include #include #include #include @@ -14,17 +15,25 @@ #include #include +#include +#include +#include + #include "vextern.h" /* Just for VMAGIC. */ #undef VEXTERN unsigned int __read_mostly vdso_enabled = 1; extern char vdso_start[], vdso_end[]; +extern char vdso_rhel5_start[], vdso_rhel5_end[]; extern unsigned short vdso_sync_cpuid; static struct page **vdso_pages; static unsigned vdso_size; +static struct page **vdso_rhel5_pages; +static unsigned vdso_rhel5_size; + static inline void *var_ref(void *p, char *name) { if (*(void **)p != (void *)VMAGIC) { @@ -62,6 +71,12 @@ static int __init init_vdso_vars(void) vdso_enabled = 0; } + init_uts_ns.vdso.addr = vbase; + init_uts_ns.vdso.pages = vdso_pages; + init_uts_ns.vdso.nr_pages = npages; + init_uts_ns.vdso.size = vdso_size; + init_uts_ns.vdso.version_off = (unsigned long)VDSO64_SYMBOL(0, linux_version_code); + #define VEXTERN(x) \ *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x; #include "vextern.h" @@ -75,41 +90,113 @@ static int __init init_vdso_vars(void) } __initcall(init_vdso_vars); +static int __init init_vdso_rhel5_vars(void) +{ + int npages = (vdso_rhel5_end - vdso_rhel5_start + PAGE_SIZE - 1) / PAGE_SIZE; + int i; + char *vbase; + + vdso_rhel5_size = npages << PAGE_SHIFT; + vdso_rhel5_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); + if (!vdso_rhel5_pages) + goto oom; + for (i = 0; i < npages; i++) { + struct page *p; + p = alloc_page(GFP_KERNEL); + if (!p) + goto oom; + vdso_rhel5_pages[i] = p; + copy_page(page_address(p), vdso_rhel5_start + i*PAGE_SIZE); + } + + vbase = vmap(vdso_rhel5_pages, npages, 0, PAGE_KERNEL); + if (!vbase) + goto oom; + + if (memcmp(vbase, "\177ELF", 4)) { + printk("VDSO: I'm broken; not ELF\n"); + vdso_enabled = 0; + } + +#define VEXTERN(x) \ + *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, rhel5_ ## x), #x) = &__ ## x; +#include "vextern.h" +#undef VEXTERN + return 0; + + oom: + printk("Cannot allocate vdso\n"); + vdso_enabled = 0; + return -ENOMEM; +} +__initcall(init_vdso_rhel5_vars); + struct linux_binprm; -/* Put the vdso above the (randomized) stack with another randomized offset. - This way there is no hole in the middle of address space. - To save memory make sure it is still in the same PTE as the stack top. - This doesn't give that many random bits */ +/* + * Put the vdso above the (randomized) stack with another randomized + * offset. This way there is no hole in the middle of address space. + * To save memory make sure it is still in the same PTE as the stack + * top. This doesn't give that many random bits. + * + * Note that this algorithm is imperfect: the distribution of the vdso + * start address within a PMD is biased toward the end. + * + * Only used for the 64-bit and x32 vdsos. + */ static unsigned long vdso_addr(unsigned long start, unsigned len) { unsigned long addr, end; unsigned offset; - end = (start + PMD_SIZE - 1) & PMD_MASK; + + /* + * Round up the start address. It can start out unaligned as a result + * of stack start randomization. + */ + start = PAGE_ALIGN(start); + + /* Round the lowest possible end address up to a PMD boundary. */ + end = (start + len + PMD_SIZE - 1) & PMD_MASK; if (end >= TASK_SIZE_MAX) end = TASK_SIZE_MAX; end -= len; - /* This loses some more bits than a modulo, but is cheaper */ - offset = get_random_int() & (PTRS_PER_PTE - 1); - addr = start + (offset << PAGE_SHIFT); - if (addr >= end) - addr = end; + + if (end > start) { + offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); + addr = start + (offset << PAGE_SHIFT); + } else { + addr = start; + } + + /* + * Forcibly align the final address in case we have a hardware + * issue that requires alignment for performance reasons. + */ + addr = align_addr(addr, NULL, ALIGN_VDSO); + return addr; } /* Setup a VMA at program startup for the vsyscall page. Not called for compat tasks */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +int __arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp, + unsigned long map_address, struct page ** vdso_pages, + unsigned vdso_size) { struct mm_struct *mm = current->mm; unsigned long addr; int ret; - if (!vdso_enabled) + if (!vdso_enabled && map_address == 0) { + current->mm->context.vdso = NULL; return 0; + } down_write(&mm->mmap_sem); - addr = vdso_addr(mm->start_stack, vdso_size); + if (map_address) + addr = map_address; + else + addr = vdso_addr(mm->start_stack, vdso_size); addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; @@ -120,8 +207,7 @@ int arch_setup_additional_pages(struct l ret = install_special_mapping(mm, addr, vdso_size, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdso_pages); if (ret) { current->mm->context.vdso = NULL; @@ -133,6 +219,135 @@ up_fail: return ret; } +static DEFINE_MUTEX(vdso_mutex); + +static int uts_arch_setup_additional_pages(struct linux_binprm *bprm, + int uses_interp, + unsigned long map_address) +{ + struct uts_namespace *uts_ns = current->nsproxy->uts_ns; + struct ve_struct *ve = get_exec_env(); + int i, n1, n2, n3, new_version; + struct page **new_pages, **p; + + /* + * For node or in case we've not changed UTS simply + * map preallocated original vDSO. + * + * In turn if we already allocated one for this UTS + * simply reuse it. It improves speed significantly. + */ + if (uts_ns == &init_uts_ns) + goto map_init_uts; + + /* + * Dirty lockless hack. Strictly speaking + * we need to return @p here if it's non-nil, + * but since there only one trasition possible + * { =0 ; !=0 } we simply return @uts_ns->vdso.pages + */ + p = ACCESS_ONCE(uts_ns->vdso.pages); + smp_read_barrier_depends(); + if (p) + goto map_uts; + + if (sscanf(uts_ns->name.release, "%d.%d.%d", &n1, &n2, &n3) == 3) { + /* + * If there were no changes on version simply reuse + * preallocated one. + */ + new_version = KERNEL_VERSION(n1, n2, n3); + if (new_version == LINUX_VERSION_CODE) + goto map_init_uts; + } else { + /* + * If admin is passed malformed string here + * lets warn him once but continue working + * not using vDSO virtualization at all. It's + * better than walk out with error. + */ + pr_warn_once("Wrong release uts name format detected." + " Ignoring vDSO virtualization.\n"); + goto map_init_uts; + } + + mutex_lock(&vdso_mutex); + if (uts_ns->vdso.pages) { + mutex_unlock(&vdso_mutex); + goto map_uts; + } + + uts_ns->vdso.nr_pages = init_uts_ns.vdso.nr_pages; + uts_ns->vdso.size = init_uts_ns.vdso.size; + uts_ns->vdso.version_off= init_uts_ns.vdso.version_off; + new_pages = kmalloc(sizeof(struct page *) * init_uts_ns.vdso.nr_pages, GFP_KERNEL); + if (!new_pages) { + pr_err("Can't allocate vDSO pages array for VE %d\n", ve->veid); + goto out_unlock; + } + + for (i = 0; i < uts_ns->vdso.nr_pages; i++) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) { + pr_err("Can't allocate page for VE %d\n", ve->veid); + for (; i > 0; i--) + put_page(new_pages[i - 1]); + kfree(new_pages); + goto out_unlock; + } + new_pages[i] = p; + copy_page(page_address(p), page_address(init_uts_ns.vdso.pages[i])); + } + + uts_ns->vdso.addr = vmap(new_pages, uts_ns->vdso.nr_pages, 0, PAGE_KERNEL); + if (!uts_ns->vdso.addr) { + pr_err("Can't map vDSO pages for VE %d\n", ve->veid); + for (i = 0; i < uts_ns->vdso.nr_pages; i++) + put_page(new_pages[i]); + kfree(new_pages); + goto out_unlock; + } + + *((int *)(uts_ns->vdso.addr + uts_ns->vdso.version_off)) = new_version; + smp_wmb(); + uts_ns->vdso.pages = new_pages; + mutex_unlock(&vdso_mutex); + + pr_debug("vDSO version transition %d -> %d for VE %d\n", + LINUX_VERSION_CODE, new_version, ve->veid); + +map_uts: + return __arch_setup_additional_pages(bprm, uses_interp, map_address, + uts_ns->vdso.pages, uts_ns->vdso.size); +map_init_uts: + return __arch_setup_additional_pages(bprm, uses_interp, map_address, + init_uts_ns.vdso.pages, init_uts_ns.vdso.size); +out_unlock: + mutex_unlock(&vdso_mutex); + return -ENOMEM; +} + +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp, + unsigned long map_address) +{ + return uts_arch_setup_additional_pages(bprm, uses_interp, map_address); +} +EXPORT_SYMBOL(arch_setup_additional_pages); + +int arch_setup_additional_pages_rhel5(struct linux_binprm *bprm, int uses_interp, + unsigned long map_address) +{ + return __arch_setup_additional_pages(bprm, uses_interp, map_address, + vdso_rhel5_pages, vdso_rhel5_size); +} +EXPORT_SYMBOL(arch_setup_additional_pages_rhel5); + +int vdso_is_rhel5(struct page *page) +{ + return page == vdso_rhel5_pages[0]; +} +EXPORT_SYMBOL(vdso_is_rhel5); + static __init int vdso_setup(char *s) { vdso_enabled = simple_strtoul(s, NULL, 0); diff -uprN linux-2.6.32/arch/x86/video/fbdev.c linux-2.6.32.ovz/arch/x86/video/fbdev.c --- linux-2.6.32/arch/x86/video/fbdev.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/video/fbdev.c 2015-03-10 13:23:31.000000000 -0700 @@ -8,24 +8,33 @@ */ #include #include +#include int fb_is_primary_device(struct fb_info *info) { struct device *device = info->device; struct pci_dev *pci_dev = NULL; + struct pci_dev *default_device = vga_default_device(); struct resource *res = NULL; - int retval = 0; if (device) pci_dev = to_pci_dev(device); - if (pci_dev) - res = &pci_dev->resource[PCI_ROM_RESOURCE]; + if (!pci_dev) + return 0; + if (default_device) { + if (pci_dev == default_device) + return 1; + else + return 0; + } + + res = &pci_dev->resource[PCI_ROM_RESOURCE]; if (res && res->flags & IORESOURCE_ROM_SHADOW) - retval = 1; + return 1; - return retval; + return 0; } EXPORT_SYMBOL(fb_is_primary_device); MODULE_LICENSE("GPL"); diff -uprN linux-2.6.32/arch/x86/xen/enlighten.c linux-2.6.32.ovz/arch/x86/xen/enlighten.c --- linux-2.6.32/arch/x86/xen/enlighten.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/xen/enlighten.c 2015-03-10 13:24:15.000000000 -0700 @@ -11,6 +11,7 @@ * Jeremy Fitzhardinge , XenSource Inc, 2007 */ +#include #include #include #include @@ -27,13 +28,19 @@ #include #include #include +#include +#include +#include +#include #include #include #include #include +#include #include #include +#include #include #include @@ -48,10 +55,13 @@ #include #include #include +#include #include #include #include #include +#include +#include #include "xen-ops.h" #include "mmu.h" @@ -68,10 +78,14 @@ EXPORT_SYMBOL_GPL(xen_domain_type); struct start_info *xen_start_info; EXPORT_SYMBOL_GPL(xen_start_info); +static struct shared_info *shared_info_page; struct shared_info xen_dummy_shared_info; void *xen_initial_gdt; +__read_mostly int xen_have_vector_callback; +EXPORT_SYMBOL_GPL(xen_have_vector_callback); + /* * Point at some empty memory to start with. We map the real shared_info * page as soon as fixmap is up and running. @@ -93,6 +107,14 @@ struct shared_info *HYPERVISOR_shared_in */ static int have_vcpu_info_placement = 1; +static void clamp_max_cpus(void) +{ +#ifdef CONFIG_SMP + if (setup_max_cpus > MAX_VIRT_CPUS) + setup_max_cpus = MAX_VIRT_CPUS; +#endif +} + static void xen_vcpu_setup(int cpu) { struct vcpu_register_vcpu_info info; @@ -100,13 +122,32 @@ static void xen_vcpu_setup(int cpu) struct vcpu_info *vcpup; BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); - per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; - if (!have_vcpu_info_placement) - return; /* already tested, not available */ + /* + * This path is called twice on PVHVM - first during bootup via + * smp_init -> xen_hvm_cpu_notify, and then if the VCPU is being + * hotplugged: cpu_up -> xen_hvm_cpu_notify. + * As we can only do the VCPUOP_register_vcpu_info once lets + * not over-write its result. + * + * For PV it is called during restore (xen_vcpu_restore) and bootup + * (xen_setup_vcpu_info_placement). The hotplug mechanism does not + * use this function. + */ + if (xen_hvm_domain()) { + if (per_cpu(xen_vcpu, cpu) == &per_cpu(xen_vcpu_info, cpu)) + return; + } + if (cpu < MAX_VIRT_CPUS) + per_cpu(xen_vcpu,cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; - vcpup = &per_cpu(xen_vcpu_info, cpu); + if (!have_vcpu_info_placement) { + if (cpu >= MAX_VIRT_CPUS) + clamp_max_cpus(); + return; + } + vcpup = &per_cpu(xen_vcpu_info, cpu); info.mfn = arbitrary_virt_to_mfn(vcpup); info.offset = offset_in_page(vcpup); @@ -121,6 +162,7 @@ static void xen_vcpu_setup(int cpu) if (err) { printk(KERN_DEBUG "register_vcpu_info failed: err=%d\n", err); have_vcpu_info_placement = 0; + clamp_max_cpus(); } else { /* This cpu is using the registered vcpu info, even if later ones fail to. */ @@ -138,24 +180,23 @@ static void xen_vcpu_setup(int cpu) */ void xen_vcpu_restore(void) { - if (have_vcpu_info_placement) { - int cpu; + int cpu; - for_each_online_cpu(cpu) { - bool other_cpu = (cpu != smp_processor_id()); + for_each_online_cpu(cpu) { + bool other_cpu = (cpu != smp_processor_id()); - if (other_cpu && - HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) - BUG(); + if (other_cpu && + HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) + BUG(); - xen_vcpu_setup(cpu); + xen_setup_runstate_info(cpu); - if (other_cpu && - HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) - BUG(); - } + if (have_vcpu_info_placement) + xen_vcpu_setup(cpu); - BUG_ON(!have_vcpu_info_placement); + if (other_cpu && + HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) + BUG(); } } @@ -172,8 +213,13 @@ static void __init xen_banner(void) xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); } +#define CPUID_THERM_POWER_LEAF 6 +#define APERFMPERF_PRESENT 0 + static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; +static __read_mostly unsigned int cpuid_leaf81_edx_mask = ~0; +static __read_mostly unsigned int cpuid_leaf87_edx_mask = 0; static void xen_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) @@ -187,15 +233,32 @@ static void xen_cpuid(unsigned int *ax, * unsupported kernel subsystems as possible. */ switch (*ax) { - case 1: + case 0x1: maskecx = cpuid_leaf1_ecx_mask; maskedx = cpuid_leaf1_edx_mask; break; + case CPUID_MWAIT_LEAF: + *ax = *bx = *cx = *dx = 0; + return; + + case CPUID_THERM_POWER_LEAF: + /* Disabling APERFMPERF for kernel usage */ + maskecx = ~(1 << APERFMPERF_PRESENT); + break; + case 0xb: /* Suppress extended topology stuff */ maskebx = 0; break; + + case 0x80000001: + maskedx = cpuid_leaf81_edx_mask; + break; + + case 0x80000007: + maskedx = cpuid_leaf87_edx_mask; + break; } asm(XEN_EMULATE_PREFIX "cpuid" @@ -213,34 +276,35 @@ static void xen_cpuid(unsigned int *ax, static __init void xen_init_cpuid_mask(void) { unsigned int ax, bx, cx, dx; + unsigned int xsave_mask; + + cpuid_leaf1_ecx_mask = ~(1 << (X86_FEATURE_MWAIT % 32)); + cpuid_leaf81_edx_mask = ~(1 << (X86_FEATURE_GBPAGES % 32)); cpuid_leaf1_edx_mask = - ~((1 << X86_FEATURE_MCE) | /* disable MCE */ - (1 << X86_FEATURE_MCA) | /* disable MCA */ + ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ if (!xen_initial_domain()) cpuid_leaf1_edx_mask &= - ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ + ~((1 << X86_FEATURE_MCE) | /* disable MCE */ + (1 << X86_FEATURE_MCA) | /* disable MCA */ + (1 << X86_FEATURE_APIC) | /* disable local APIC */ (1 << X86_FEATURE_ACPI)); /* disable ACPI */ + cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_X2APIC % 32)); + ax = 1; cx = 0; xen_cpuid(&ax, &bx, &cx, &dx); - /* cpuid claims we support xsave; try enabling it to see what happens */ - if (cx & (1 << (X86_FEATURE_XSAVE % 32))) { - unsigned long cr4; - - set_in_cr4(X86_CR4_OSXSAVE); - - cr4 = read_cr4(); - - if ((cr4 & X86_CR4_OSXSAVE) == 0) - cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32)); - - clear_in_cr4(X86_CR4_OSXSAVE); - } + xsave_mask = + (1 << (X86_FEATURE_XSAVE % 32)) | + (1 << (X86_FEATURE_OSXSAVE % 32)); + + /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ + if ((cx & xsave_mask) != xsave_mask) + cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ } static void xen_set_debugreg(int reg, unsigned long val) @@ -332,6 +396,24 @@ static void xen_set_ldt(const void *addr xen_mc_issue(PARAVIRT_LAZY_CPU); } +#ifdef CONFIG_X86_32 +static void xen_load_user_cs_desc(int cpu, struct mm_struct *mm) +{ + void *gdt; + xmaddr_t mgdt; + u64 descriptor; + struct desc_struct user_cs; + + gdt = &get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS]; + mgdt = virt_to_machine(gdt); + + user_cs = mm->context.user_cs; + descriptor = (u64) user_cs.a | ((u64) user_cs.b) << 32; + + HYPERVISOR_update_descriptor(mgdt.maddr, descriptor); +} +#endif /*CONFIG_X86_32*/ + static void xen_load_gdt(const struct desc_ptr *dtr) { unsigned long va = dtr->address; @@ -728,7 +810,6 @@ static void set_xen_basic_apic_ops(void) #endif - static void xen_clts(void) { struct multicall_space mcs; @@ -924,7 +1005,7 @@ static const struct pv_init_ops xen_init }; static const struct pv_time_ops xen_time_ops __initdata = { - .sched_clock = xen_sched_clock, + .sched_clock = xen_clocksource_read, }; static const struct pv_cpu_ops xen_cpu_ops __initdata = { @@ -958,6 +1039,9 @@ static const struct pv_cpu_ops xen_cpu_o .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, +#ifdef CONFIG_X86_32 + .load_user_cs_desc = xen_load_user_cs_desc, +#endif /*CONFIG_X86_32*/ .load_gdt = xen_load_gdt, .load_idt = xen_load_idt, .load_tls = xen_load_tls, @@ -997,10 +1081,6 @@ static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; -#ifdef CONFIG_SMP - smp_send_stop(); -#endif - if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) BUG(); } @@ -1025,6 +1105,26 @@ static void xen_crash_shutdown(struct pt xen_reboot(SHUTDOWN_crash); } +static int +xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct sched_shutdown r = { .reason = SHUTDOWN_crash}; + + if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) + BUG(); + return NOTIFY_DONE; +} + +static struct notifier_block xen_panic_block = { + .notifier_call= xen_panic_event, +}; + +int xen_panic_handler_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &xen_panic_block); + return 0; +} + static const struct machine_ops __initdata xen_machine_ops = { .restart = xen_restart, .halt = xen_machine_halt, @@ -1093,6 +1193,12 @@ asmlinkage void __init xen_start_kernel( __supported_pte_mask |= _PAGE_IOMAP; + /* + * Prevent page tables from being allocated in highmem, even + * if CONFIG_HIGHPTE is enabled. + */ + __userpte_alloc_gfp &= ~__GFP_HIGHMEM; + #ifdef CONFIG_X86_64 /* Work out if we support NX */ check_efer(); @@ -1136,6 +1242,15 @@ asmlinkage void __init xen_start_kernel( xen_smp_init(); +#ifdef CONFIG_ACPI_NUMA + /* + * The pages we from Xen are not related to machine pages, so + * any NUMA information the kernel tries to get from ACPI will + * be meaningless. Prevent it from trying. + */ + acpi_numa = -1; +#endif + pgd = (pgd_t *)xen_start_info->pt_base; /* Don't do the full vcpu_info placement stuff until we have a @@ -1178,10 +1293,16 @@ asmlinkage void __init xen_start_kernel( add_preferred_console("xenboot", 0, NULL); add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); + } else { + /* Make sure ACS will be enabled */ + pci_request_acs(); } + xen_raw_console_write("about to get started...\n"); + xen_setup_runstate_info(0); + /* Start the world */ #ifdef CONFIG_X86_32 i386_start_kernel(); @@ -1189,3 +1310,140 @@ asmlinkage void __init xen_start_kernel( x86_64_start_reservations((char *)__pa_symbol(&boot_params)); #endif } + +uint32_t xen_cpuid_base(void) +{ + uint32_t base, eax, ebx, ecx, edx; + char signature[13]; + + for (base = 0x40000000; base < 0x40010000; base += 0x100) { + cpuid(base, &eax, &ebx, &ecx, &edx); + *(uint32_t *)(signature + 0) = ebx; + *(uint32_t *)(signature + 4) = ecx; + *(uint32_t *)(signature + 8) = edx; + signature[12] = 0; + + if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2)) + return base; + } + + return 0; +} + +uint32_t xen_version(void) +{ + uint32_t eax, ebx, ecx, edx, base; + base = xen_cpuid_base(); + cpuid(base + 1, &eax, &ebx, &ecx, &edx); + return eax; +} + +static int init_hvm_pv_info(int *major, int *minor) +{ + uint32_t eax, ebx, ecx, edx, pages, msr, base; + u64 pfn; + + base = xen_cpuid_base(); + if (!base) + return -EINVAL; + + cpuid(base + 1, &eax, &ebx, &ecx, &edx); + + *major = eax >> 16; + *minor = eax & 0xffff; + printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor); + + cpuid(base + 2, &pages, &msr, &ecx, &edx); + + pfn = __pa(hypercall_page); + wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32)); + + xen_setup_features(); + + pv_info = xen_info; + pv_info.kernel_rpl = 0; + + xen_domain_type = XEN_HVM_DOMAIN; + + return 0; +} + +void xen_hvm_init_shared_info(void) +{ + int cpu; + struct xen_add_to_physmap xatp; + + xatp.domid = DOMID_SELF; + xatp.idx = 0; + xatp.space = XENMAPSPACE_shared_info; + xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT; + if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp)) + BUG(); + + HYPERVISOR_shared_info = (struct shared_info *)shared_info_page; + + /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info + * page, we use it in the event channel upcall and in some pvclock + * related functions. We don't need the vcpu_info placement + * optimizations because we don't use any pv_mmu or pv_irq op on + * HVM. + * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is + * online but xen_hvm_init_shared_info is run at resume time too and + * in that case multiple vcpus might be online. */ + for_each_online_cpu(cpu) { + /* Leave it to be NULL. */ + if (cpu >= MAX_VIRT_CPUS) + continue; + per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; + } +} + +static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + switch (action) { + case CPU_UP_PREPARE: + xen_vcpu_setup(cpu); + if (xen_have_vector_callback) + xen_init_lock_cpu(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = { + .notifier_call = xen_hvm_cpu_notify, +}; + +void __init xen_hvm_guest_init(void) +{ + int r; + int major, minor; + + if (xen_pv_domain()) + return; + + if (is_kdump_kernel()) + return; + + r = init_hvm_pv_info(&major, &minor); + if (r < 0) + return; + + shared_info_page = (struct shared_info *) + alloc_bootmem_pages(PAGE_SIZE); + + xen_hvm_init_shared_info(); + + xen_panic_handler_init(); + + if (xen_feature(XENFEAT_hvm_callback_vector)) + xen_have_vector_callback = 1; + xen_hvm_smp_init(); + register_cpu_notifier(&xen_hvm_cpu_notifier); + xen_unplug_emulated_devices(); + x86_init.irqs.intr_init = xen_init_IRQ; +} diff -uprN linux-2.6.32/arch/x86/xen/Makefile linux-2.6.32.ovz/arch/x86/xen/Makefile --- linux-2.6.32/arch/x86/xen/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/xen/Makefile 2015-03-10 13:21:39.000000000 -0700 @@ -12,7 +12,7 @@ CFLAGS_mmu.o := $(nostackp) obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm.o xen-asm_$(BITS).o \ - grant-table.o suspend.o + grant-table.o suspend.o platform-pci-unplug.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o diff -uprN linux-2.6.32/arch/x86/xen/mmu.c linux-2.6.32.ovz/arch/x86/xen/mmu.c --- linux-2.6.32/arch/x86/xen/mmu.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/xen/mmu.c 2015-03-10 13:22:57.000000000 -0700 @@ -66,6 +66,12 @@ #define MMU_UPDATE_HISTO 30 +/* + * Protects atomic reservation decrease/increase against concurrent increases. + * Also protects non-atomic updates of current_pages and balloon lists. + */ +DEFINE_SPINLOCK(xen_reservation_lock); + #ifdef CONFIG_XEN_DEBUG_FS static struct { @@ -155,6 +161,7 @@ DEFINE_PER_CPU(unsigned long, xen_curren */ #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) +unsigned long xen_max_p2m_pfn __read_mostly = MAX_DOMAIN_PAGES; #define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) #define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) @@ -185,7 +192,7 @@ static inline unsigned p2m_index(unsigne } /* Build the parallel p2m_top_mfn structures */ -static void __init xen_build_mfn_list_list(void) +void xen_build_mfn_list_list(void) { unsigned pfn, idx; @@ -207,7 +214,7 @@ void xen_setup_mfn_list_list(void) HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = virt_to_mfn(p2m_top_mfn_list); - HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages; + HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn; } /* Set up p2m_top to point to the domain-builder provided p2m pages */ @@ -217,6 +224,8 @@ void __init xen_build_dynamic_phys_to_ma unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages); unsigned pfn; + xen_max_p2m_pfn = max_pfn; + for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) { unsigned topidx = p2m_top_index(pfn); @@ -230,7 +239,7 @@ unsigned long get_phys_to_machine(unsign { unsigned topidx, idx; - if (unlikely(pfn >= MAX_DOMAIN_PAGES)) + if (unlikely(pfn >= xen_max_p2m_pfn)) return INVALID_P2M_ENTRY; topidx = p2m_top_index(pfn); @@ -276,7 +285,12 @@ bool __set_phys_to_machine(unsigned long { unsigned topidx, idx; - if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { + if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return true; + } + + if (unlikely(pfn >= xen_max_p2m_pfn)) { BUG_ON(mfn != INVALID_P2M_ENTRY); return true; } @@ -296,11 +310,6 @@ bool __set_phys_to_machine(unsigned long void set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { - BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); - return; - } - if (unlikely(!__set_phys_to_machine(pfn, mfn))) { alloc_p2m(pfn); @@ -516,7 +525,20 @@ static pteval_t pte_pfn_to_mfn(pteval_t if (val & _PAGE_PRESENT) { unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; pteval_t flags = val & PTE_FLAGS_MASK; - val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; + unsigned long mfn = pfn_to_mfn(pfn); + + /* + * If there's no mfn for the pfn, then just create an + * empty non-present pte. Unfortunately this loses + * information about the original pfn, so + * pte_mfn_to_pfn is asymmetric. + */ + if (unlikely(mfn == INVALID_P2M_ENTRY)) { + mfn = 0; + flags = 0; + } + + val = ((pteval_t)mfn << PAGE_SHIFT) | flags; } return val; @@ -934,8 +956,6 @@ static int xen_pin_page(struct mm_struct read-only, and can be pinned. */ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) { - vm_unmap_aliases(); - xen_mc_batch(); if (__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT)) { @@ -987,10 +1007,9 @@ static void xen_pgd_pin(struct mm_struct */ void xen_mm_pin_all(void) { - unsigned long flags; struct page *page; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { if (!PagePinned(page)) { @@ -999,7 +1018,7 @@ void xen_mm_pin_all(void) } } - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } /* @@ -1100,10 +1119,9 @@ static void xen_pgd_unpin(struct mm_stru */ void xen_mm_unpin_all(void) { - unsigned long flags; struct page *page; - spin_lock_irqsave(&pgd_lock, flags); + spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { if (PageSavePinned(page)) { @@ -1113,7 +1131,7 @@ void xen_mm_unpin_all(void) } } - spin_unlock_irqrestore(&pgd_lock, flags); + spin_unlock(&pgd_lock); } void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) @@ -1141,7 +1159,7 @@ static void drop_other_mm_ref(void *info active_mm = percpu_read(cpu_tlbstate.active_mm); - if (active_mm == mm) + if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK) leave_mm(smp_processor_id()); /* If this cpu still has a stale cr3 reference, then make sure @@ -1293,7 +1311,7 @@ static void xen_flush_tlb_others(const s { struct { struct mmuext_op op; - DECLARE_BITMAP(mask, NR_CPUS); + DECLARE_BITMAP(mask, num_processors); } *args; struct multicall_space mcs; @@ -1427,23 +1445,6 @@ static void xen_pgd_free(struct mm_struc #endif } -#ifdef CONFIG_HIGHPTE -static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) -{ - pgprot_t prot = PAGE_KERNEL; - - if (PagePinned(page)) - prot = PAGE_KERNEL_RO; - - if (0 && PageHighMem(page)) - printk("mapping highpte %lx type %d prot %s\n", - page_to_pfn(page), type, - (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ"); - - return kmap_atomic_prot(page, type, prot); -} -#endif - #ifdef CONFIG_X86_32 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { @@ -1516,7 +1517,6 @@ static void xen_alloc_ptpage(struct mm_s if (PagePinned(virt_to_page(mm->pgd))) { SetPagePinned(page); - vm_unmap_aliases(); if (!PageHighMem(page)) { make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); if (level == PT_PTE && USE_SPLIT_PTLOCKS) @@ -1657,8 +1657,10 @@ static __init void xen_map_identity_earl for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { pte_t pte; +#ifdef CONFIG_X86_32 if (pfn > max_pfn_mapped) max_pfn_mapped = pfn; +#endif if (!pte_none(pte_page[pteidx])) continue; @@ -1703,6 +1705,12 @@ __init pgd_t *xen_setup_kernel_pagetable pud_t *l3; pmd_t *l2; + /* max_pfn_mapped is the last pfn mapped in the initial memory + * mappings. Considering that on Xen after the kernel mappings we + * have the mappings of some pages that don't exist in pfn space, we + * set max_pfn_mapped to the last real pfn mapped. */ + max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list)); + /* Zap identity mapping */ init_level4_pgt[0] = __pgd(0); @@ -1902,10 +1910,6 @@ static const struct pv_mmu_ops xen_mmu_o .alloc_pmd_clone = paravirt_nop, .release_pmd = xen_release_pmd_init, -#ifdef CONFIG_HIGHPTE - .kmap_atomic_pte = xen_kmap_atomic_pte, -#endif - #ifdef CONFIG_X86_64 .set_pte = xen_set_pte, #else diff -uprN linux-2.6.32/arch/x86/xen/multicalls.c linux-2.6.32.ovz/arch/x86/xen/multicalls.c --- linux-2.6.32/arch/x86/xen/multicalls.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/xen/multicalls.c 2015-03-10 13:22:30.000000000 -0700 @@ -189,10 +189,10 @@ struct multicall_space __xen_mc_entry(si unsigned argidx = roundup(b->argidx, sizeof(u64)); BUG_ON(preemptible()); - BUG_ON(b->argidx > MC_ARGS); + BUG_ON(b->argidx >= MC_ARGS); if (b->mcidx == MC_BATCH || - (argidx + args) > MC_ARGS) { + (argidx + args) >= MC_ARGS) { mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS); xen_mc_flush(); argidx = roundup(b->argidx, sizeof(u64)); @@ -206,7 +206,7 @@ struct multicall_space __xen_mc_entry(si ret.args = &b->args[argidx]; b->argidx = argidx + args; - BUG_ON(b->argidx > MC_ARGS); + BUG_ON(b->argidx >= MC_ARGS); return ret; } @@ -216,7 +216,7 @@ struct multicall_space xen_mc_extend_arg struct multicall_space ret = { NULL, NULL }; BUG_ON(preemptible()); - BUG_ON(b->argidx > MC_ARGS); + BUG_ON(b->argidx >= MC_ARGS); if (b->mcidx == 0) return ret; @@ -224,14 +224,14 @@ struct multicall_space xen_mc_extend_arg if (b->entries[b->mcidx - 1].op != op) return ret; - if ((b->argidx + size) > MC_ARGS) + if ((b->argidx + size) >= MC_ARGS) return ret; ret.mc = &b->entries[b->mcidx - 1]; ret.args = &b->args[b->argidx]; b->argidx += size; - BUG_ON(b->argidx > MC_ARGS); + BUG_ON(b->argidx >= MC_ARGS); return ret; } diff -uprN linux-2.6.32/arch/x86/xen/platform-pci-unplug.c linux-2.6.32.ovz/arch/x86/xen/platform-pci-unplug.c --- linux-2.6.32/arch/x86/xen/platform-pci-unplug.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/xen/platform-pci-unplug.c 2015-03-10 13:22:06.000000000 -0700 @@ -0,0 +1,156 @@ +/****************************************************************************** + * platform-pci-unplug.c + * + * Xen platform PCI device driver + * Copyright (c) 2010, Citrix + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + */ + +#include + +#include +#include + +#include +#include + + +#define XEN_PLATFORM_ERR_MAGIC -1 +#define XEN_PLATFORM_ERR_PROTOCOL -2 +#define XEN_PLATFORM_ERR_BLACKLIST -3 + +/* result of xen_emul_unplug stored in this variable */ +int xen_platform_pci_unplug; +EXPORT_SYMBOL_GPL(xen_platform_pci_unplug); +static int xen_emul_unplug; + +static int __init check_platform_magic(void) +{ + short magic; + char protocol; + + magic = inw(XEN_IOPORT_MAGIC); + if (magic != XEN_IOPORT_MAGIC_VAL) { + printk(KERN_INFO "Xen Platform PCI: unrecognised magic value\n"); + return XEN_PLATFORM_ERR_MAGIC; + } + + protocol = inb(XEN_IOPORT_PROTOVER); + + printk(KERN_DEBUG "Xen Platform PCI: I/O protocol version %d\n", + protocol); + + switch (protocol) { + case 1: + outw(XEN_IOPORT_LINUX_PRODNUM, XEN_IOPORT_PRODNUM); + outl(XEN_IOPORT_LINUX_DRVVER, XEN_IOPORT_DRVVER); + if (inw(XEN_IOPORT_MAGIC) != XEN_IOPORT_MAGIC_VAL) { + printk(KERN_INFO "Xen Platform: blacklisted by host\n"); + return XEN_PLATFORM_ERR_BLACKLIST; + } + break; + default: + printk(KERN_WARNING "Xen Platform PCI: unknown I/O protocol version"); + return XEN_PLATFORM_ERR_PROTOCOL; + } + + return 0; +} + +int xen_ide_unplug_unsupported = 1; +EXPORT_SYMBOL_GPL(xen_ide_unplug_unsupported); + +void xen_unplug_emulated_devices(void) +{ + int r; + + /* not valid unless in HVM case or sysadmin explicit told not to unplug */ + if (xen_emul_unplug & XEN_UNPLUG_NEVER) + return; + + /* check the version of the xen platform PCI device */ + r = check_platform_magic(); + + /* If the version matches enable the Xen platform PCI driver. + * Also enable the Xen platform PCI driver if the host does + * not support the unplug protocol (XEN_PLATFORM_ERR_MAGIC) + * but the user told us that unplugging is unnecessary, i.e., + * the user has set the disk spec to vbd & vif spec to type=netfront */ + if (r && !(r == XEN_PLATFORM_ERR_MAGIC && + (xen_emul_unplug & XEN_UNPLUG_UNNECESSARY))) + return; + /* Set the default value of xen_emul_unplug depending on whether or + * not the Xen PV frontends and the Xen platform PCI driver have + * been compiled for this kernel (modules or built-in are both OK). */ + if (!xen_emul_unplug) { + if (xen_must_unplug_nics()) { + printk(KERN_INFO "Netfront and the Xen platform PCI driver have " + "been compiled for this kernel: unplug emulated NICs.\n"); + xen_emul_unplug |= XEN_UNPLUG_ALL_NICS; + } + if (xen_must_unplug_disks()) { + printk(KERN_INFO "Blkfront and the Xen platform PCI driver have " + "been compiled for this kernel: unplug emulated disks.\n" + "You might have to change the root device\n" + "from /dev/hd[a-d] to /dev/xvd[a-d]\n" + "in your root= kernel command line option\n"); + xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS; + } + } + /* Now unplug the emulated devices */ + if (!(xen_emul_unplug & XEN_UNPLUG_UNNECESSARY)) + outw(xen_emul_unplug, XEN_IOPORT_UNPLUG); + else + xen_ide_unplug_unsupported = 0; + + if (xen_emul_unplug & XEN_UNPLUG_ALL_IDE_DISKS) + xen_ide_unplug_unsupported = 0; + + xen_platform_pci_unplug = xen_emul_unplug; +} + +static int __init parse_xen_emul_unplug(char *arg) +{ + char *p, *q; + int l; + + for (p = arg; p; p = q) { + q = strchr(p, ','); + if (q) { + l = q - p; + q++; + } else { + l = strlen(p); + } + if (!strncmp(p, "all", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL; + else if (!strncmp(p, "ide-disks", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL_IDE_DISKS; + else if (!strncmp(p, "aux-ide-disks", l)) + xen_emul_unplug |= XEN_UNPLUG_AUX_IDE_DISKS; + else if (!strncmp(p, "nics", l)) + xen_emul_unplug |= XEN_UNPLUG_ALL_NICS; + else if (!strncmp(p, "unnecessary", l)) + xen_emul_unplug |= XEN_UNPLUG_UNNECESSARY; + else if (!strncmp(p, "never", l)) + xen_emul_unplug |= XEN_UNPLUG_NEVER; + else + printk(KERN_WARNING "unrecognised option '%s' " + "in parameter 'xen_emul_unplug'\n", p); + } + return 0; +} +early_param("xen_emul_unplug", parse_xen_emul_unplug); diff -uprN linux-2.6.32/arch/x86/xen/setup.c linux-2.6.32.ovz/arch/x86/xen/setup.c --- linux-2.6.32/arch/x86/xen/setup.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/xen/setup.c 2015-03-10 13:22:47.000000000 -0700 @@ -19,7 +19,9 @@ #include #include +#include #include +#include #include #include "xen-ops.h" @@ -32,20 +34,179 @@ extern void xen_sysenter_target(void); extern void xen_syscall_target(void); extern void xen_syscall32_target(void); +/* Amount of extra memory space we add to the e820 ranges */ +phys_addr_t xen_extra_mem_start, xen_extra_mem_size; + +/* + * The maximum amount of extra memory compared to the base size. The + * main scaling factor is the size of struct page. At extreme ratios + * of base:extra, all the base memory can be filled with page + * structures for the extra memory, leaving no space for anything + * else. + * + * 10x seems like a reasonable balance between scaling flexibility and + * leaving a practically usable system. + */ +#define EXTRA_MEM_RATIO (10) + +static __init void xen_add_extra_mem(unsigned long pages) +{ + unsigned long pfn; + + u64 size = (u64)pages * PAGE_SIZE; + u64 extra_start = xen_extra_mem_start + xen_extra_mem_size; + + if (!pages) + return; + + e820_add_region(extra_start, size, E820_RAM); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + + reserve_early(extra_start, extra_start + size, "XEN EXTRA"); + + xen_extra_mem_size += size; + + xen_max_p2m_pfn = PFN_DOWN(extra_start + size); + + for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++) + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); +} + +static unsigned long __init xen_release_chunk(phys_addr_t start_addr, + phys_addr_t end_addr) +{ + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + unsigned long start, end; + unsigned long len = 0; + unsigned long pfn; + int ret; + + start = PFN_UP(start_addr); + end = PFN_DOWN(end_addr); + + if (end <= start) + return 0; + + printk(KERN_INFO "xen_release_chunk: looking at area pfn %lx-%lx: ", + start, end); + for(pfn = start; pfn < end; pfn++) { + unsigned long mfn = pfn_to_mfn(pfn); + + /* Make sure pfn exists to start with */ + if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn) + continue; + + set_xen_guest_handle(reservation.extent_start, &mfn); + reservation.nr_extents = 1; + + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", + start, end, ret); + if (ret == 1) { + __set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + len++; + } + } + printk(KERN_CONT "%ld pages freed\n", len); + + return len; +} + +static unsigned long __init xen_return_unused_memory(unsigned long max_pfn, + const struct e820map *e820) +{ + phys_addr_t max_addr = PFN_PHYS(max_pfn); + phys_addr_t last_end = 0; + unsigned long released = 0; + int i; + + for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { + phys_addr_t end = e820->map[i].addr; + end = min(max_addr, end); + + released += xen_release_chunk(last_end, end); + last_end = e820->map[i].addr + e820->map[i].size; + } + + if (last_end < max_addr) + released += xen_release_chunk(last_end, max_addr); + + printk(KERN_INFO "released %ld pages of unused memory\n", released); + return released; +} + +static unsigned long __init xen_get_max_pages(void) +{ + unsigned long max_pages = MAX_DOMAIN_PAGES; + domid_t domid = DOMID_SELF; + int ret; + + ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid); + if (ret > 0) + max_pages = ret; + return min(max_pages, MAX_DOMAIN_PAGES); +} /** * machine_specific_memory_setup - Hook for machine specific memory setup. **/ - char * __init xen_memory_setup(void) { + static struct e820entry map[E820MAX] __initdata; + unsigned long max_pfn = xen_start_info->nr_pages; + unsigned long long mem_end; + int rc; + struct xen_memory_map memmap; + unsigned long extra_pages = 0; + unsigned long extra_limit; + int i; max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); + mem_end = PFN_PHYS(max_pfn); - e820.nr_map = 0; + memmap.nr_entries = E820MAX; + set_xen_guest_handle(memmap.buffer, map); - e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); + rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); + if (rc == -ENOSYS) { + memmap.nr_entries = 1; + map[0].addr = 0ULL; + map[0].size = mem_end; + /* 8MB slack (to balance backend allocations). */ + map[0].size += 8ULL << 20; + map[0].type = E820_RAM; + rc = 0; + } + BUG_ON(rc); + + e820.nr_map = 0; + xen_extra_mem_start = mem_end; + for (i = 0; i < memmap.nr_entries; i++) { + unsigned long long end = map[i].addr + map[i].size; + + if (map[i].type == E820_RAM && end > mem_end) { + /* RAM off the end - may be partially included */ + u64 delta = min(map[i].size, end - mem_end); + + map[i].size -= delta; + end -= delta; + + extra_pages += PFN_DOWN(delta); + } + + if (map[i].size > 0 && end > xen_extra_mem_start) + xen_extra_mem_start = end; + + /* Add region if any remains */ + if (map[i].size > 0) + e820_add_region(map[i].addr, map[i].size, map[i].type); + } /* * Even though this is normal, usable memory under Xen, reserve @@ -67,6 +228,37 @@ char * __init xen_memory_setup(void) sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); + extra_limit = xen_get_max_pages(); + if (max_pfn + extra_pages > extra_limit) { + if (extra_limit >= max_pfn) + extra_pages = extra_limit - max_pfn; + else + extra_pages = 0; + } + + extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820); + + /* + * Clamp the amount of extra memory to a EXTRA_MEM_RATIO + * factor the base size. On non-highmem systems, the base + * size is the full initial memory allocation; on highmem it + * is limited to the max size of lowmem, so that it doesn't + * get completely filled. + * + * In principle there could be a problem in lowmem systems if + * the initial memory is also very large with respect to + * lowmem, but we won't try to deal with that here. + */ + extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + max_pfn + extra_pages); + + if (extra_limit >= max_pfn) + extra_pages = extra_limit - max_pfn; + else + extra_pages = 0; + + xen_add_extra_mem(extra_pages); + return "Xen"; } @@ -156,6 +348,8 @@ void __init xen_arch_setup(void) struct physdev_set_iopl set_iopl; int rc; + xen_panic_handler_init(); + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); diff -uprN linux-2.6.32/arch/x86/xen/smp.c linux-2.6.32.ovz/arch/x86/xen/smp.c --- linux-2.6.32/arch/x86/xen/smp.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/xen/smp.c 2015-03-10 13:24:14.000000000 -0700 @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -35,10 +36,10 @@ cpumask_var_t xen_cpu_initialized_map; -static DEFINE_PER_CPU(int, resched_irq); -static DEFINE_PER_CPU(int, callfunc_irq); -static DEFINE_PER_CPU(int, callfuncsingle_irq); -static DEFINE_PER_CPU(int, debug_irq) = -1; +static DEFINE_PER_CPU(int, xen_resched_irq); +static DEFINE_PER_CPU(int, xen_callfunc_irq); +static DEFINE_PER_CPU(int, xen_callfuncsingle_irq); +static DEFINE_PER_CPU(int, xen_debug_irq) = -1; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); @@ -103,7 +104,7 @@ static int xen_smp_intr_init(unsigned in NULL); if (rc < 0) goto fail; - per_cpu(resched_irq, cpu) = rc; + per_cpu(xen_resched_irq, cpu) = rc; callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, @@ -114,7 +115,7 @@ static int xen_smp_intr_init(unsigned in NULL); if (rc < 0) goto fail; - per_cpu(callfunc_irq, cpu) = rc; + per_cpu(xen_callfunc_irq, cpu) = rc; debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, @@ -122,7 +123,7 @@ static int xen_smp_intr_init(unsigned in debug_name, NULL); if (rc < 0) goto fail; - per_cpu(debug_irq, cpu) = rc; + per_cpu(xen_debug_irq, cpu) = rc; callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, @@ -133,19 +134,20 @@ static int xen_smp_intr_init(unsigned in NULL); if (rc < 0) goto fail; - per_cpu(callfuncsingle_irq, cpu) = rc; + per_cpu(xen_callfuncsingle_irq, cpu) = rc; return 0; fail: - if (per_cpu(resched_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); - if (per_cpu(callfunc_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); - if (per_cpu(debug_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); - if (per_cpu(callfuncsingle_irq, cpu) >= 0) - unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); + if (per_cpu(xen_resched_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL); + if (per_cpu(xen_callfunc_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); + if (per_cpu(xen_debug_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); + if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), + NULL); return rc; } @@ -178,11 +180,18 @@ static void __init xen_smp_prepare_boot_ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; + unsigned int i; xen_init_lock_cpu(0); smp_store_cpu_info(0); cpu_data(0).x86_max_cores = 1; + + for_each_possible_cpu(i) { + zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); + } set_cpu_sibling_map(0); if (xen_smp_intr_init(0)) @@ -295,6 +304,7 @@ static int __cpuinit xen_cpu_up(unsigned (unsigned long)task_stack_page(idle) - KERNEL_STACK_OFFSET + THREAD_SIZE; #endif + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_init_lock_cpu(cpu); @@ -344,16 +354,17 @@ static int xen_cpu_disable(void) static void xen_cpu_die(unsigned int cpu) { - while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { + while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { current->state = TASK_UNINTERRUPTIBLE; schedule_timeout(HZ/10); } - unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); - unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); - unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); - unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL); xen_uninit_lock_cpu(cpu); - xen_teardown_timer(cpu); + if (xen_pv_domain()) + xen_teardown_timer(cpu); if (num_online_cpus() == 1) alternatives_smp_switch(0); @@ -395,9 +406,9 @@ static void stop_self(void *v) BUG(); } -static void xen_smp_send_stop(void) +static void xen_stop_other_cpus(int wait) { - smp_call_function(stop_self, NULL, 0); + smp_call_function(stop_self, NULL, wait); } static void xen_smp_send_reschedule(int cpu) @@ -465,7 +476,7 @@ static const struct smp_ops xen_smp_ops .cpu_disable = xen_cpu_disable, .play_dead = xen_play_dead, - .smp_send_stop = xen_smp_send_stop, + .stop_other_cpus = xen_stop_other_cpus, .smp_send_reschedule = xen_smp_send_reschedule, .send_call_func_ipi = xen_smp_send_call_function_ipi, @@ -478,3 +489,44 @@ void __init xen_smp_init(void) xen_fill_possible_map(); xen_init_spinlocks(); } + +static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) +{ + native_smp_prepare_cpus(max_cpus); + WARN_ON(xen_smp_intr_init(0)); + + xen_init_lock_cpu(0); +} + +static int __cpuinit xen_hvm_cpu_up(unsigned int cpu) +{ + int rc; + /* + * xen_smp_intr_init() needs to run before native_cpu_up() + * so that IPI vectors are set up on the booting CPU before + * it is marked online in native_cpu_up(). + */ + rc = xen_smp_intr_init(cpu); + WARN_ON(rc); + if (!rc) + rc = native_cpu_up(cpu); + return rc; +} + +static void xen_hvm_cpu_die(unsigned int cpu) +{ + xen_cpu_die(cpu); + native_cpu_die(cpu); +} + +void __init xen_hvm_smp_init(void) +{ + if (!xen_have_vector_callback) + return; + smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; + smp_ops.smp_send_reschedule = xen_smp_send_reschedule; + smp_ops.cpu_up = xen_hvm_cpu_up; + smp_ops.cpu_die = xen_hvm_cpu_die; + smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; + smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; +} diff -uprN linux-2.6.32/arch/x86/xen/suspend.c linux-2.6.32.ovz/arch/x86/xen/suspend.c --- linux-2.6.32/arch/x86/xen/suspend.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/xen/suspend.c 2015-03-10 13:24:03.000000000 -0700 @@ -1,4 +1,5 @@ #include +#include #include #include @@ -25,8 +26,17 @@ void xen_pre_suspend(void) BUG(); } +void xen_hvm_post_suspend(int suspend_cancelled) +{ + xen_hvm_init_shared_info(); + xen_callback_vector(); + xen_unplug_emulated_devices(); +} + void xen_post_suspend(int suspend_cancelled) { + xen_build_mfn_list_list(); + xen_setup_shared_info(); if (suspend_cancelled) { @@ -44,7 +54,19 @@ void xen_post_suspend(int suspend_cancel } +static void xen_vcpu_notify_restore(void *data) +{ + unsigned long reason = (unsigned long)data; + + /* Boot processor notified via generic timekeeping_resume() */ + if ( smp_processor_id() == 0) + return; + + clockevents_notify(reason, NULL); +} + void xen_arch_resume(void) { - /* nothing */ + on_each_cpu(xen_vcpu_notify_restore, + (void *)CLOCK_EVT_NOTIFY_RESUME, 1); } diff -uprN linux-2.6.32/arch/x86/xen/time.c linux-2.6.32.ovz/arch/x86/xen/time.c --- linux-2.6.32/arch/x86/xen/time.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/xen/time.c 2015-03-10 13:24:14.000000000 -0700 @@ -31,14 +31,14 @@ #define NS_PER_TICK (1000000000LL / HZ) /* runstate info updated by Xen */ -static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); +static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate); /* snapshots of runstate info */ -static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot); +static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot); /* unused ns of stolen and blocked time */ -static DEFINE_PER_CPU(u64, residual_stolen); -static DEFINE_PER_CPU(u64, residual_blocked); +static DEFINE_PER_CPU(u64, xen_residual_stolen); +static DEFINE_PER_CPU(u64, xen_residual_blocked); /* return an consistent snapshot of 64-bit time/counter value */ static u64 get64(const u64 *p) @@ -79,7 +79,7 @@ static void get_runstate_snapshot(struct BUG_ON(preemptible()); - state = &__get_cpu_var(runstate); + state = &__get_cpu_var(xen_runstate); /* * The runstate info is always updated by the hypervisor on @@ -97,14 +97,14 @@ static void get_runstate_snapshot(struct /* return true when a vcpu could run but has no real cpu to run on */ bool xen_vcpu_stolen(int vcpu) { - return per_cpu(runstate, vcpu).state == RUNSTATE_runnable; + return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable; } -static void setup_runstate_info(int cpu) +void xen_setup_runstate_info(int cpu) { struct vcpu_register_runstate_memory_area area; - area.addr.v = &per_cpu(runstate, cpu); + area.addr.v = &per_cpu(xen_runstate, cpu); if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, cpu, &area)) @@ -122,7 +122,7 @@ static void do_stolen_accounting(void) WARN_ON(state.state != RUNSTATE_running); - snap = &__get_cpu_var(runstate_snapshot); + snap = &__get_cpu_var(xen_runstate_snapshot); /* work out how much time the VCPU has not been runn*ing* */ blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; @@ -133,66 +133,27 @@ static void do_stolen_accounting(void) /* Add the appropriate number of ticks of stolen time, including any left-overs from last time. */ - stolen = runnable + offline + __get_cpu_var(residual_stolen); + stolen = runnable + offline + __get_cpu_var(xen_residual_stolen); if (stolen < 0) stolen = 0; ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); - __get_cpu_var(residual_stolen) = stolen; + __get_cpu_var(xen_residual_stolen) = stolen; account_steal_ticks(ticks); /* Add the appropriate number of ticks of blocked time, including any left-overs from last time. */ - blocked += __get_cpu_var(residual_blocked); + blocked += __get_cpu_var(xen_residual_blocked); if (blocked < 0) blocked = 0; ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); - __get_cpu_var(residual_blocked) = blocked; + __get_cpu_var(xen_residual_blocked) = blocked; account_idle_ticks(ticks); } -/* - * Xen sched_clock implementation. Returns the number of unstolen - * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED - * states. - */ -unsigned long long xen_sched_clock(void) -{ - struct vcpu_runstate_info state; - cycle_t now; - u64 ret; - s64 offset; - - /* - * Ideally sched_clock should be called on a per-cpu basis - * anyway, so preempt should already be disabled, but that's - * not current practice at the moment. - */ - preempt_disable(); - - now = xen_clocksource_read(); - - get_runstate_snapshot(&state); - - WARN_ON(state.state != RUNSTATE_running); - - offset = now - state.state_entry_time; - if (offset < 0) - offset = 0; - - ret = state.time[RUNSTATE_blocked] + - state.time[RUNSTATE_running] + - offset; - - preempt_enable(); - - return ret; -} - - /* Get the TSC speed from Xen */ unsigned long xen_tsc_khz(void) { @@ -434,7 +395,7 @@ void xen_setup_timer(int cpu) name = ""; irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER, name, NULL); evt = &per_cpu(xen_clock_events, cpu); @@ -442,8 +403,6 @@ void xen_setup_timer(int cpu) evt->cpumask = cpumask_of(cpu); evt->irq = irq; - - setup_runstate_info(cpu); } void xen_teardown_timer(int cpu) @@ -465,6 +424,8 @@ void xen_timer_resume(void) { int cpu; + pvclock_resume(); + if (xen_clockevent != &xen_vcpuop_clockevent) return; @@ -477,6 +438,7 @@ void xen_timer_resume(void) __init void xen_time_init(void) { int cpu = smp_processor_id(); + struct timespec tp; clocksource_register(&xen_clocksource); @@ -488,12 +450,12 @@ __init void xen_time_init(void) } /* Set initial system time with full resolution */ - xen_read_wallclock(&xtime); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); + xen_read_wallclock(&tp); + do_settimeofday(&tp); setup_force_cpu_cap(X86_FEATURE_TSC); + xen_setup_runstate_info(cpu); xen_setup_timer(cpu); xen_setup_cpu_clockevents(); } diff -uprN linux-2.6.32/arch/x86/xen/xen-asm_32.S linux-2.6.32.ovz/arch/x86/xen/xen-asm_32.S --- linux-2.6.32/arch/x86/xen/xen-asm_32.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/xen/xen-asm_32.S 2015-03-10 13:23:51.000000000 -0700 @@ -88,11 +88,11 @@ ENTRY(xen_iret) */ #ifdef CONFIG_SMP GET_THREAD_INFO(%eax) - movl TI_cpu(%eax), %eax - movl __per_cpu_offset(,%eax,4), %eax - mov per_cpu__xen_vcpu(%eax), %eax + movl %ss:TI_cpu(%eax), %eax + movl %ss:__per_cpu_offset(,%eax,4), %eax + mov %ss:per_cpu__xen_vcpu(%eax), %eax #else - movl per_cpu__xen_vcpu, %eax + movl %ss:per_cpu__xen_vcpu, %eax #endif /* check IF state we're restoring */ @@ -105,19 +105,21 @@ ENTRY(xen_iret) * resuming the code, so we don't have to be worried about * being preempted to another CPU. */ - setz XEN_vcpu_info_mask(%eax) + setz %ss:XEN_vcpu_info_mask(%eax) xen_iret_start_crit: /* check for unmasked and pending */ - cmpw $0x0001, XEN_vcpu_info_pending(%eax) + cmpw $0x0001, %ss:XEN_vcpu_info_pending(%eax) /* * If there's something pending, mask events again so we can - * jump back into xen_hypervisor_callback + * jump back into xen_hypervisor_callback. Otherwise do not + * touch XEN_vcpu_info_mask. */ - sete XEN_vcpu_info_mask(%eax) + jne 1f + movb $1, %ss:XEN_vcpu_info_mask(%eax) - popl %eax +1: popl %eax /* * From this point on the registers are restored and the stack diff -uprN linux-2.6.32/arch/x86/xen/xen-asm_64.S linux-2.6.32.ovz/arch/x86/xen/xen-asm_64.S --- linux-2.6.32/arch/x86/xen/xen-asm_64.S 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/xen/xen-asm_64.S 2015-03-10 13:21:04.000000000 -0700 @@ -96,7 +96,7 @@ ENTRY(xen_sysret32) pushq $__USER32_CS pushq %rcx - pushq $VGCF_in_syscall + pushq $0 1: jmp hypercall_iret ENDPATCH(xen_sysret32) RELOC(xen_sysret32, 1b+1) @@ -151,7 +151,7 @@ ENTRY(xen_syscall32_target) ENTRY(xen_sysenter_target) lea 16(%rsp), %rsp /* strip %rcx, %r11 */ mov $-ENOSYS, %rax - pushq $VGCF_in_syscall + pushq $0 jmp hypercall_iret ENDPROC(xen_syscall32_target) ENDPROC(xen_sysenter_target) diff -uprN linux-2.6.32/arch/x86/xen/xen-ops.h linux-2.6.32.ovz/arch/x86/xen/xen-ops.h --- linux-2.6.32/arch/x86/xen/xen-ops.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/x86/xen/xen-ops.h 2015-03-10 13:24:14.000000000 -0700 @@ -25,10 +25,12 @@ extern struct shared_info *HYPERVISOR_sh void xen_setup_mfn_list_list(void); void xen_setup_shared_info(void); +void xen_build_mfn_list_list(void); void xen_setup_machphys_mapping(void); pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); void xen_reserve_top(void); +extern unsigned long xen_max_p2m_pfn; char * __init xen_memory_setup(void); void __init xen_arch_setup(void); @@ -37,10 +39,15 @@ void xen_enable_sysenter(void); void xen_enable_syscall(void); void xen_vcpu_restore(void); +void xen_callback_vector(void); +void xen_hvm_init_shared_info(void); +void xen_unplug_emulated_devices(void); + void __init xen_build_dynamic_phys_to_machine(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); +void xen_setup_runstate_info(int cpu); void xen_teardown_timer(int cpu); cycle_t xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); @@ -58,10 +65,12 @@ void xen_setup_vcpu_info_placement(void) #ifdef CONFIG_SMP void xen_smp_init(void); +void __init xen_hvm_smp_init(void); extern cpumask_var_t xen_cpu_initialized_map; #else static inline void xen_smp_init(void) {} +static inline void xen_hvm_smp_init(void) {} #endif #ifdef CONFIG_PARAVIRT_SPINLOCKS @@ -99,4 +108,6 @@ void xen_sysret32(void); void xen_sysret64(void); void xen_adjust_exception_frame(void); +extern int xen_panic_handler_init(void); + #endif /* XEN_OPS_H */ diff -uprN linux-2.6.32/arch/xtensa/include/asm/mman.h linux-2.6.32.ovz/arch/xtensa/include/asm/mman.h --- linux-2.6.32/arch/xtensa/include/asm/mman.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/xtensa/include/asm/mman.h 2015-06-23 04:16:16.000000000 -0700 @@ -83,6 +83,13 @@ #define MADV_MERGEABLE 12 /* KSM may merge identical pages */ #define MADV_UNMERGEABLE 13 /* KSM may not merge identical pages */ +#define MADV_HUGEPAGE 14 /* Worth backing with hugepages */ +#define MADV_NOHUGEPAGE 15 /* Not worth backing with hugepages */ + +#define MADV_DONTDUMP 16 /* Explicity exclude from the core dump, + overrides the coredump filter bits */ +#define MADV_DODUMP 17 /* Clear the MADV_NODUMP flag */ + /* compatibility flags */ #define MAP_FILE 0 diff -uprN linux-2.6.32/arch/xtensa/include/asm/socket.h linux-2.6.32.ovz/arch/xtensa/include/asm/socket.h --- linux-2.6.32/arch/xtensa/include/asm/socket.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/xtensa/include/asm/socket.h 2015-03-10 13:20:58.000000000 -0700 @@ -71,4 +71,5 @@ #define SO_PROTOCOL 38 #define SO_DOMAIN 39 +#define SO_RXQ_OVFL 40 #endif /* _XTENSA_SOCKET_H */ diff -uprN linux-2.6.32/arch/xtensa/include/asm/syscall.h linux-2.6.32.ovz/arch/xtensa/include/asm/syscall.h --- linux-2.6.32/arch/xtensa/include/asm/syscall.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/xtensa/include/asm/syscall.h 2015-03-10 13:21:11.000000000 -0700 @@ -13,8 +13,6 @@ struct sigaction; asmlinkage long xtensa_execve(char*, char**, char**, struct pt_regs*); asmlinkage long xtensa_clone(unsigned long, unsigned long, struct pt_regs*); asmlinkage long xtensa_pipe(int __user *); -asmlinkage long xtensa_mmap2(unsigned long, unsigned long, unsigned long, - unsigned long, unsigned long, unsigned long); asmlinkage long xtensa_ptrace(long, long, long, long); asmlinkage long xtensa_sigreturn(struct pt_regs*); asmlinkage long xtensa_rt_sigreturn(struct pt_regs*); diff -uprN linux-2.6.32/arch/xtensa/include/asm/unistd.h linux-2.6.32.ovz/arch/xtensa/include/asm/unistd.h --- linux-2.6.32/arch/xtensa/include/asm/unistd.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/xtensa/include/asm/unistd.h 2015-03-10 13:21:18.000000000 -0700 @@ -189,7 +189,7 @@ __SYSCALL( 79, sys_fremovexattr, 2) /* File Map / Shared Memory Operations */ #define __NR_mmap2 80 -__SYSCALL( 80, xtensa_mmap2, 6) +__SYSCALL( 80, sys_mmap_pgoff, 6) #define __NR_munmap 81 __SYSCALL( 81, sys_munmap, 2) #define __NR_mprotect 82 @@ -681,8 +681,10 @@ __SYSCALL(304, sys_signalfd, 3) __SYSCALL(305, sys_ni_syscall, 0) #define __NR_eventfd 306 __SYSCALL(306, sys_eventfd, 1) +#define __NR_recvmmsg 307 +__SYSCALL(307, sys_recvmmsg, 5) -#define __NR_syscall_count 307 +#define __NR_syscall_count 308 /* * sysxtensa syscall handler diff -uprN linux-2.6.32/arch/xtensa/kernel/syscall.c linux-2.6.32.ovz/arch/xtensa/kernel/syscall.c --- linux-2.6.32/arch/xtensa/kernel/syscall.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/arch/xtensa/kernel/syscall.c 2015-03-10 13:21:11.000000000 -0700 @@ -57,31 +57,6 @@ asmlinkage long xtensa_pipe(int __user * return error; } - -asmlinkage long xtensa_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - int error = -EBADF; - struct file * file = NULL; - - flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); - if (!(flags & MAP_ANONYMOUS)) { - file = fget(fd); - if (!file) - goto out; - } - - down_write(¤t->mm->mmap_sem); - error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); - - if (file) - fput(file); -out: - return error; -} - asmlinkage long xtensa_shmat(int shmid, char __user *shmaddr, int shmflg) { unsigned long ret; diff -uprN linux-2.6.32/block/as-iosched.c linux-2.6.32.ovz/block/as-iosched.c --- linux-2.6.32/block/as-iosched.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/as-iosched.c 2015-03-10 13:22:25.000000000 -0700 @@ -142,9 +142,9 @@ enum arq_state { AS_RQ_POSTSCHED, /* when they shouldn't be */ }; -#define RQ_IOC(rq) ((struct io_context *) (rq)->elevator_private) -#define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private2) -#define RQ_SET_STATE(rq, state) ((rq)->elevator_private2 = (void *) state) +#define RQ_IOC(rq) ((struct io_context *) (rq)->elevator_private[0]) +#define RQ_STATE(rq) ((enum arq_state)(rq)->elevator_private[1]) +#define RQ_SET_STATE(rq, state) ((rq)->elevator_private[1] = (void *) state) static DEFINE_PER_CPU(unsigned long, as_ioc_count); static struct completion *ioc_gone; @@ -260,16 +260,6 @@ static void as_put_io_context(struct req */ #define RQ_RB_ROOT(ad, rq) (&(ad)->sort_list[rq_is_sync((rq))]) -static void as_add_rq_rb(struct as_data *ad, struct request *rq) -{ - struct request *alias; - - while ((unlikely(alias = elv_rb_add(RQ_RB_ROOT(ad, rq), rq)))) { - as_move_to_dispatch(ad, alias); - as_antic_stop(ad); - } -} - static inline void as_del_rq_rb(struct as_data *ad, struct request *rq) { elv_rb_del(RQ_RB_ROOT(ad, rq), rq); @@ -1192,14 +1182,14 @@ static void as_add_request(struct reques data_dir = rq_is_sync(rq); - rq->elevator_private = as_get_io_context(q->node); + rq->elevator_private[0] = as_get_io_context(q->node); if (RQ_IOC(rq)) { as_update_iohist(ad, RQ_IOC(rq)->aic, rq); atomic_inc(&RQ_IOC(rq)->aic->nr_queued); } - as_add_rq_rb(ad, rq); + elv_rb_add(RQ_RB_ROOT(ad, rq), rq); /* * set expire time and add to fifo list @@ -1270,7 +1260,7 @@ static void as_merged_request(struct req */ if (type == ELEVATOR_FRONT_MERGE) { as_del_rq_rb(ad, req); - as_add_rq_rb(ad, req); + elv_rb_add(RQ_RB_ROOT(ad, req), req); /* * Note! At this stage of this and the next function, our next * request may not be optimal - eg the request may have "grown" diff -uprN linux-2.6.32/block/blk-barrier.c linux-2.6.32.ovz/block/blk-barrier.c --- linux-2.6.32/block/blk-barrier.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/blk-barrier.c 1969-12-31 16:00:00.000000000 -0800 @@ -1,445 +0,0 @@ -/* - * Functions related to barrier IO handling - */ -#include -#include -#include -#include - -#include "blk.h" - -/** - * blk_queue_ordered - does this queue support ordered writes - * @q: the request queue - * @ordered: one of QUEUE_ORDERED_* - * @prepare_flush_fn: rq setup helper for cache flush ordered writes - * - * Description: - * For journalled file systems, doing ordered writes on a commit - * block instead of explicitly doing wait_on_buffer (which is bad - * for performance) can be a big win. Block drivers supporting this - * feature should call this function and indicate so. - * - **/ -int blk_queue_ordered(struct request_queue *q, unsigned ordered, - prepare_flush_fn *prepare_flush_fn) -{ - if (!prepare_flush_fn && (ordered & (QUEUE_ORDERED_DO_PREFLUSH | - QUEUE_ORDERED_DO_POSTFLUSH))) { - printk(KERN_ERR "%s: prepare_flush_fn required\n", __func__); - return -EINVAL; - } - - if (ordered != QUEUE_ORDERED_NONE && - ordered != QUEUE_ORDERED_DRAIN && - ordered != QUEUE_ORDERED_DRAIN_FLUSH && - ordered != QUEUE_ORDERED_DRAIN_FUA && - ordered != QUEUE_ORDERED_TAG && - ordered != QUEUE_ORDERED_TAG_FLUSH && - ordered != QUEUE_ORDERED_TAG_FUA) { - printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); - return -EINVAL; - } - - q->ordered = ordered; - q->next_ordered = ordered; - q->prepare_flush_fn = prepare_flush_fn; - - return 0; -} -EXPORT_SYMBOL(blk_queue_ordered); - -/* - * Cache flushing for ordered writes handling - */ -unsigned blk_ordered_cur_seq(struct request_queue *q) -{ - if (!q->ordseq) - return 0; - return 1 << ffz(q->ordseq); -} - -unsigned blk_ordered_req_seq(struct request *rq) -{ - struct request_queue *q = rq->q; - - BUG_ON(q->ordseq == 0); - - if (rq == &q->pre_flush_rq) - return QUEUE_ORDSEQ_PREFLUSH; - if (rq == &q->bar_rq) - return QUEUE_ORDSEQ_BAR; - if (rq == &q->post_flush_rq) - return QUEUE_ORDSEQ_POSTFLUSH; - - /* - * !fs requests don't need to follow barrier ordering. Always - * put them at the front. This fixes the following deadlock. - * - * http://thread.gmane.org/gmane.linux.kernel/537473 - */ - if (!blk_fs_request(rq)) - return QUEUE_ORDSEQ_DRAIN; - - if ((rq->cmd_flags & REQ_ORDERED_COLOR) == - (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) - return QUEUE_ORDSEQ_DRAIN; - else - return QUEUE_ORDSEQ_DONE; -} - -bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) -{ - struct request *rq; - - if (error && !q->orderr) - q->orderr = error; - - BUG_ON(q->ordseq & seq); - q->ordseq |= seq; - - if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) - return false; - - /* - * Okay, sequence complete. - */ - q->ordseq = 0; - rq = q->orig_bar_rq; - __blk_end_request_all(rq, q->orderr); - return true; -} - -static void pre_flush_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); -} - -static void bar_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); -} - -static void post_flush_end_io(struct request *rq, int error) -{ - elv_completed_request(rq->q, rq); - blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); -} - -static void queue_flush(struct request_queue *q, unsigned which) -{ - struct request *rq; - rq_end_io_fn *end_io; - - if (which == QUEUE_ORDERED_DO_PREFLUSH) { - rq = &q->pre_flush_rq; - end_io = pre_flush_end_io; - } else { - rq = &q->post_flush_rq; - end_io = post_flush_end_io; - } - - blk_rq_init(q, rq); - rq->cmd_flags = REQ_HARDBARRIER; - rq->rq_disk = q->bar_rq.rq_disk; - rq->end_io = end_io; - q->prepare_flush_fn(q, rq); - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); -} - -static inline bool start_ordered(struct request_queue *q, struct request **rqp) -{ - struct request *rq = *rqp; - unsigned skip = 0; - - q->orderr = 0; - q->ordered = q->next_ordered; - q->ordseq |= QUEUE_ORDSEQ_STARTED; - - /* - * For an empty barrier, there's no actual BAR request, which - * in turn makes POSTFLUSH unnecessary. Mask them off. - */ - if (!blk_rq_sectors(rq)) { - q->ordered &= ~(QUEUE_ORDERED_DO_BAR | - QUEUE_ORDERED_DO_POSTFLUSH); - /* - * Empty barrier on a write-through device w/ ordered - * tag has no command to issue and without any command - * to issue, ordering by tag can't be used. Drain - * instead. - */ - if ((q->ordered & QUEUE_ORDERED_BY_TAG) && - !(q->ordered & QUEUE_ORDERED_DO_PREFLUSH)) { - q->ordered &= ~QUEUE_ORDERED_BY_TAG; - q->ordered |= QUEUE_ORDERED_BY_DRAIN; - } - } - - /* stash away the original request */ - blk_dequeue_request(rq); - q->orig_bar_rq = rq; - rq = NULL; - - /* - * Queue ordered sequence. As we stack them at the head, we - * need to queue in reverse order. Note that we rely on that - * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs - * request gets inbetween ordered sequence. - */ - if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) { - queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH); - rq = &q->post_flush_rq; - } else - skip |= QUEUE_ORDSEQ_POSTFLUSH; - - if (q->ordered & QUEUE_ORDERED_DO_BAR) { - rq = &q->bar_rq; - - /* initialize proxy request and queue it */ - blk_rq_init(q, rq); - if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) - rq->cmd_flags |= REQ_RW; - if (q->ordered & QUEUE_ORDERED_DO_FUA) - rq->cmd_flags |= REQ_FUA; - init_request_from_bio(rq, q->orig_bar_rq->bio); - rq->end_io = bar_end_io; - - elv_insert(q, rq, ELEVATOR_INSERT_FRONT); - } else - skip |= QUEUE_ORDSEQ_BAR; - - if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) { - queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH); - rq = &q->pre_flush_rq; - } else - skip |= QUEUE_ORDSEQ_PREFLUSH; - - if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q)) - rq = NULL; - else - skip |= QUEUE_ORDSEQ_DRAIN; - - *rqp = rq; - - /* - * Complete skipped sequences. If whole sequence is complete, - * return false to tell elevator that this request is gone. - */ - return !blk_ordered_complete_seq(q, skip, 0); -} - -bool blk_do_ordered(struct request_queue *q, struct request **rqp) -{ - struct request *rq = *rqp; - const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); - - if (!q->ordseq) { - if (!is_barrier) - return true; - - if (q->next_ordered != QUEUE_ORDERED_NONE) - return start_ordered(q, rqp); - else { - /* - * Queue ordering not supported. Terminate - * with prejudice. - */ - blk_dequeue_request(rq); - __blk_end_request_all(rq, -EOPNOTSUPP); - *rqp = NULL; - return false; - } - } - - /* - * Ordered sequence in progress - */ - - /* Special requests are not subject to ordering rules. */ - if (!blk_fs_request(rq) && - rq != &q->pre_flush_rq && rq != &q->post_flush_rq) - return true; - - if (q->ordered & QUEUE_ORDERED_BY_TAG) { - /* Ordered by tag. Blocking the next barrier is enough. */ - if (is_barrier && rq != &q->bar_rq) - *rqp = NULL; - } else { - /* Ordered by draining. Wait for turn. */ - WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); - if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) - *rqp = NULL; - } - - return true; -} - -static void bio_end_empty_barrier(struct bio *bio, int err) -{ - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); - } - - complete(bio->bi_private); -} - -/** - * blkdev_issue_flush - queue a flush - * @bdev: blockdev to issue flush for - * @error_sector: error sector - * - * Description: - * Issue a flush for the block device in question. Caller can supply - * room for storing the error offset in case of a flush error, if they - * wish to. - */ -int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) -{ - DECLARE_COMPLETION_ONSTACK(wait); - struct request_queue *q; - struct bio *bio; - int ret; - - if (bdev->bd_disk == NULL) - return -ENXIO; - - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - - bio = bio_alloc(GFP_KERNEL, 0); - bio->bi_end_io = bio_end_empty_barrier; - bio->bi_private = &wait; - bio->bi_bdev = bdev; - submit_bio(WRITE_BARRIER, bio); - - wait_for_completion(&wait); - - /* - * The driver must store the error location in ->bi_sector, if - * it supports it. For non-stacked drivers, this should be copied - * from blk_rq_pos(rq). - */ - if (error_sector) - *error_sector = bio->bi_sector; - - ret = 0; - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - - bio_put(bio); - return ret; -} -EXPORT_SYMBOL(blkdev_issue_flush); - -static void blkdev_discard_end_io(struct bio *bio, int err) -{ - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - clear_bit(BIO_UPTODATE, &bio->bi_flags); - } - - if (bio->bi_private) - complete(bio->bi_private); - __free_page(bio_page(bio)); - - bio_put(bio); -} - -/** - * blkdev_issue_discard - queue a discard - * @bdev: blockdev to issue discard for - * @sector: start sector - * @nr_sects: number of sectors to discard - * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: DISCARD_FL_* flags to control behaviour - * - * Description: - * Issue a discard request for the sectors in question. - */ -int blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, int flags) -{ - DECLARE_COMPLETION_ONSTACK(wait); - struct request_queue *q = bdev_get_queue(bdev); - int type = flags & DISCARD_FL_BARRIER ? - DISCARD_BARRIER : DISCARD_NOBARRIER; - struct bio *bio; - struct page *page; - int ret = 0; - - if (!q) - return -ENXIO; - - if (!blk_queue_discard(q)) - return -EOPNOTSUPP; - - while (nr_sects && !ret) { - unsigned int sector_size = q->limits.logical_block_size; - unsigned int max_discard_sectors = - min(q->limits.max_discard_sectors, UINT_MAX >> 9); - - bio = bio_alloc(gfp_mask, 1); - if (!bio) - goto out; - bio->bi_sector = sector; - bio->bi_end_io = blkdev_discard_end_io; - bio->bi_bdev = bdev; - if (flags & DISCARD_FL_WAIT) - bio->bi_private = &wait; - - /* - * Add a zeroed one-sector payload as that's what - * our current implementations need. If we'll ever need - * more the interface will need revisiting. - */ - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) - goto out_free_bio; - if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) - goto out_free_page; - - /* - * And override the bio size - the way discard works we - * touch many more blocks on disk than the actual payload - * length. - */ - if (nr_sects > max_discard_sectors) { - bio->bi_size = max_discard_sectors << 9; - nr_sects -= max_discard_sectors; - sector += max_discard_sectors; - } else { - bio->bi_size = nr_sects << 9; - nr_sects = 0; - } - - bio_get(bio); - submit_bio(type, bio); - - if (flags & DISCARD_FL_WAIT) - wait_for_completion(&wait); - - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - else if (!bio_flagged(bio, BIO_UPTODATE)) - ret = -EIO; - bio_put(bio); - } - return ret; -out_free_page: - __free_page(page); -out_free_bio: - bio_put(bio); -out: - return -ENOMEM; -} -EXPORT_SYMBOL(blkdev_issue_discard); diff -uprN linux-2.6.32/block/blk-cgroup.c linux-2.6.32.ovz/block/blk-cgroup.c --- linux-2.6.32/block/blk-cgroup.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/block/blk-cgroup.c 2015-06-23 04:16:16.000000000 -0700 @@ -0,0 +1,1710 @@ +/* + * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ +#include +#include +#include +#include +#include +#include +#include "blk-cgroup.h" +#include + +#define MAX_KEY_LEN 100 + +static DEFINE_SPINLOCK(blkio_list_lock); +static LIST_HEAD(blkio_list); + +/* for encoding cft->private value on file */ +#define BLKIOFILE_PRIVATE(x, val) (((x) << 16) | (val)) +/* What policy owns the file, proportional or throttle */ +#define BLKIOFILE_POLICY(val) (((val) >> 16) & 0xffff) +#define BLKIOFILE_ATTR(val) ((val) & 0xffff) + +struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; +EXPORT_SYMBOL_GPL(blkio_root_cgroup); + +static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg, + struct blkio_policy_node *pn) +{ + list_add(&pn->node, &blkcg->policy_list); +} + +static inline bool cftype_blkg_same_policy(struct cftype *cft, + struct blkio_group *blkg) +{ + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + + if (blkg->plid == plid) + return 1; + + return 0; +} + +/* Determines if policy node matches cgroup file being accessed */ +static inline bool pn_matches_cftype(struct cftype *cft, + struct blkio_policy_node *pn) +{ + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int fileid = BLKIOFILE_ATTR(cft->private); + + return (plid == pn->plid && fileid == pn->fileid); +} + +/* Must be called with blkcg->lock held */ +static inline void blkio_policy_delete_node(struct blkio_policy_node *pn) +{ + list_del(&pn->node); +} + +/* Must be called with blkcg->lock held */ +static struct blkio_policy_node * +blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev, + enum blkio_policy_id plid, int fileid) +{ + struct blkio_policy_node *pn; + + list_for_each_entry(pn, &blkcg->policy_list, node) { + if (pn->dev == dev && pn->plid == plid && pn->fileid == fileid) + return pn; + } + + return NULL; +} + +struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) +{ + return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), + struct blkio_cgroup, css); +} +EXPORT_SYMBOL_GPL(cgroup_to_blkio_cgroup); + +struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, blkio_subsys_id), + struct blkio_cgroup, css); +} +EXPORT_SYMBOL_GPL(task_blkio_cgroup); + +static inline void +blkio_update_group_weight(struct blkio_group *blkg, unsigned int weight) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + if (blkiop->ops.blkio_update_group_weight_fn) + blkiop->ops.blkio_update_group_weight_fn(blkg->key, + blkg, weight); + } +} + +static inline void blkio_update_group_bps(struct blkio_group *blkg, u64 bps, + int fileid) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + + if (fileid == BLKIO_THROTL_read_bps_device + && blkiop->ops.blkio_update_group_read_bps_fn) + blkiop->ops.blkio_update_group_read_bps_fn(blkg->key, + blkg, bps); + + if (fileid == BLKIO_THROTL_write_bps_device + && blkiop->ops.blkio_update_group_write_bps_fn) + blkiop->ops.blkio_update_group_write_bps_fn(blkg->key, + blkg, bps); + } +} + +static inline void blkio_update_group_iops(struct blkio_group *blkg, + unsigned int iops, int fileid) +{ + struct blkio_policy_type *blkiop; + + list_for_each_entry(blkiop, &blkio_list, list) { + + /* If this policy does not own the blkg, do not send updates */ + if (blkiop->plid != blkg->plid) + continue; + + if (fileid == BLKIO_THROTL_read_iops_device + && blkiop->ops.blkio_update_group_read_iops_fn) + blkiop->ops.blkio_update_group_read_iops_fn(blkg->key, + blkg, iops); + + if (fileid == BLKIO_THROTL_write_iops_device + && blkiop->ops.blkio_update_group_write_iops_fn) + blkiop->ops.blkio_update_group_write_iops_fn(blkg->key, + blkg,iops); + } +} + +/* + * Add to the appropriate stat variable depending on the request type. + * This should be called with the blkg->stats_lock held. + */ +static void blkio_add_stat(uint64_t *stat, uint64_t add, bool direction, + bool sync) +{ + if (direction) + stat[BLKIO_STAT_WRITE] += add; + else + stat[BLKIO_STAT_READ] += add; + if (sync) + stat[BLKIO_STAT_SYNC] += add; + else + stat[BLKIO_STAT_ASYNC] += add; +} + +static void blkio_max_stat(uint64_t *stat, uint64_t cur, bool direction, + bool sync) +{ + if (direction) + stat[BLKIO_STAT_WRITE] = max(stat[BLKIO_STAT_WRITE], cur); + else + stat[BLKIO_STAT_READ] = max(stat[BLKIO_STAT_READ], cur); + if (sync) + stat[BLKIO_STAT_SYNC] = max(stat[BLKIO_STAT_SYNC], cur); + else + stat[BLKIO_STAT_ASYNC] = max(stat[BLKIO_STAT_ASYNC], cur); +} + +/* + * Decrements the appropriate stat variable if non-zero depending on the + * request type. Panics on value being zero. + * This should be called with the blkg->stats_lock held. + */ +static void blkio_check_and_dec_stat(uint64_t *stat, bool direction, bool sync) +{ + if (direction) { + BUG_ON(stat[BLKIO_STAT_WRITE] == 0); + stat[BLKIO_STAT_WRITE]--; + } else { + BUG_ON(stat[BLKIO_STAT_READ] == 0); + stat[BLKIO_STAT_READ]--; + } + if (sync) { + BUG_ON(stat[BLKIO_STAT_SYNC] == 0); + stat[BLKIO_STAT_SYNC]--; + } else { + BUG_ON(stat[BLKIO_STAT_ASYNC] == 0); + stat[BLKIO_STAT_ASYNC]--; + } +} + +#ifdef CONFIG_DEBUG_BLK_CGROUP +/* This should be called with the blkg->stats_lock held. */ +static void blkio_set_start_group_wait_time(struct blkio_group *blkg, + struct blkio_group *curr_blkg) +{ + if (blkio_blkg_waiting(&blkg->stats)) + return; + if (blkg == curr_blkg) + return; + blkg->stats.start_group_wait_time = sched_clock(); + blkio_mark_blkg_waiting(&blkg->stats); +} + +/* This should be called with the blkg->stats_lock held. */ +static void blkio_update_group_wait_time(struct blkio_group_stats *stats) +{ + unsigned long long now; + + if (!blkio_blkg_waiting(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_group_wait_time)) + stats->group_wait_time += now - stats->start_group_wait_time; + blkio_clear_blkg_waiting(stats); +} + +/* This should be called with the blkg->stats_lock held. */ +static void blkio_end_empty_time(struct blkio_group_stats *stats) +{ + unsigned long long now; + + if (!blkio_blkg_empty(stats)) + return; + + now = sched_clock(); + if (time_after64(now, stats->start_empty_time)) + stats->empty_time += now - stats->start_empty_time; + blkio_clear_blkg_empty(stats); +} + +void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + BUG_ON(blkio_blkg_idling(&blkg->stats)); + blkg->stats.start_idle_time = sched_clock(); + blkio_mark_blkg_idling(&blkg->stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_set_idle_time_stats); + +void blkiocg_update_idle_time_stats(struct blkio_group *blkg) +{ + unsigned long flags; + unsigned long long now; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + if (blkio_blkg_idling(stats)) { + now = sched_clock(); + if (time_after64(now, stats->start_idle_time)) + stats->idle_time += now - stats->start_idle_time; + blkio_clear_blkg_idling(stats); + } + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_idle_time_stats); + +void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) +{ + unsigned long flags; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + stats->avg_queue_size_sum += + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] + + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]; + stats->avg_queue_size_samples++; + blkio_update_group_wait_time(stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_avg_queue_size_stats); + +void blkiocg_set_start_empty_time(struct blkio_group *blkg) +{ + unsigned long flags; + struct blkio_group_stats *stats; + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + + if (stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_READ] || + stats->stat_arr[BLKIO_STAT_QUEUED][BLKIO_STAT_WRITE]) { + spin_unlock_irqrestore(&blkg->stats_lock, flags); + return; + } + + /* + * group is already marked empty. This can happen if cfqq got new + * request in parent group and moved to this group while being added + * to service tree. Just ignore the event and move on. + */ + if(blkio_blkg_empty(stats)) { + spin_unlock_irqrestore(&blkg->stats_lock, flags); + return; + } + + stats->start_empty_time = sched_clock(); + blkio_mark_blkg_empty(stats); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_set_start_empty_time); + +void blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) +{ + blkg->stats.dequeue += dequeue; +} +EXPORT_SYMBOL_GPL(blkiocg_update_dequeue_stats); +#else +static inline void blkio_set_start_group_wait_time(struct blkio_group *blkg, + struct blkio_group *curr_blkg) {} +static inline void blkio_end_empty_time(struct blkio_group_stats *stats) {} +#endif + +void blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, + bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], 1, direction, + sync); + blkio_end_empty_time(&blkg->stats); + blkio_set_start_group_wait_time(blkg, curr_blkg); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_io_add_stats); + +void blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_check_and_dec_stat(blkg->stats.stat_arr[BLKIO_STAT_QUEUED], + direction, sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_io_remove_stats); + +void blkiocg_update_timeslice_used(struct blkio_group *blkg, unsigned long time) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkg->stats.time += time; + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_timeslice_used); + +/* + * should be called under rcu read lock or queue lock to make sure blkg pointer + * is valid. + */ +void blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) +{ + struct blkio_group_stats_cpu *stats_cpu; + unsigned long flags; + + /* + * Disabling interrupts to provide mutual exclusion between two + * writes on same cpu. It probably is not needed for 64bit. Not + * optimizing that case yet. + */ + local_irq_save(flags); + + stats_cpu = this_cpu_ptr(blkg->stats_cpu); + + u64_stats_update_begin(&stats_cpu->syncp); + stats_cpu->sectors += bytes >> 9; + blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICED], + 1, direction, sync); + blkio_add_stat(stats_cpu->stat_arr_cpu[BLKIO_STAT_CPU_SERVICE_BYTES], + bytes, direction, sync); + u64_stats_update_end(&stats_cpu->syncp); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_dispatch_stats); + +void blkiocg_update_completion_stats(struct blkio_group *blkg, + uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) +{ + struct blkio_group_stats *stats; + unsigned long flags; + unsigned long long now = sched_clock(); + + spin_lock_irqsave(&blkg->stats_lock, flags); + stats = &blkg->stats; + if (time_after64(now, io_start_time)) + blkio_add_stat(stats->stat_arr[BLKIO_STAT_SERVICE_TIME], + now - io_start_time, direction, sync); + if (time_after64(io_start_time, start_time)) { + blkio_add_stat(stats->stat_arr[BLKIO_STAT_WAIT_TIME], + io_start_time - start_time, direction, sync); + blkio_max_stat(stats->stat_arr[BLKIO_STAT_WAIT_MAX], + io_start_time - start_time, direction, sync); + } + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_completion_stats); + +void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, + bool sync) +{ + unsigned long flags; + + spin_lock_irqsave(&blkg->stats_lock, flags); + blkio_add_stat(blkg->stats.stat_arr[BLKIO_STAT_MERGED], 1, direction, + sync); + spin_unlock_irqrestore(&blkg->stats_lock, flags); +} +EXPORT_SYMBOL_GPL(blkiocg_update_io_merged_stats); + +static LIST_HEAD(stats_alloc_list); +static DEFINE_SPINLOCK(stats_alloc_lock); + +static void blkio_stats_alloc_fn(struct work_struct *work) +{ + struct delayed_work *dw = container_of(work, struct delayed_work, work); + struct blkio_group_stats_cpu __percpu *stats; + struct blkio_group *blkg; + + spin_lock_irq(&stats_alloc_lock); + while (!list_empty(&stats_alloc_list)) { + spin_unlock_irq(&stats_alloc_lock); + + stats = alloc_percpu(struct blkio_group_stats_cpu); + if (!stats) { + /* Cannot fail, try again after timeout */ + schedule_delayed_work(dw, HZ); + return; + } + + spin_lock_irq(&stats_alloc_lock); + if (list_empty(&stats_alloc_list)) { + free_percpu(stats); + break; + } + blkg = list_first_entry(&stats_alloc_list, + struct blkio_group, stats_alloc_list); + list_del_init(&blkg->stats_alloc_list); + blkg->stats_cpu = stats; + } + spin_unlock_irq(&stats_alloc_lock); +} + +static DECLARE_DELAYED_WORK(stats_alloc_work, blkio_stats_alloc_fn); +static DEFINE_PER_CPU(struct blkio_group_stats_cpu, stats_plug); + +int blkio_alloc_blkg_stats(struct blkio_group *blkg) +{ + unsigned long flags; + + /* Set temporary plug */ + blkg->stats_cpu = &per_cpu_var(stats_plug); + + /* Queue per cpu stat allocation from worker thread. */ + spin_lock_irqsave(&stats_alloc_lock, flags); + list_add(&blkg->stats_alloc_list, &stats_alloc_list); + spin_unlock_irqrestore(&stats_alloc_lock, flags); + + schedule_delayed_work(&stats_alloc_work, 0); + return 0; +} +EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats); + +void blkio_free_blkg_stats(struct blkio_group *blkg) +{ + unsigned long flags; + + if (!blkg->stats_cpu) + return; + + /* Cancel pending stats allocation */ + spin_lock_irqsave(&stats_alloc_lock, flags); + list_del_init(&blkg->stats_alloc_list); + spin_unlock_irqrestore(&stats_alloc_lock, flags); + + if (blkg->stats_cpu != &per_cpu_var(stats_plug)) + free_percpu(blkg->stats_cpu); +} +EXPORT_SYMBOL_GPL(blkio_free_blkg_stats); + +void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid) +{ + unsigned long flags; + + spin_lock_irqsave(&blkcg->lock, flags); + spin_lock_init(&blkg->stats_lock); + rcu_assign_pointer(blkg->key, key); + blkg->blkcg_id = css_id(&blkcg->css); + hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + blkg->plid = plid; + blkg->blk_ub = blkcg->blk_ub; + spin_unlock_irqrestore(&blkcg->lock, flags); + /* Need to take css reference ? */ + cgroup_path(blkcg->css.cgroup, blkg->path, sizeof(blkg->path)); + blkg->dev = dev; +} +EXPORT_SYMBOL_GPL(blkiocg_add_blkio_group); + +static void __blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + hlist_del_init_rcu(&blkg->blkcg_node); + blkg->blkcg_id = 0; +} + +/* + * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1 + * indicating that blk_group was unhashed by the time we got to it. + */ +int blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + struct blkio_cgroup *blkcg; + unsigned long flags; + struct cgroup_subsys_state *css; + int ret = 1; + + rcu_read_lock(); + css = css_lookup(&blkio_subsys, blkg->blkcg_id); + if (css) { + blkcg = container_of(css, struct blkio_cgroup, css); + spin_lock_irqsave(&blkcg->lock, flags); + if (!hlist_unhashed(&blkg->blkcg_node)) { + __blkiocg_del_blkio_group(blkg); + ret = 0; + } + spin_unlock_irqrestore(&blkcg->lock, flags); + } + + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(blkiocg_del_blkio_group); + +/* called under rcu_read_lock(). */ +struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) +{ + struct blkio_group *blkg; + struct hlist_node *n; + void *__key; + + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { + __key = blkg->key; + if (__key == key) + return blkg; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(blkiocg_lookup_group); + +static void blkio_reset_stats_cpu(struct blkio_group *blkg) +{ + struct blkio_group_stats_cpu *stats_cpu; + int i, j, k; + /* + * Note: On 64 bit arch this should not be an issue. This has the + * possibility of returning some inconsistent value on 32bit arch + * as 64bit update on 32bit is non atomic. Taking care of this + * corner case makes code very complicated, like sending IPIs to + * cpus, taking care of stats of offline cpus etc. + * + * reset stats is anyway more of a debug feature and this sounds a + * corner case. So I am not complicating the code yet until and + * unless this becomes a real issue. + */ + for_each_possible_cpu(i) { + stats_cpu = per_cpu_ptr(blkg->stats_cpu, i); + stats_cpu->sectors = 0; + for(j = 0; j < BLKIO_STAT_CPU_NR; j++) + for (k = 0; k < BLKIO_STAT_TOTAL; k++) + stats_cpu->stat_arr_cpu[j][k] = 0; + } +} + +static int +blkiocg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, u64 val) +{ + struct blkio_cgroup *blkcg; + struct blkio_group *blkg; + struct blkio_group_stats *stats; + struct hlist_node *n; + uint64_t queued[BLKIO_STAT_TOTAL]; + int i; +#ifdef CONFIG_DEBUG_BLK_CGROUP + bool idling, waiting, empty; + unsigned long long now = sched_clock(); +#endif + + blkcg = cgroup_to_blkio_cgroup(cgroup); + spin_lock_irq(&blkcg->lock); + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + spin_lock(&blkg->stats_lock); + stats = &blkg->stats; +#ifdef CONFIG_DEBUG_BLK_CGROUP + idling = blkio_blkg_idling(stats); + waiting = blkio_blkg_waiting(stats); + empty = blkio_blkg_empty(stats); +#endif + for (i = 0; i < BLKIO_STAT_TOTAL; i++) + queued[i] = stats->stat_arr[BLKIO_STAT_QUEUED][i]; + memset(stats, 0, sizeof(struct blkio_group_stats)); + for (i = 0; i < BLKIO_STAT_TOTAL; i++) + stats->stat_arr[BLKIO_STAT_QUEUED][i] = queued[i]; +#ifdef CONFIG_DEBUG_BLK_CGROUP + if (idling) { + blkio_mark_blkg_idling(stats); + stats->start_idle_time = now; + } + if (waiting) { + blkio_mark_blkg_waiting(stats); + stats->start_group_wait_time = now; + } + if (empty) { + blkio_mark_blkg_empty(stats); + stats->start_empty_time = now; + } +#endif + spin_unlock(&blkg->stats_lock); + + /* Reset Per cpu stats which don't take blkg->stats_lock */ + blkio_reset_stats_cpu(blkg); + } + + spin_unlock_irq(&blkcg->lock); + return 0; +} + +static void blkio_get_key_name(enum stat_sub_type type, dev_t dev, char *str, + int chars_left, bool diskname_only) +{ + snprintf(str, chars_left, "%d:%d", MAJOR(dev), MINOR(dev)); + chars_left -= strlen(str); + if (chars_left <= 0) { + printk(KERN_WARNING + "Possibly incorrect cgroup stat display format"); + return; + } + if (diskname_only) + return; + switch (type) { + case BLKIO_STAT_READ: + strlcat(str, " Read", chars_left); + break; + case BLKIO_STAT_WRITE: + strlcat(str, " Write", chars_left); + break; + case BLKIO_STAT_SYNC: + strlcat(str, " Sync", chars_left); + break; + case BLKIO_STAT_ASYNC: + strlcat(str, " Async", chars_left); + break; + case BLKIO_STAT_TOTAL: + strlcat(str, " Total", chars_left); + break; + default: + strlcat(str, " Invalid", chars_left); + } +} + +static uint64_t blkio_fill_stat(char *str, int chars_left, uint64_t val, + struct cgroup_map_cb *cb, dev_t dev) +{ + blkio_get_key_name(0, dev, str, chars_left, true); + cb->fill(cb, str, val); + return val; +} + + +uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, + enum stat_type_cpu type, enum stat_sub_type sub_type) +{ + int cpu; + struct blkio_group_stats_cpu *stats_cpu; + u64 val = 0, tval; + + for_each_possible_cpu(cpu) { + unsigned int start; + stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu); + + do { + start = u64_stats_fetch_begin(&stats_cpu->syncp); + if (type == BLKIO_STAT_CPU_SECTORS) + tval = stats_cpu->sectors; + else + tval = stats_cpu->stat_arr_cpu[type][sub_type]; + } while(u64_stats_fetch_retry(&stats_cpu->syncp, start)); + + val += tval; + } + + return val; +} + +static uint64_t blkio_get_stat_cpu(struct blkio_group *blkg, + struct cgroup_map_cb *cb, dev_t dev, enum stat_type_cpu type) +{ + uint64_t disk_total, val; + char key_str[MAX_KEY_LEN]; + enum stat_sub_type sub_type; + + if (type == BLKIO_STAT_CPU_SECTORS) { + val = blkio_read_stat_cpu(blkg, type, 0); + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, val, cb, dev); + } + + for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; + sub_type++) { + blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); + val = blkio_read_stat_cpu(blkg, type, sub_type); + cb->fill(cb, key_str, val); + } + + disk_total = blkio_read_stat_cpu(blkg, type, BLKIO_STAT_READ) + + blkio_read_stat_cpu(blkg, type, BLKIO_STAT_WRITE); + + blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); + cb->fill(cb, key_str, disk_total); + return disk_total; +} + +/* This should be called with blkg->stats_lock held */ +static uint64_t blkio_get_stat(struct blkio_group *blkg, + struct cgroup_map_cb *cb, dev_t dev, enum stat_type type) +{ + uint64_t disk_total; + char key_str[MAX_KEY_LEN]; + enum stat_sub_type sub_type; + + if (type == BLKIO_STAT_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.time, cb, dev); +#ifdef CONFIG_DEBUG_BLK_CGROUP + if (type == BLKIO_STAT_AVG_QUEUE_SIZE) { + uint64_t sum = blkg->stats.avg_queue_size_sum; + uint64_t samples = blkg->stats.avg_queue_size_samples; + if (samples) + do_div(sum, samples); + else + sum = 0; + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, sum, cb, dev); + } + if (type == BLKIO_STAT_GROUP_WAIT_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.group_wait_time, cb, dev); + if (type == BLKIO_STAT_IDLE_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.idle_time, cb, dev); + if (type == BLKIO_STAT_EMPTY_TIME) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.empty_time, cb, dev); + if (type == BLKIO_STAT_DEQUEUE) + return blkio_fill_stat(key_str, MAX_KEY_LEN - 1, + blkg->stats.dequeue, cb, dev); +#endif + + for (sub_type = BLKIO_STAT_READ; sub_type < BLKIO_STAT_TOTAL; + sub_type++) { + blkio_get_key_name(sub_type, dev, key_str, MAX_KEY_LEN, false); + cb->fill(cb, key_str, blkg->stats.stat_arr[type][sub_type]); + } + disk_total = blkg->stats.stat_arr[type][BLKIO_STAT_READ] + + blkg->stats.stat_arr[type][BLKIO_STAT_WRITE]; + blkio_get_key_name(BLKIO_STAT_TOTAL, dev, key_str, MAX_KEY_LEN, false); + cb->fill(cb, key_str, disk_total); + return disk_total; +} + +static int blkio_policy_parse_and_set(char *buf, + struct blkio_policy_node *newpn, enum blkio_policy_id plid, int fileid) +{ + struct gendisk *disk = NULL; + char *s[4], *p, *major_s = NULL, *minor_s = NULL; + unsigned long major, minor, temp; + int i = 0, ret = -EINVAL; + int part; + dev_t dev; + u64 bps, iops; + + memset(s, 0, sizeof(s)); + + /* Allow tab separated inputs too. Convert all tabs into spaces */ + while((p = strstr(buf, " ")) != NULL) + p[0] = ' '; + + while ((p = strsep(&buf, " ")) != NULL) { + if (!*p) + continue; + + s[i++] = p; + + /* Prevent from inputing too many things */ + if (i == 3) + break; + } + + if (i != 2) + goto out; + + p = strsep(&s[0], ":"); + if (p != NULL) + major_s = p; + else + goto out; + + minor_s = s[0]; + if (!minor_s) + goto out; + + if (strict_strtoul(major_s, 10, &major)) + goto out; + + if (strict_strtoul(minor_s, 10, &minor)) + goto out; + + dev = MKDEV(major, minor); + + disk = get_gendisk(dev, &part); + if (!disk || part) { + ret = -ENODEV; + goto out; + } + + newpn->dev = dev; + + if (s[1] == NULL) + return -EINVAL; + + switch (plid) { + case BLKIO_POLICY_PROP: + ret = strict_strtoul(s[1], 10, &temp); + if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) || + temp > BLKIO_WEIGHT_MAX) + goto out; + + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.weight = temp; + break; + case BLKIO_POLICY_THROTL: + switch(fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + if (strict_strtoull(s[1], 10, &bps)) + goto out; + + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.bps = bps; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + if (strict_strtoull(s[1], 10, &iops)) + goto out; + + if (iops > THROTL_IOPS_MAX) + goto out; + + newpn->plid = plid; + newpn->fileid = fileid; + newpn->val.iops = (unsigned int)iops; + break; + } + break; + default: + BUG(); + } + ret = 0; +out: + put_disk(disk); + return ret; +} + +unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, + dev_t dev) +{ + struct blkio_policy_node *pn; + + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_PROP, + BLKIO_PROP_weight_device); + if (pn) + return pn->val.weight; + else + return blkcg->weight; +} +EXPORT_SYMBOL_GPL(blkcg_get_weight); + +uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_bps_device); + if (pn) + return pn->val.bps; + else + return -1; +} + +uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_bps_device); + if (pn) + return pn->val.bps; + else + return -1; +} + +unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_iops_device); + if (pn) + return pn->val.iops; + else + return -1; +} + +unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, dev_t dev) +{ + struct blkio_policy_node *pn; + pn = blkio_policy_search_node(blkcg, dev, BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_iops_device); + if (pn) + return pn->val.iops; + else + return -1; +} + +/* Checks whether user asked for deleting a policy rule */ +static bool blkio_delete_rule_command(struct blkio_policy_node *pn) +{ + switch(pn->plid) { + case BLKIO_POLICY_PROP: + if (pn->val.weight == 0) + return 1; + break; + case BLKIO_POLICY_THROTL: + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + if (pn->val.bps == 0) + return 1; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + if (pn->val.iops == 0) + return 1; + } + break; + default: + BUG(); + } + + return 0; +} + +static void blkio_update_policy_rule(struct blkio_policy_node *oldpn, + struct blkio_policy_node *newpn) +{ + switch(oldpn->plid) { + case BLKIO_POLICY_PROP: + oldpn->val.weight = newpn->val.weight; + break; + case BLKIO_POLICY_THROTL: + switch(newpn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + oldpn->val.bps = newpn->val.bps; + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + oldpn->val.iops = newpn->val.iops; + } + break; + default: + BUG(); + } +} + +/* + * Some rules/values in blkg have changed. Propogate those to respective + * policies. + */ +static void blkio_update_blkg_policy(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, struct blkio_policy_node *pn) +{ + unsigned int weight, iops; + u64 bps; + + switch(pn->plid) { + case BLKIO_POLICY_PROP: + weight = pn->val.weight ? pn->val.weight : + blkcg->weight; + blkio_update_group_weight(blkg, weight); + break; + case BLKIO_POLICY_THROTL: + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + bps = pn->val.bps ? pn->val.bps : (-1); + blkio_update_group_bps(blkg, bps, pn->fileid); + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + iops = pn->val.iops ? pn->val.iops : (-1); + blkio_update_group_iops(blkg, iops, pn->fileid); + break; + } + break; + default: + BUG(); + } +} + +/* + * A policy node rule has been updated. Propogate this update to all the + * block groups which might be affected by this update. + */ +static void blkio_update_policy_node_blkg(struct blkio_cgroup *blkcg, + struct blkio_policy_node *pn) +{ + struct blkio_group *blkg; + struct hlist_node *n; + + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (pn->dev != blkg->dev || pn->plid != blkg->plid) + continue; + blkio_update_blkg_policy(blkcg, blkg, pn); + } + + spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); +} + +static int blkiocg_file_write(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + int ret = 0; + char *buf; + struct blkio_policy_node *newpn, *pn; + struct blkio_cgroup *blkcg; + int keep_newpn = 0; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int fileid = BLKIOFILE_ATTR(cft->private); + + buf = kstrdup(buffer, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + newpn = kzalloc(sizeof(*newpn), GFP_KERNEL); + if (!newpn) { + ret = -ENOMEM; + goto free_buf; + } + + ret = blkio_policy_parse_and_set(buf, newpn, plid, fileid); + if (ret) + goto free_newpn; + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + spin_lock_irq(&blkcg->lock); + + pn = blkio_policy_search_node(blkcg, newpn->dev, plid, fileid); + if (!pn) { + if (!blkio_delete_rule_command(newpn)) { + blkio_policy_insert_node(blkcg, newpn); + keep_newpn = 1; + } + spin_unlock_irq(&blkcg->lock); + goto update_io_group; + } + + if (blkio_delete_rule_command(newpn)) { + blkio_policy_delete_node(pn); + spin_unlock_irq(&blkcg->lock); + goto update_io_group; + } + spin_unlock_irq(&blkcg->lock); + + blkio_update_policy_rule(pn, newpn); + +update_io_group: + blkio_update_policy_node_blkg(blkcg, newpn); + +free_newpn: + if (!keep_newpn) + kfree(newpn); +free_buf: + kfree(buf); + return ret; +} + +static void +blkio_print_policy_node(struct seq_file *m, struct blkio_policy_node *pn) +{ + switch(pn->plid) { + case BLKIO_POLICY_PROP: + if (pn->fileid == BLKIO_PROP_weight_device) + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.weight); + break; + case BLKIO_POLICY_THROTL: + switch(pn->fileid) { + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + seq_printf(m, "%u:%u\t%llu\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.bps); + break; + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev), + MINOR(pn->dev), pn->val.iops); + break; + } + break; + default: + BUG(); + } +} + +/* cgroup files which read their data from policy nodes end up here */ +static void blkio_read_policy_node_files(struct cftype *cft, + struct blkio_cgroup *blkcg, struct seq_file *m) +{ + struct blkio_policy_node *pn; + + if (!list_empty(&blkcg->policy_list)) { + spin_lock_irq(&blkcg->lock); + list_for_each_entry(pn, &blkcg->policy_list, node) { + if (!pn_matches_cftype(cft, pn)) + continue; + blkio_print_policy_node(m, pn); + } + spin_unlock_irq(&blkcg->lock); + } +} + +static int blkiocg_file_read(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *m) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight_device: + blkio_read_policy_node_files(cft, blkcg, m); + return 0; + default: + BUG(); + } + break; + case BLKIO_POLICY_THROTL: + switch(name){ + case BLKIO_THROTL_read_bps_device: + case BLKIO_THROTL_write_bps_device: + case BLKIO_THROTL_read_iops_device: + case BLKIO_THROTL_write_iops_device: + blkio_read_policy_node_files(cft, blkcg, m); + return 0; + default: + BUG(); + } + break; + default: + BUG(); + } + + return 0; +} + +static int blkio_read_blkg_stats(struct blkio_cgroup *blkcg, + struct cftype *cft, struct cgroup_map_cb *cb, + enum stat_type type, bool show_total, bool pcpu) +{ + struct blkio_group *blkg; + struct hlist_node *n; + uint64_t cgroup_total = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(blkg, n, &blkcg->blkg_list, blkcg_node) { + if (blkg->dev) { + if (!cftype_blkg_same_policy(cft, blkg)) + continue; + if (pcpu) + cgroup_total += blkio_get_stat_cpu(blkg, cb, + blkg->dev, type); + else { + spin_lock_irq(&blkg->stats_lock); + cgroup_total += blkio_get_stat(blkg, cb, + blkg->dev, type); + spin_unlock_irq(&blkg->stats_lock); + } + } + } + if (show_total) + cb->fill(cb, "Total", cgroup_total); + rcu_read_unlock(); + return 0; +} + +/* All map kind of cgroup file get serviced by this function */ +static int blkiocg_file_read_map(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_TIME, 0, 0); + case BLKIO_PROP_sectors: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_CPU_SECTORS, 0, 1); + case BLKIO_PROP_io_service_bytes: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); + case BLKIO_PROP_io_serviced: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_CPU_SERVICED, 1, 1); + case BLKIO_PROP_io_service_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_SERVICE_TIME, 1, 0); + case BLKIO_PROP_io_wait_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_WAIT_TIME, 1, 0); + case BLKIO_PROP_io_wait_max: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_WAIT_MAX, 0, 0); + case BLKIO_PROP_io_merged: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_MERGED, 1, 0); + case BLKIO_PROP_io_queued: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_QUEUED, 1, 0); +#ifdef CONFIG_DEBUG_BLK_CGROUP + case BLKIO_PROP_dequeue: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_DEQUEUE, 0, 0); + case BLKIO_PROP_avg_queue_size: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_AVG_QUEUE_SIZE, 0, 0); + case BLKIO_PROP_group_wait_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_GROUP_WAIT_TIME, 0, 0); + case BLKIO_PROP_idle_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_IDLE_TIME, 0, 0); + case BLKIO_PROP_empty_time: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_EMPTY_TIME, 0, 0); +#endif + default: + BUG(); + } + break; + case BLKIO_POLICY_THROTL: + switch(name){ + case BLKIO_THROTL_io_service_bytes: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_CPU_SERVICE_BYTES, 1, 1); + case BLKIO_THROTL_io_serviced: + return blkio_read_blkg_stats(blkcg, cft, cb, + BLKIO_STAT_CPU_SERVICED, 1, 1); + default: + BUG(); + } + break; + default: + BUG(); + } + + return 0; +} + +static int blkio_weight_write(struct blkio_cgroup *blkcg, u64 val) +{ + struct blkio_group *blkg; + struct hlist_node *n; + struct blkio_policy_node *pn; + + if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX) + return -EINVAL; + + spin_lock(&blkio_list_lock); + spin_lock_irq(&blkcg->lock); + blkcg->weight = (unsigned int)val; + + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { + pn = blkio_policy_search_node(blkcg, blkg->dev, + BLKIO_POLICY_PROP, BLKIO_PROP_weight_device); + if (pn) + continue; + + blkio_update_group_weight(blkg, blkcg->weight); + } + spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); + return 0; +} + +static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) { + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight: + return (u64)blkcg->weight; + } + break; + default: + BUG(); + } + return 0; +} + +static int +blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + struct blkio_cgroup *blkcg; + enum blkio_policy_id plid = BLKIOFILE_POLICY(cft->private); + int name = BLKIOFILE_ATTR(cft->private); + + blkcg = cgroup_to_blkio_cgroup(cgrp); + + switch(plid) { + case BLKIO_POLICY_PROP: + switch(name) { + case BLKIO_PROP_weight: + return blkio_weight_write(blkcg, val); + } + break; + default: + BUG(); + } + + return 0; +} + +int blkio_cgroup_set_weight(struct cgroup *cgroup, u64 weight) +{ + return blkio_weight_write(cgroup_to_blkio_cgroup(cgroup), weight); +} + +void blkio_cgroup_set_ub(struct cgroup *cgroup, struct user_beancounter *ub) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + struct blkio_group *blkg; + struct hlist_node *n; + unsigned long flags; + + spin_lock_irqsave(&blkcg->lock, flags); + blkcg->blk_ub = ub; + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) + blkg->blk_ub = ub; + spin_unlock_irqrestore(&blkcg->lock, flags); +} + +struct cftype blkio_files[] = { + { + .name = "weight_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_weight_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + { + .name = "weight", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_weight), + .read_u64 = blkiocg_file_read_u64, + .write_u64 = blkiocg_file_write_u64, + }, + { + .name = "time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_time), + .read_map = blkiocg_file_read_map, + }, + { + .name = "sectors", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_sectors), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_service_bytes", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_service_bytes), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_serviced", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_serviced), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_service_time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_service_time), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_wait_time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_wait_time), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_wait_max", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_wait_max), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_merged", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_merged), + .read_map = blkiocg_file_read_map, + }, + { + .name = "io_queued", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_io_queued), + .read_map = blkiocg_file_read_map, + }, + { + .name = "reset_stats", + .write_u64 = blkiocg_reset_stats, + }, +#ifdef CONFIG_BLK_DEV_THROTTLING + { + .name = "throttle.read_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.write_bps_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_bps_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.read_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_read_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + + { + .name = "throttle.write_iops_device", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_write_iops_device), + .read_seq_string = blkiocg_file_read, + .write_string = blkiocg_file_write, + .max_write_len = 256, + }, + { + .name = "throttle.io_service_bytes", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_service_bytes), + .read_map = blkiocg_file_read_map, + }, + { + .name = "throttle.io_serviced", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_THROTL, + BLKIO_THROTL_io_serviced), + .read_map = blkiocg_file_read_map, + }, +#endif /* CONFIG_BLK_DEV_THROTTLING */ + +#ifdef CONFIG_DEBUG_BLK_CGROUP + { + .name = "avg_queue_size", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_avg_queue_size), + .read_map = blkiocg_file_read_map, + }, + { + .name = "group_wait_time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_group_wait_time), + .read_map = blkiocg_file_read_map, + }, + { + .name = "idle_time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_idle_time), + .read_map = blkiocg_file_read_map, + }, + { + .name = "empty_time", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_empty_time), + .read_map = blkiocg_file_read_map, + }, + { + .name = "dequeue", + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP, + BLKIO_PROP_dequeue), + .read_map = blkiocg_file_read_map, + }, +#endif +}; + +static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + return cgroup_add_files(cgroup, subsys, blkio_files, + ARRAY_SIZE(blkio_files)); +} + +static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); + unsigned long flags; + struct blkio_group *blkg; + void *key; + struct blkio_policy_type *blkiop; + struct blkio_policy_node *pn, *pntmp; + + rcu_read_lock(); + do { + spin_lock_irqsave(&blkcg->lock, flags); + + if (hlist_empty(&blkcg->blkg_list)) { + spin_unlock_irqrestore(&blkcg->lock, flags); + break; + } + + blkg = hlist_entry(blkcg->blkg_list.first, struct blkio_group, + blkcg_node); + key = rcu_dereference(blkg->key); + __blkiocg_del_blkio_group(blkg); + + spin_unlock_irqrestore(&blkcg->lock, flags); + + /* + * This blkio_group is being unlinked as associated cgroup is + * going away. Let all the IO controlling policies know about + * this event. + */ + spin_lock(&blkio_list_lock); + list_for_each_entry(blkiop, &blkio_list, list) { + if (blkiop->plid != blkg->plid) + continue; + blkiop->ops.blkio_unlink_group_fn(key, blkg); + } + spin_unlock(&blkio_list_lock); + } while (1); + + list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) { + blkio_policy_delete_node(pn); + kfree(pn); + } + + free_css_id(&blkio_subsys, &blkcg->css); + rcu_read_unlock(); + kfree(blkcg); +} + +static struct cgroup_subsys_state * +blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup) +{ + struct blkio_cgroup *blkcg; + struct cgroup *parent = cgroup->parent; + + if (!parent) { + blkcg = &blkio_root_cgroup; + goto done; + } + + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) + return ERR_PTR(-ENOMEM); + + blkcg->weight = BLKIO_WEIGHT_DEFAULT; +done: + spin_lock_init(&blkcg->lock); + INIT_HLIST_HEAD(&blkcg->blkg_list); + + INIT_LIST_HEAD(&blkcg->policy_list); + blkcg->blk_ub = &ub0; + return &blkcg->css; +} + +/* + * We cannot support shared io contexts, as we have no mean to support + * two tasks with the same ioc in two different groups without major rework + * of the main cic data structures. For now we allow a task to change + * its cgroup only if it's the only owner of its ioc. + */ +static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +{ + struct io_context *ioc; + int ret = 0; + + /* task_lock() is needed to avoid races with exit_io_context() */ + task_lock(tsk); + ioc = tsk->io_context; + if (ioc && atomic_read(&ioc->nr_tasks) > 1) + ret = -EINVAL; + task_unlock(tsk); + + return ret; +} + +static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +{ + struct io_context *ioc; + + task_lock(tsk); + ioc = tsk->io_context; + if (ioc) + ioc->cgroup_changed = 1; + task_unlock(tsk); +} + +struct cgroup_subsys blkio_subsys = { + .name = "blkio", + .create = blkiocg_create, + .can_attach_task = blkiocg_can_attach_task, + .attach_task = blkiocg_attach_task, + .destroy = blkiocg_destroy, + .populate = blkiocg_populate, + .subsys_id = blkio_subsys_id, + .use_id = 1, +}; + +void blkio_policy_register(struct blkio_policy_type *blkiop) +{ + spin_lock(&blkio_list_lock); + list_add_tail(&blkiop->list, &blkio_list); + spin_unlock(&blkio_list_lock); +} +EXPORT_SYMBOL_GPL(blkio_policy_register); + +void blkio_policy_unregister(struct blkio_policy_type *blkiop) +{ + spin_lock(&blkio_list_lock); + list_del_init(&blkiop->list); + spin_unlock(&blkio_list_lock); +} +EXPORT_SYMBOL_GPL(blkio_policy_unregister); diff -uprN linux-2.6.32/block/blk-cgroup.h linux-2.6.32.ovz/block/blk-cgroup.h --- linux-2.6.32/block/blk-cgroup.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/block/blk-cgroup.h 2015-06-23 04:16:16.000000000 -0700 @@ -0,0 +1,370 @@ +#ifndef _BLK_CGROUP_H +#define _BLK_CGROUP_H +/* + * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + */ + +#include +#include + +enum blkio_policy_id { + BLKIO_POLICY_PROP = 0, /* Proportional Bandwidth division */ + BLKIO_POLICY_THROTL, /* Throttling */ +}; + +/* Max limits for throttle policy */ +#define THROTL_IOPS_MAX UINT_MAX + +#ifdef CONFIG_BLK_CGROUP + +enum stat_type { + /* Total time spent (in ns) between request dispatch to the driver and + * request completion for IOs doen by this cgroup. This may not be + * accurate when NCQ is turned on. */ + BLKIO_STAT_SERVICE_TIME = 0, + /* Total time spent waiting in scheduler queue in ns */ + BLKIO_STAT_WAIT_TIME, + /* Maximum time spent waiting in scheduler queue in ns */ + BLKIO_STAT_WAIT_MAX, + /* Number of IOs merged */ + BLKIO_STAT_MERGED, + /* Number of IOs queued up */ + BLKIO_STAT_QUEUED, + /* All the single valued stats go below this */ + BLKIO_STAT_TIME, +#ifdef CONFIG_DEBUG_BLK_CGROUP + BLKIO_STAT_AVG_QUEUE_SIZE, + BLKIO_STAT_IDLE_TIME, + BLKIO_STAT_EMPTY_TIME, + BLKIO_STAT_GROUP_WAIT_TIME, + BLKIO_STAT_DEQUEUE +#endif +}; + +/* Per cpu stats */ +enum stat_type_cpu { + BLKIO_STAT_CPU_SECTORS, + /* Total bytes transferred */ + BLKIO_STAT_CPU_SERVICE_BYTES, + /* Total IOs serviced, post merge */ + BLKIO_STAT_CPU_SERVICED, + BLKIO_STAT_CPU_NR +}; + +enum stat_sub_type { + BLKIO_STAT_READ = 0, + BLKIO_STAT_WRITE, + BLKIO_STAT_SYNC, + BLKIO_STAT_ASYNC, + BLKIO_STAT_TOTAL +}; + +/* blkg state flags */ +enum blkg_state_flags { + BLKG_waiting = 0, + BLKG_idling, + BLKG_empty, +}; + +/* cgroup files owned by proportional weight policy */ +enum blkcg_file_name_prop { + BLKIO_PROP_weight = 1, + BLKIO_PROP_weight_device, + BLKIO_PROP_io_service_bytes, + BLKIO_PROP_io_serviced, + BLKIO_PROP_time, + BLKIO_PROP_sectors, + BLKIO_PROP_io_service_time, + BLKIO_PROP_io_wait_time, + BLKIO_PROP_io_wait_max, + BLKIO_PROP_io_merged, + BLKIO_PROP_io_queued, + BLKIO_PROP_avg_queue_size, + BLKIO_PROP_group_wait_time, + BLKIO_PROP_idle_time, + BLKIO_PROP_empty_time, + BLKIO_PROP_dequeue, +}; + +/* cgroup files owned by throttle policy */ +enum blkcg_file_name_throtl { + BLKIO_THROTL_read_bps_device, + BLKIO_THROTL_write_bps_device, + BLKIO_THROTL_read_iops_device, + BLKIO_THROTL_write_iops_device, + BLKIO_THROTL_io_service_bytes, + BLKIO_THROTL_io_serviced, +}; + +struct blkio_cgroup { + struct cgroup_subsys_state css; + unsigned int weight; + spinlock_t lock; + struct hlist_head blkg_list; + struct list_head policy_list; /* list of blkio_policy_node */ + struct user_beancounter *blk_ub; +}; + +struct blkio_group_stats { + /* total disk time and nr sectors dispatched by this group */ + uint64_t time; + uint64_t stat_arr[BLKIO_STAT_QUEUED + 1][BLKIO_STAT_TOTAL]; +#ifdef CONFIG_DEBUG_BLK_CGROUP + /* Sum of number of IOs queued across all samples */ + uint64_t avg_queue_size_sum; + /* Count of samples taken for average */ + uint64_t avg_queue_size_samples; + /* How many times this group has been removed from service tree */ + unsigned long dequeue; + + /* Total time spent waiting for it to be assigned a timeslice. */ + uint64_t group_wait_time; + uint64_t start_group_wait_time; + + /* Time spent idling for this blkio_group */ + uint64_t idle_time; + uint64_t start_idle_time; + /* + * Total time when we have requests queued and do not contain the + * current active queue. + */ + uint64_t empty_time; + uint64_t start_empty_time; + uint16_t flags; +#endif +}; + +/* Per cpu blkio group stats */ +struct blkio_group_stats_cpu { + uint64_t sectors; + uint64_t stat_arr_cpu[BLKIO_STAT_CPU_NR][BLKIO_STAT_TOTAL]; + struct u64_stats_sync syncp; +}; + +struct blkio_group { + /* An rcu protected unique identifier for the group */ + void *key; + struct hlist_node blkcg_node; + unsigned short blkcg_id; + /* Store cgroup path */ + char path[128]; + /* The device MKDEV(major, minor), this group has been created for */ + dev_t dev; + char *dev_name; + struct user_beancounter *blk_ub; + + /* policy which owns this blk group */ + enum blkio_policy_id plid; + + /* Need to serialize the stats in the case of reset/update */ + spinlock_t stats_lock; + struct blkio_group_stats stats; + /* Per cpu stats pointer */ + struct blkio_group_stats_cpu __percpu *stats_cpu; + + struct list_head stats_alloc_list; +}; + +struct blkio_policy_node { + struct list_head node; + dev_t dev; + /* This node belongs to max bw policy or porportional weight policy */ + enum blkio_policy_id plid; + /* cgroup file to which this rule belongs to */ + int fileid; + + union { + unsigned int weight; + /* + * Rate read/write in terms of byptes per second + * Whether this rate represents read or write is determined + * by file type "fileid". + */ + u64 bps; + unsigned int iops; + } val; +}; + +extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg, + dev_t dev); +extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, + dev_t dev); +extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg, + dev_t dev); +extern unsigned int blkcg_get_read_iops(struct blkio_cgroup *blkcg, + dev_t dev); +extern unsigned int blkcg_get_write_iops(struct blkio_cgroup *blkcg, + dev_t dev); + +typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg); + +typedef void (blkio_update_group_weight_fn) (void *key, + struct blkio_group *blkg, unsigned int weight); +typedef void (blkio_update_group_read_bps_fn) (void * key, + struct blkio_group *blkg, u64 read_bps); +typedef void (blkio_update_group_write_bps_fn) (void *key, + struct blkio_group *blkg, u64 write_bps); +typedef void (blkio_update_group_read_iops_fn) (void *key, + struct blkio_group *blkg, unsigned int read_iops); +typedef void (blkio_update_group_write_iops_fn) (void *key, + struct blkio_group *blkg, unsigned int write_iops); +uint64_t blkio_read_stat_cpu(struct blkio_group *blkg, + enum stat_type_cpu type, enum stat_sub_type sub_type); + +struct blkio_policy_ops { + blkio_unlink_group_fn *blkio_unlink_group_fn; + blkio_update_group_weight_fn *blkio_update_group_weight_fn; + blkio_update_group_read_bps_fn *blkio_update_group_read_bps_fn; + blkio_update_group_write_bps_fn *blkio_update_group_write_bps_fn; + blkio_update_group_read_iops_fn *blkio_update_group_read_iops_fn; + blkio_update_group_write_iops_fn *blkio_update_group_write_iops_fn; +}; + +struct blkio_policy_type { + struct list_head list; + struct blkio_policy_ops ops; + enum blkio_policy_id plid; +}; + +/* Blkio controller policy registration */ +extern void blkio_policy_register(struct blkio_policy_type *); +extern void blkio_policy_unregister(struct blkio_policy_type *); + +static inline char *blkg_path(struct blkio_group *blkg) +{ + return blkg->path; +} + +#else + +struct blkio_group { +}; + +struct blkio_policy_type { +}; + +static inline void blkio_policy_register(struct blkio_policy_type *blkiop) { } +static inline void blkio_policy_unregister(struct blkio_policy_type *blkiop) { } + +static inline char *blkg_path(struct blkio_group *blkg) { return NULL; } + +#endif + +#define BLKIO_WEIGHT_MIN 100 +#define BLKIO_WEIGHT_MAX 1000 +#define BLKIO_WEIGHT_DEFAULT 500 + +#ifdef CONFIG_DEBUG_BLK_CGROUP +void blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg); +void blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue); +void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg); +void blkiocg_update_idle_time_stats(struct blkio_group *blkg); +void blkiocg_set_start_empty_time(struct blkio_group *blkg); + +#define BLKG_FLAG_FNS(name) \ +static inline void blkio_mark_blkg_##name( \ + struct blkio_group_stats *stats) \ +{ \ + stats->flags |= (1 << BLKG_##name); \ +} \ +static inline void blkio_clear_blkg_##name( \ + struct blkio_group_stats *stats) \ +{ \ + stats->flags &= ~(1 << BLKG_##name); \ +} \ +static inline int blkio_blkg_##name(struct blkio_group_stats *stats) \ +{ \ + return (stats->flags & (1 << BLKG_##name)) != 0; \ +} \ + +BLKG_FLAG_FNS(waiting) +BLKG_FLAG_FNS(idling) +BLKG_FLAG_FNS(empty) +#undef BLKG_FLAG_FNS +#else +static inline void blkiocg_update_avg_queue_size_stats( + struct blkio_group *blkg) {} +static inline void blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) {} +static inline void blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +{} +static inline void blkiocg_update_idle_time_stats(struct blkio_group *blkg) {} +static inline void blkiocg_set_start_empty_time(struct blkio_group *blkg) {} +#endif + +#ifdef CONFIG_BLK_CGROUP +extern struct blkio_cgroup blkio_root_cgroup; +extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); +extern struct blkio_cgroup *task_blkio_cgroup(struct task_struct *tsk); +extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid); +extern int blkio_alloc_blkg_stats(struct blkio_group *blkg); +extern void blkio_free_blkg_stats(struct blkio_group *blkg); +extern int blkiocg_del_blkio_group(struct blkio_group *blkg); +extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg, + void *key); +void blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time); +void blkiocg_update_dispatch_stats(struct blkio_group *blkg, uint64_t bytes, + bool direction, bool sync); +void blkiocg_update_completion_stats(struct blkio_group *blkg, + uint64_t start_time, uint64_t io_start_time, bool direction, bool sync); +void blkiocg_update_io_merged_stats(struct blkio_group *blkg, bool direction, + bool sync); +void blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync); +void blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync); +int blkio_cgroup_set_weight(struct cgroup *cgroup, u64 weight); +void blkio_cgroup_set_ub(struct cgroup *cgroup, struct user_beancounter *ub); +#else +struct cgroup; +static inline struct blkio_cgroup * +cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return NULL; } +static inline struct blkio_cgroup * +task_blkio_cgroup(struct task_struct *tsk) { return NULL; } + +static inline void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev, + enum blkio_policy_id plid) {} + +static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; } +static inline void blkio_free_blkg_stats(struct blkio_group *blkg) { } + +static inline int +blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; } + +static inline struct blkio_group * +blkiocg_lookup_group(struct blkio_cgroup *blkcg, void *key) { return NULL; } +static inline void blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time) {} +static inline void blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) {} +static inline void blkiocg_update_completion_stats(struct blkio_group *blkg, + uint64_t start_time, uint64_t io_start_time, bool direction, + bool sync) {} +static inline void blkiocg_update_io_merged_stats(struct blkio_group *blkg, + bool direction, bool sync) {} +static inline void blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync) {} +static inline void blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) {} +static inline int blkio_cgroup_set_weight(struct cgroup *cgroup, u64 weight) +{ + return -EINVAL; +} +static inline void blkio_cgroup_set_ub(struct cgroup *cgroup, + struct user_beancounter *ub) { } +#endif +#endif /* _BLK_CGROUP_H */ diff -uprN linux-2.6.32/block/blk-core.c linux-2.6.32.ovz/block/blk-core.c --- linux-2.6.32/block/blk-core.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/blk-core.c 2015-06-23 04:16:16.000000000 -0700 @@ -27,6 +27,8 @@ #include #include #include +#include +#include #define CREATE_TRACE_POINTS #include @@ -37,8 +39,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); -static int __make_request(struct request_queue *q, struct bio *bio); - /* * For the allocated request tables */ @@ -127,46 +127,36 @@ void blk_rq_init(struct request_queue *q rq->tag = -1; rq->ref_count = 1; rq->start_time = jiffies; + set_start_time_ns(rq); } EXPORT_SYMBOL(blk_rq_init); static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, int error) { - struct request_queue *q = rq->q; + if (error) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + error = -EIO; - if (&q->bar_rq != rq) { - if (error) - clear_bit(BIO_UPTODATE, &bio->bi_flags); - else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) - error = -EIO; - - if (unlikely(nbytes > bio->bi_size)) { - printk(KERN_ERR "%s: want %u bytes done, %u left\n", - __func__, nbytes, bio->bi_size); - nbytes = bio->bi_size; - } - - if (unlikely(rq->cmd_flags & REQ_QUIET)) - set_bit(BIO_QUIET, &bio->bi_flags); + if (unlikely(nbytes > bio->bi_size)) { + printk(KERN_ERR "%s: want %u bytes done, %u left\n", + __func__, nbytes, bio->bi_size); + nbytes = bio->bi_size; + } - bio->bi_size -= nbytes; - bio->bi_sector += (nbytes >> 9); + if (unlikely(rq->cmd_flags & REQ_QUIET)) + set_bit(BIO_QUIET, &bio->bi_flags); - if (bio_integrity(bio)) - bio_integrity_advance(bio, nbytes); + bio->bi_size -= nbytes; + bio->bi_sector += (nbytes >> 9); - if (bio->bi_size == 0) - bio_endio(bio, error); - } else { + if (bio_integrity(bio)) + bio_integrity_advance(bio, nbytes); - /* - * Okay, this is the barrier request in progress, just - * record the error; - */ - if (error && !q->orderr) - q->orderr = error; - } + /* don't actually finish bio if it's part of flush sequence */ + if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) + bio_endio(bio, error); } void blk_dump_rq_flags(struct request *rq, char *msg) @@ -183,7 +173,7 @@ void blk_dump_rq_flags(struct request *r printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq)); - if (blk_pc_request(rq)) { + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { printk(KERN_INFO " cdb: "); for (bit = 0; bit < BLK_MAX_CDB; bit++) printk("%02x ", rq->cmd[bit]); @@ -192,6 +182,16 @@ void blk_dump_rq_flags(struct request *r } EXPORT_SYMBOL(blk_dump_rq_flags); +static void blk_delay_work(struct work_struct *work) +{ + struct request_queue *q; + + q = container_of(work, struct request_queue, delay_work.work); + spin_lock_irq(q->queue_lock); + __blk_run_queue(q); + spin_unlock_irq(q->queue_lock); +} + /* * "plug" the device if there are no outstanding requests: this will * force the transfer to start only after we have put all the requests @@ -252,6 +252,67 @@ int blk_remove_plug(struct request_queue } EXPORT_SYMBOL(blk_remove_plug); +/** + * blk_drain_queue - drain requests from request_queue + * @q: queue to drain + * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV + * + * Drain requests from @q. If @drain_all is set, all requests are drained. + * If not, only ELVPRIV requests are drained. The caller is responsible + * for ensuring that no new requests which need to be drained are queued. + */ +void blk_drain_queue(struct request_queue *q, bool drain_all) +{ + int i; + + while (true) { + bool drain = false; + + spin_lock_irq(q->queue_lock); + + elv_drain_elevator(q); + if (drain_all) + blk_throtl_drain(q); + + __blk_run_queue(q); + + drain |= q->rq.elvpriv; + + /* + * Unfortunately, requests are queued at and tracked from + * multiple places and there's no single counter which can + * be drained. Check all the queues and counters. + */ + if (drain_all) { + drain |= !list_empty(&q->queue_head); + for (i = 0; i < 2; i++) { + drain |= q->rq.count[i]; + drain |= q->in_flight[i]; + drain |= !list_empty(&q->flush_queue[i]); + } + } + + spin_unlock_irq(q->queue_lock); + + if (!drain) + break; + msleep(10); + } + + /* + * With queue marked dead, any woken up waiter will fail the + * allocation path, so the wakeup chaining is lost and we're + * left with hung waiters. We need to wake up those waiters. + */ + if (q->request_fn) { + spin_lock_irq(q->queue_lock); + for (i = 0; i < ARRAY_SIZE(q->rq.wait); i++) + wake_up_all(&q->rq.wait[i]); + spin_unlock_irq(q->queue_lock); + } + +} + /* * remove the plug and let it rip.. */ @@ -358,6 +419,7 @@ EXPORT_SYMBOL(blk_start_queue); void blk_stop_queue(struct request_queue *q) { blk_remove_plug(q); + cancel_delayed_work(&q->delay_work); queue_flag_set(QUEUE_FLAG_STOPPED, q); } EXPORT_SYMBOL(blk_stop_queue); @@ -375,12 +437,17 @@ EXPORT_SYMBOL(blk_stop_queue); * that its ->make_request_fn will not re-add plugging prior to calling * this function. * + * This function does not cancel any asynchronous activity arising + * out of elevator or throttling code. That would require elevaotor_exit() + * and blk_throtl_exit() to be called with queue lock initialized. + * */ void blk_sync_queue(struct request_queue *q) { del_timer_sync(&q->unplug_timer); del_timer_sync(&q->timeout); cancel_work_sync(&q->unplug_work); + cancel_delayed_work_sync(&q->delay_work); } EXPORT_SYMBOL(blk_sync_queue); @@ -403,21 +470,28 @@ void __blk_run_queue(struct request_queu if (elv_queue_empty(q)) return; - /* - * Only recurse once to avoid overrunning the stack, let the unplug - * handling reinvoke the handler shortly if we already got there. - */ - if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { - q->request_fn(q); - queue_flag_clear(QUEUE_FLAG_REENTER, q); - } else { - queue_flag_set(QUEUE_FLAG_PLUGGED, q); - kblockd_schedule_work(q, &q->unplug_work); - } + q->request_fn(q); } EXPORT_SYMBOL(__blk_run_queue); /** + * blk_run_queue_async - run a single device queue in workqueue context + * @q: The queue to run + * + * Description: + * Tells kblockd to perform the equivalent of @blk_run_queue on behalf + * of us. + */ +void blk_run_queue_async(struct request_queue *q) +{ + if (likely(!blk_queue_stopped(q))) { + __cancel_delayed_work(&q->delay_work); + queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); + } +} +EXPORT_SYMBOL(blk_run_queue_async); + +/** * blk_run_queue - run a single device queue * @q: The queue to run * @@ -439,24 +513,45 @@ void blk_put_queue(struct request_queue { kobject_put(&q->kobj); } +EXPORT_SYMBOL(blk_put_queue); +/** + * blk_cleanup_queue - shutdown a request queue + * @q: request queue to shutdown + * + * Mark @q DEAD, drain all pending requests, destroy and put it. All + * future requests will be failed immediately with -ENODEV. + */ void blk_cleanup_queue(struct request_queue *q) { - /* - * We know we have process context here, so we can be a little - * cautious and ensure that pending block actions on this device - * are done before moving on. Going into this function, we should - * not have processes doing IO to this device. - */ - blk_sync_queue(q); + spinlock_t *lock = q->queue_lock; + /* mark @q DEAD, no new request or merges will be allowed afterwards */ mutex_lock(&q->sysfs_lock); queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); + + spin_lock_irq(lock); + queue_flag_set(QUEUE_FLAG_NOMERGES, q); + queue_flag_set(QUEUE_FLAG_DEAD, q); + + if (q->queue_lock != &q->__queue_lock) + q->queue_lock = &q->__queue_lock; + + spin_unlock_irq(lock); mutex_unlock(&q->sysfs_lock); + /* + * Drain all requests queued before DEAD marking. The caller might + * be trying to tear down @q before its elevator is initialized, in + * which case we don't want to call into draining. + */ if (q->elevator) - elevator_exit(q->elevator); + blk_drain_queue(q, true); + + /* @q won't process any more reuqest, flush async actions */ + blk_sync_queue(q); + /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); @@ -465,6 +560,9 @@ static int blk_init_free_list(struct req { struct request_list *rl = &q->rq; + if (unlikely(rl->rq_pool)) + return 0; + rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; rl->elvpriv = 0; @@ -510,16 +608,37 @@ struct request_queue *blk_alloc_queue_no return NULL; } + if (blk_throtl_init(q)) { + kmem_cache_free(blk_requestq_cachep, q); + return NULL; + } + init_timer(&q->unplug_timer); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->timeout_list); + INIT_LIST_HEAD(&q->flush_queue[0]); + INIT_LIST_HEAD(&q->flush_queue[1]); + INIT_LIST_HEAD(&q->flush_data_in_flight); INIT_WORK(&q->unplug_work, blk_unplug_work); + INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); kobject_init(&q->kobj, &blk_queue_ktype); mutex_init(&q->sysfs_lock); spin_lock_init(&q->__queue_lock); + /* Initialize DEPRECATED barrier members */ + q->ordered = q->next_ordered = q->ordseq = 0; + q->orderr = q->ordcolor = 0; + q->orig_bar_rq = NULL; + + atomic_set(&q->flush_tag, 0); + /* + * By default initialize queue_lock to internal lock and driver can + * override it later if need be. + */ + q->queue_lock = &q->__queue_lock; + return q; } EXPORT_SYMBOL(blk_alloc_queue_node); @@ -566,27 +685,53 @@ EXPORT_SYMBOL(blk_init_queue); struct request_queue * blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) { - struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); + struct request_queue *uninit_q, *q; + uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id); + if (!uninit_q) + return NULL; + + q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id); + if (!q) + blk_cleanup_queue(uninit_q); + + return q; +} +EXPORT_SYMBOL(blk_init_queue_node); + +struct request_queue * +blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, + spinlock_t *lock) +{ + return blk_init_allocated_queue_node(q, rfn, lock, -1); +} +EXPORT_SYMBOL(blk_init_allocated_queue); + +struct request_queue * +blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, + spinlock_t *lock, int node_id) +{ if (!q) return NULL; q->node = node_id; - if (blk_init_free_list(q)) { - kmem_cache_free(blk_requestq_cachep, q); + if (blk_init_free_list(q)) return NULL; - } q->request_fn = rfn; q->prep_rq_fn = NULL; + q->unprep_rq_fn = NULL; q->unplug_fn = generic_unplug_device; q->queue_flags = QUEUE_FLAG_DEFAULT; - q->queue_lock = lock; + + /* Override internal queue lock with supplied lock pointer */ + if (lock) + q->queue_lock = lock; /* * This also sets hw/phys segments, boundary and size */ - blk_queue_make_request(q, __make_request); + blk_queue_make_request(q, blk_queue_bio); q->sg_reserved_size = INT_MAX; @@ -598,20 +743,20 @@ blk_init_queue_node(request_fn_proc *rfn return q; } - blk_put_queue(q); return NULL; } -EXPORT_SYMBOL(blk_init_queue_node); +EXPORT_SYMBOL(blk_init_allocated_queue_node); -int blk_get_queue(struct request_queue *q) +bool blk_get_request_queue(struct request_queue *q) { - if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { - kobject_get(&q->kobj); - return 0; + if (likely(!blk_queue_dead(q))) { + __blk_get_queue(q); + return true; } - return 1; + return false; } +EXPORT_SYMBOL(blk_get_request_queue); static inline void blk_free_request(struct request_queue *q, struct request *rq) { @@ -621,7 +766,7 @@ static inline void blk_free_request(stru } static struct request * -blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask) +blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask) { struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); @@ -632,12 +777,10 @@ blk_alloc_request(struct request_queue * rq->cmd_flags = flags | REQ_ALLOCED; - if (priv) { - if (unlikely(elv_set_request(q, rq, gfp_mask))) { - mempool_free(rq, q->rq.rq_pool); - return NULL; - } - rq->cmd_flags |= REQ_ELVPRIV; + if ((flags & REQ_ELVPRIV) && + unlikely(elv_set_request(q, rq, gfp_mask))) { + mempool_free(rq, q->rq.rq_pool); + return NULL; } return rq; @@ -696,12 +839,13 @@ static void __freed_request(struct reque * A request has just been released. Account for it, update the full and * congestion status, wake up any waiters. Called under q->queue_lock. */ -static void freed_request(struct request_queue *q, int sync, int priv) +static void freed_request(struct request_queue *q, unsigned int flags) { struct request_list *rl = &q->rq; + int sync = rw_is_sync(flags); rl->count[sync]--; - if (priv) + if (flags & REQ_ELVPRIV) rl->elvpriv--; __freed_request(q, sync); @@ -711,9 +855,37 @@ static void freed_request(struct request } /* - * Get a free request, queue_lock must be held. - * Returns NULL on failure, with queue_lock held. - * Returns !NULL on success, with queue_lock *not held*. + * Determine if elevator data should be initialized when allocating the + * request associated with @bio. + */ +static bool blk_rq_should_init_elevator(struct bio *bio) +{ + if (!bio) + return true; + + /* + * Flush requests do not use the elevator so skip initialization. + * This allows a request to share the flush and elevator data. + */ + if (bio->bi_rw & (BIO_FLUSH | BIO_FUA)) + return false; + + return true; +} + +/** + * get_request - get a free request + * @q: request_queue to allocate request from + * @rw_flags: RW and SYNC flags + * @bio: bio to allocate request for (can be %NULL) + * @gfp_mask: allocation mask + * + * Get a free request from @q. This function may fail under memory + * pressure or if @q is dead. + * + * Must be callled with @q->queue_lock held and, + * Returns %NULL on failure, with @q->queue_lock held. + * Returns !%NULL on success, with @q->queue_lock *not held*. */ static struct request *get_request(struct request_queue *q, int rw_flags, struct bio *bio, gfp_t gfp_mask) @@ -722,7 +894,10 @@ static struct request *get_request(struc struct request_list *rl = &q->rq; struct io_context *ioc = NULL; const bool is_sync = rw_is_sync(rw_flags) != 0; - int may_queue, priv; + int may_queue; + + if (unlikely(blk_queue_dead(q))) + return NULL; may_queue = elv_may_queue(q, rw_flags); if (may_queue == ELV_MQUEUE_NO) @@ -740,16 +915,15 @@ static struct request *get_request(struc if (!blk_queue_full(q, is_sync)) { ioc_set_batching(q, ioc); blk_set_queue_full(q, is_sync); - } else { - if (may_queue != ELV_MQUEUE_MUST - && !ioc_batching(q, ioc)) { - /* - * The queue is full and the allocating - * process is not a "batcher", and not - * exempted by the IO scheduler - */ - goto out; - } + } else if (may_queue == ELV_MQUEUE_MUST) { + ioc_set_batching(q, ioc); + } else if (!ioc_batching(q, ioc)) { + /* + * The queue is full and the allocating + * process is not a "batcher", and not + * exempted by the IO scheduler + */ + goto out; } } blk_set_queue_congested(q, is_sync); @@ -766,15 +940,17 @@ static struct request *get_request(struc rl->count[is_sync]++; rl->starved[is_sync] = 0; - priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); - if (priv) + if (blk_rq_should_init_elevator(bio) && + !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) { + rw_flags |= REQ_ELVPRIV; rl->elvpriv++; + } if (blk_queue_io_stat(q)) rw_flags |= REQ_IO_STAT; spin_unlock_irq(q->queue_lock); - rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); + rq = blk_alloc_request(q, rw_flags, gfp_mask); if (unlikely(!rq)) { /* * Allocation failed presumably due to memory. Undo anything @@ -784,7 +960,7 @@ static struct request *get_request(struc * wait queue, but this is pretty rare. */ spin_lock_irq(q->queue_lock); - freed_request(q, is_sync, priv); + freed_request(q, rw_flags); /* * in the very unlikely event that allocation failed and no @@ -814,11 +990,18 @@ out: return rq; } -/* - * No available requests for this queue, unplug the device and wait for some - * requests to become available. - * - * Called with q->queue_lock held, and returns with it unlocked. +/** + * get_request_wait - get a free request with retry + * @q: request_queue to allocate request from + * @rw_flags: RW and SYNC flags + * @bio: bio to allocate request for (can be %NULL) + * + * Get a free request from @q. This function keeps retrying under memory + * pressure and fails iff @q is dead. + * + * Must be called with @q->queue_lock held and, + * Returns %NULL on failure, with @q->queue_lock held. + * Returns !%NULL on success, with @q->queue_lock *not held*. */ static struct request *get_request_wait(struct request_queue *q, int rw_flags, struct bio *bio) @@ -832,6 +1015,9 @@ static struct request *get_request_wait( struct io_context *ioc; struct request_list *rl = &q->rq; + if (unlikely(blk_queue_dead(q))) + return NULL; + prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, TASK_UNINTERRUPTIBLE); @@ -866,13 +1052,12 @@ struct request *blk_get_request(struct r BUG_ON(rw != READ && rw != WRITE); spin_lock_irq(q->queue_lock); - if (gfp_mask & __GFP_WAIT) { + if (gfp_mask & __GFP_WAIT) rq = get_request_wait(q, rw, NULL); - } else { + else rq = get_request(q, rw, NULL, gfp_mask); - if (!rq) - spin_unlock_irq(q->queue_lock); - } + if (!rq) + spin_unlock_irq(q->queue_lock); /* q->queue_lock is unlocked at this point */ return rq; @@ -915,8 +1100,11 @@ struct request *blk_make_request(struct { struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask); - if (unlikely(!rq)) + if (unlikely(!rq)) { + if (gfp_mask & __GFP_WAIT) + return ERR_PTR(-ENODEV); return ERR_PTR(-ENOMEM); + } for_each_bio(bio) { struct bio *bounce_bio = bio; @@ -1008,22 +1196,6 @@ void blk_insert_request(struct request_q } EXPORT_SYMBOL(blk_insert_request); -/* - * add-request adds a request to the linked list. - * queue lock is held and interrupts disabled, as we muck with the - * request queue list. - */ -static inline void add_request(struct request_queue *q, struct request *req) -{ - drive_stat_acct(req, 1); - - /* - * elevator indicated where it wants this request to be - * inserted at elevator_merge time - */ - __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); -} - static void part_round_stats_single(int cpu, struct hd_struct *part, unsigned long now) { @@ -1084,14 +1256,13 @@ void __blk_put_request(struct request_qu * it didn't come out of our reserved rq pools */ if (req->cmd_flags & REQ_ALLOCED) { - int is_sync = rq_is_sync(req) != 0; - int priv = req->cmd_flags & REQ_ELVPRIV; + unsigned int flags = req->cmd_flags; BUG_ON(!list_empty(&req->queuelist)); BUG_ON(!hlist_unhashed(&req->hash)); blk_free_request(q, req); - freed_request(q, is_sync, priv); + freed_request(q, flags); } } EXPORT_SYMBOL_GPL(__blk_put_request); @@ -1107,6 +1278,38 @@ void blk_put_request(struct request *req } EXPORT_SYMBOL(blk_put_request); +/** + * blk_add_request_payload - add a payload to a request + * @rq: request to update + * @page: page backing the payload + * @len: length of the payload. + * + * This allows to later add a payload to an already submitted request by + * a block driver. The driver needs to take care of freeing the payload + * itself. + * + * Note that this is a quite horrible hack and nothing but handling of + * discard requests should ever use it. + */ +void blk_add_request_payload(struct request *rq, struct page *page, + unsigned int len) +{ + struct bio *bio = rq->bio; + + bio->bi_io_vec->bv_page = page; + bio->bi_io_vec->bv_offset = 0; + bio->bi_io_vec->bv_len = len; + + bio->bi_size = len; + bio->bi_vcnt = 1; + bio->bi_phys_segments = 1; + + rq->__data_len = rq->resid_len = len; + rq->nr_phys_segments = 1; + rq->buffer = bio_data(bio); +} +EXPORT_SYMBOL_GPL(blk_add_request_payload); + void init_request_from_bio(struct request *req, struct bio *bio) { req->cpu = bio->bi_comp_cpu; @@ -1121,13 +1324,10 @@ void init_request_from_bio(struct reques else req->cmd_flags |= bio->bi_rw & REQ_FAILFAST_MASK; - if (unlikely(bio_rw_flagged(bio, BIO_RW_DISCARD))) { + if (bio_rw_flagged(bio, BIO_RW_DISCARD)) req->cmd_flags |= REQ_DISCARD; - if (bio_rw_flagged(bio, BIO_RW_BARRIER)) - req->cmd_flags |= REQ_SOFTBARRIER; - } else if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) + if (bio_rw_flagged(bio, BIO_RW_BARRIER)) req->cmd_flags |= REQ_HARDBARRIER; - if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) req->cmd_flags |= REQ_RW_SYNC; if (bio_rw_flagged(bio, BIO_RW_META)) @@ -1135,6 +1335,11 @@ void init_request_from_bio(struct reques if (bio_rw_flagged(bio, BIO_RW_NOIDLE)) req->cmd_flags |= REQ_NOIDLE; + if (bio_rw_flagged(bio, BIO_RW_FLUSH)) + req->cmd_flags |= REQ_FLUSH; + if (bio_rw_flagged(bio, BIO_RW_FUA)) + req->cmd_flags |= REQ_FUA; + req->errors = 0; req->__sector = bio->bi_sector; req->ioprio = bio_prio(bio); @@ -1147,10 +1352,32 @@ void init_request_from_bio(struct reques */ static inline bool queue_should_plug(struct request_queue *q) { - return !(blk_queue_nonrot(q) && blk_queue_queuing(q)); + return !(blk_queue_nonrot(q) && blk_queue_tagged(q)); +} + +static void blk_account_io_front_merge(struct request *req, sector_t newsector) +{ + if (blk_do_io_stat(req)) { + struct hd_struct *oldpart, *newpart; + int cpu; + + cpu = part_stat_lock(); + if (!is_same_part(req->rq_disk, req->__sector, newsector, + &oldpart, &newpart)) { + if (oldpart) { + part_round_stats(cpu, oldpart); + oldpart->in_flight[rq_data_dir(req)]--; + } + if (newpart) { + part_round_stats(cpu, newpart); + newpart->in_flight[rq_data_dir(req)]++; + } + } + part_stat_unlock(); + } } -static int __make_request(struct request_queue *q, struct bio *bio) +int blk_queue_bio(struct request_queue *q, struct bio *bio) { struct request *req; int el_ret; @@ -1159,13 +1386,16 @@ static int __make_request(struct request const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO); const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG); const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; + int where = ELEVATOR_INSERT_SORT; int rw_flags; - if (bio_rw_flagged(bio, BIO_RW_BARRIER) && - (q->next_ordered == QUEUE_ORDERED_NONE)) { + /* BIO_RW_BARRIER is deprecated */ + if (WARN_ONCE(bio_rw_flagged(bio, BIO_RW_BARRIER), + "block: BARRIER is deprecated, use FLUSH/FUA instead\n")) { bio_endio(bio, -EOPNOTSUPP); return 0; } + /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even @@ -1173,9 +1403,19 @@ static int __make_request(struct request */ blk_queue_bounce(q, &bio); + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + bio_endio(bio, -EIO); + return 0; + } + spin_lock_irq(q->queue_lock); - if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q)) + if (bio->bi_rw & (BIO_FLUSH | BIO_FUA)) { + where = ELEVATOR_INSERT_FLUSH; + goto get_rq; + } + + if (elv_queue_empty(q)) goto get_rq; el_ret = elv_merge(q, &req, bio); @@ -1198,6 +1438,7 @@ static int __make_request(struct request if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; drive_stat_acct(req, 0); + elv_bio_merged(q, req, bio); if (!attempt_back_merge(q, req)) elv_merged_request(q, req, el_ret); goto out; @@ -1225,12 +1466,18 @@ static int __make_request(struct request * not touch req->buffer either... */ req->buffer = bio_data(bio); + /* + * The merge may happen accross partitions + * We must update in_flight value accordingly + */ + blk_account_io_front_merge(req, bio->bi_sector); req->__sector = bio->bi_sector; req->__data_len += bytes; req->ioprio = ioprio_best(req->ioprio, prio); if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; drive_stat_acct(req, 0); + elv_bio_merged(q, req, bio); if (!attempt_front_merge(q, req)) elv_merged_request(q, req, el_ret); goto out; @@ -1248,13 +1495,17 @@ get_rq: */ rw_flags = bio_data_dir(bio); if (sync) - rw_flags |= REQ_RW_SYNC; + rw_flags |= REQ_SYNC; /* * Grab a free request. This is might sleep but can not fail. * Returns with the queue unlocked. */ req = get_request_wait(q, rw_flags, bio); + if (unlikely(!req)) { + bio_endio(bio, -ENODEV); /* @q is dead */ + goto out_unlock; + } /* * After dropping the lock and possibly sleeping here, our request @@ -1266,17 +1517,23 @@ get_rq: spin_lock_irq(q->queue_lock); if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || - bio_flagged(bio, BIO_CPU_AFFINE)) - req->cpu = blk_cpu_to_group(smp_processor_id()); + bio_flagged(bio, BIO_CPU_AFFINE)) + req->cpu = raw_smp_processor_id(); + if (queue_should_plug(q) && elv_queue_empty(q)) blk_plug_device(q); - add_request(q, req); + + /* insert the request into the elevator */ + drive_stat_acct(req, 1); + __elv_add_request(q, req, where, 0); out: if (unplug || !queue_should_plug(q)) __generic_unplug_device(q); +out_unlock: spin_unlock_irq(q->queue_lock); return 0; } +EXPORT_SYMBOL_GPL(blk_queue_bio); /* for device mapper only */ /* * If bio->bi_dev is a partition, remap the location @@ -1306,7 +1563,7 @@ static void handle_bad_sector(struct bio bdevname(bio->bi_bdev, b), bio->bi_rw, (unsigned long long)bio->bi_sector + bio_sectors(bio), - (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); + (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); set_bit(BIO_EOF, &bio->bi_flags); } @@ -1359,7 +1616,7 @@ static inline int bio_check_eod(struct b return 0; /* Test device or partition size, when known. */ - maxsector = bio->bi_bdev->bd_inode->i_size >> 9; + maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; if (maxsector) { sector_t sector = bio->bi_sector; @@ -1425,10 +1682,9 @@ static inline void __generic_make_reques old_sector = -1; old_dev = 0; do { - char b[BDEVNAME_SIZE]; - q = bdev_get_queue(bio->bi_bdev); if (unlikely(!q)) { + char b[BDEVNAME_SIZE]; printk(KERN_ERR "generic_make_request: Trying to access " "nonexistent block-device %s (%Lu)\n", @@ -1437,8 +1693,9 @@ static inline void __generic_make_reques goto end_io; } - if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) && - nr_sectors > queue_max_hw_sectors(q))) { + if (likely(bio_is_rw(bio) && + nr_sectors > queue_max_hw_sectors(q))) { + char b[BDEVNAME_SIZE]; printk(KERN_ERR "bio too big device %s (%u > %u)\n", bdevname(bio->bi_bdev, b), bio_sectors(bio), @@ -1446,9 +1703,6 @@ static inline void __generic_make_reques goto end_io; } - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) - goto end_io; - if (should_fail_request(bio)) goto end_io; @@ -1458,9 +1712,6 @@ static inline void __generic_make_reques */ blk_partition_remap(bio); - if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) - goto end_io; - if (old_sector != -1) trace_block_remap(q, bio, old_dev, old_sector); @@ -1470,12 +1721,28 @@ static inline void __generic_make_reques if (bio_check_eod(bio, nr_sectors)) goto end_io; - if (bio_rw_flagged(bio, BIO_RW_DISCARD) && + /* + * Filter flush bio's early so that make_request based + * drivers without flush support don't have to worry + * about them. + */ + if ((bio->bi_rw & (BIO_FLUSH | BIO_FUA)) && !q->flush_flags) { + bio->bi_rw &= ~(BIO_FLUSH | BIO_FUA); + if (!nr_sectors) { + err = 0; + goto end_io; + } + } + + if ((bio->bi_rw & BIO_DISCARD) && !blk_queue_discard(q)) { err = -EOPNOTSUPP; goto end_io; } + if (blk_throtl_bio(q, bio)) + break; /* throttled, will be resubmitted later */ + trace_block_bio_queue(q, bio); ret = q->make_request_fn(q, bio); @@ -1604,8 +1871,10 @@ EXPORT_SYMBOL(submit_bio); */ int blk_rq_check_limits(struct request_queue *q, struct request *rq) { - if (blk_rq_sectors(rq) > queue_max_sectors(q) || - blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) { + if (!rq_mergeable(rq)) + return 0; + + if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, rq->cmd_flags)) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; } @@ -1617,8 +1886,7 @@ int blk_rq_check_limits(struct request_q * limitation. */ blk_recalc_rq_segments(rq); - if (rq->nr_phys_segments > queue_max_phys_segments(q) || - rq->nr_phys_segments > queue_max_hw_segments(q)) { + if (rq->nr_phys_segments > queue_max_segments(q)) { printk(KERN_ERR "%s: over max segments limit.\n", __func__); return -EIO; } @@ -1646,6 +1914,10 @@ int blk_insert_cloned_request(struct req #endif spin_lock_irqsave(q->queue_lock, flags); + if (unlikely(blk_queue_dead(q))) { + spin_unlock_irqrestore(q->queue_lock, flags); + return -ENODEV; + } /* * Submitting request must be dequeued before calling this function @@ -1723,11 +1995,11 @@ static void blk_account_io_completion(st static void blk_account_io_done(struct request *req) { /* - * Account IO completion. bar_rq isn't accounted as a normal - * IO on queueing nor completion. Accounting the containing - * request is enough. + * Account IO completion. flush_rq isn't accounted as a + * normal IO on queueing nor completion. Accounting the + * containing request is enough. */ - if (blk_do_io_stat(req) && req != &req->q->bar_rq) { + if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) { unsigned long duration = jiffies - req->start_time; const int rw = rq_data_dir(req); struct hd_struct *part; @@ -1773,7 +2045,7 @@ struct request *blk_peek_request(struct * sees this request (possibly after * requeueing). Notify IO scheduler. */ - if (blk_sorted_rq(rq)) + if (rq->cmd_flags & REQ_SORTED) elv_activate_rq(q, rq); /* @@ -1861,12 +2133,7 @@ void blk_dequeue_request(struct request */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]++; - /* - * Mark this device as supporting hardware queuing, if - * we have more IOs in flight than 4. - */ - if (!blk_queue_queuing(q) && queue_in_flight(q) > 4) - set_bit(QUEUE_FLAG_CQ, &q->queue_flags); + set_io_start_time_ns(rq); } } @@ -1896,6 +2163,7 @@ void blk_start_request(struct request *r if (unlikely(blk_bidi_rq(req))) req->next_rq->resid_len = blk_rq_bytes(req->next_rq); + BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); blk_add_timer(req); } EXPORT_SYMBOL(blk_start_request); @@ -1966,13 +2234,38 @@ bool blk_update_request(struct request * * TODO: tj: This is too subtle. It would be better to let * low level drivers do what they see fit. */ - if (blk_fs_request(req)) + if (req->cmd_type == REQ_TYPE_FS) req->errors = 0; - if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) { - printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n", - req->rq_disk ? req->rq_disk->disk_name : "?", - (unsigned long long)blk_rq_pos(req)); + if (error && req->cmd_type == REQ_TYPE_FS && + !(req->cmd_flags & REQ_QUIET)) { + char *error_type; + + switch (error) { + case -ENOLINK: + error_type = "recoverable transport"; + break; + case -EREMOTEIO: + error_type = "critical target"; + break; + case -EBADE: + error_type = "critical nexus"; + break; + case -ENOSPC: + error_type = "critical space allocation"; + break; + case -ENODATA: + error_type = "critical medium"; + break; + case -EIO: + default: + error_type = "I/O"; + break; + } + printk_ratelimited(KERN_ERR "end_request: %s error, dev %s, " + "sector %llu\n", error_type, req->rq_disk ? + req->rq_disk->disk_name : "?", + (unsigned long long)blk_rq_pos(req)); } blk_account_io_completion(req, nr_bytes); @@ -2056,7 +2349,7 @@ bool blk_update_request(struct request * req->buffer = bio_data(req->bio); /* update sector only for requests with clear definition of sector */ - if (blk_fs_request(req) || blk_discard_rq(req)) + if (req->cmd_type == REQ_TYPE_FS) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ @@ -2070,7 +2363,7 @@ bool blk_update_request(struct request * * size, something has gone terribly wrong. */ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { - printk(KERN_ERR "blk: request botched\n"); + blk_dump_rq_flags(req, "request botched"); req->__data_len = blk_rq_cur_bytes(req); } @@ -2093,11 +2386,32 @@ static bool blk_update_bidi_request(stru blk_update_request(rq->next_rq, error, bidi_bytes)) return true; - add_disk_randomness(rq->rq_disk); + if (blk_queue_add_random(rq->q)) + add_disk_randomness(rq->rq_disk); return false; } +/** + * blk_unprep_request - unprepare a request + * @req: the request + * + * This function makes a request ready for complete resubmission (or + * completion). It happens only after all error handling is complete, + * so represents the appropriate moment to deallocate any resources + * that were allocated to the request in the prep_rq_fn. The queue + * lock is held when calling this. + */ +void blk_unprep_request(struct request *req) +{ + struct request_queue *q = req->q; + + req->cmd_flags &= ~REQ_DONTPREP; + if (q->unprep_rq_fn) + q->unprep_rq_fn(q, req); +} +EXPORT_SYMBOL_GPL(blk_unprep_request); + /* * queue lock must be held */ @@ -2108,11 +2422,15 @@ static void blk_finish_request(struct re BUG_ON(blk_queued_rq(req)); - if (unlikely(laptop_mode) && blk_fs_request(req)) + if (unlikely(laptop_mode) && req->cmd_type == REQ_TYPE_FS) laptop_io_completion(); blk_delete_timer(req); + if (req->cmd_flags & REQ_DONTPREP) + blk_unprep_request(req); + + blk_account_io_done(req); if (req->end_io) @@ -2345,7 +2663,7 @@ void blk_rq_bio_prep(struct request_queu struct bio *bio) { /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ - rq->cmd_flags |= bio->bi_rw & REQ_RW; + rq->cmd_flags |= bio->bi_rw & REQ_WRITE; if (bio_has_data(bio)) { rq->nr_phys_segments = bio_phys_segments(q, bio); @@ -2412,7 +2730,7 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); static void __blk_rq_prep_clone(struct request *dst, struct request *src) { dst->cpu = src->cpu; - dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE); + dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; dst->cmd_type = src->cmd_type; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); @@ -2509,4 +2827,3 @@ int __init blk_dev_init(void) return 0; } - diff -uprN linux-2.6.32/block/blk-exec.c linux-2.6.32.ovz/block/blk-exec.c --- linux-2.6.32/block/blk-exec.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/blk-exec.c 2015-06-23 04:16:16.000000000 -0700 @@ -43,6 +43,9 @@ static void blk_end_sync_rq(struct reque * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution. Don't wait for completion. + * + * Note: + * This function will invoke @done directly if the queue is dead */ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, struct request *rq, int at_head, @@ -50,14 +53,25 @@ void blk_execute_rq_nowait(struct reques { int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; + WARN_ON(irqs_disabled()); + rq->rq_disk = bd_disk; rq->end_io = done; - WARN_ON(irqs_disabled()); + spin_lock_irq(q->queue_lock); + + if (unlikely(blk_queue_dead(q))) { + rq->cmd_flags |= REQ_QUIET; + rq->errors = -ENXIO; + __blk_end_request_all(rq, rq->errors); + spin_unlock_irq(q->queue_lock); + return; + } + __elv_add_request(q, rq, where, 1); __generic_unplug_device(q); /* the queue is stopped so it won't be plugged+unplugged */ - if (blk_pm_resume_request(rq)) + if (rq->cmd_type == REQ_TYPE_PM_RESUME) q->request_fn(q); spin_unlock_irq(q->queue_lock); } @@ -95,7 +109,7 @@ int blk_execute_rq(struct request_queue rq->end_io_data = &wait; blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); - wait_for_completion(&wait); + wait_for_completion_io(&wait); if (rq->errors) err = -EIO; diff -uprN linux-2.6.32/block/blk-flush.c linux-2.6.32.ovz/block/blk-flush.c --- linux-2.6.32/block/blk-flush.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/block/blk-flush.c 2015-06-23 04:16:16.000000000 -0700 @@ -0,0 +1,462 @@ +/* + * Functions to sequence FLUSH and FUA writes. + * + * Copyright (C) 2011 Max Planck Institute for Gravitational Physics + * Copyright (C) 2011 Tejun Heo + * + * This file is released under the GPLv2. + * + * REQ_{FLUSH|FUA} requests are decomposed to sequences consisted of three + * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request + * properties and hardware capability. + * + * If a request doesn't have data, only REQ_FLUSH makes sense, which + * indicates a simple flush request. If there is data, REQ_FLUSH indicates + * that the device cache should be flushed before the data is executed, and + * REQ_FUA means that the data must be on non-volatile media on request + * completion. + * + * If the device doesn't have writeback cache, FLUSH and FUA don't make any + * difference. The requests are either completed immediately if there's no + * data or executed as normal requests otherwise. + * + * If the device has writeback cache and supports FUA, REQ_FLUSH is + * translated to PREFLUSH but REQ_FUA is passed down directly with DATA. + * + * If the device has writeback cache and doesn't support FUA, REQ_FLUSH is + * translated to PREFLUSH and REQ_FUA to POSTFLUSH. + * + * The actual execution of flush is double buffered. Whenever a request + * needs to execute PRE or POSTFLUSH, it queues at + * q->flush_queue[q->flush_pending_idx]. Once certain criteria are met, a + * flush is issued and the pending_idx is toggled. When the flush + * completes, all the requests which were pending are proceeded to the next + * step. This allows arbitrary merging of different types of FLUSH/FUA + * requests. + * + * Currently, the following conditions are used to determine when to issue + * flush. + * + * C1. At any given time, only one flush shall be in progress. This makes + * double buffering sufficient. + * + * C2. Flush is deferred if any request is executing DATA of its sequence. + * This avoids issuing separate POSTFLUSHes for requests which shared + * PREFLUSH. + * + * C3. The second condition is ignored if there is a request which has + * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid + * starvation in the unlikely case where there are continuous stream of + * FUA (without FLUSH) requests. + * + * For devices which support FUA, it isn't clear whether C2 (and thus C3) + * is beneficial. + * + * Note that a sequenced FLUSH/FUA request with DATA is completed twice. + * Once while executing DATA and again after the whole sequence is + * complete. The first completion updates the contained bio but doesn't + * finish it so that the bio submitter is notified only after the whole + * sequence is complete. This is implemented by testing REQ_FLUSH_SEQ in + * req_bio_endio(). + * + * The above peculiarity requires that each FLUSH/FUA request has only one + * bio attached to it, which is guaranteed as they aren't allowed to be + * merged in the usual way. + */ + +#include +#include +#include +#include +#include + +#include "blk.h" + +/* FLUSH/FUA sequences */ +enum { + REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */ + REQ_FSEQ_DATA = (1 << 1), /* data write in progress */ + REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */ + REQ_FSEQ_DONE = (1 << 3), + + REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | + REQ_FSEQ_POSTFLUSH, + + /* + * If flush has been pending longer than the following timeout, + * it's issued even if flush_data requests are still in flight. + */ + FLUSH_PENDING_TIMEOUT = 5 * HZ, +}; + +static bool blk_kick_flush(struct request_queue *q); + +static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq) +{ + unsigned int policy = 0; + + if (fflags & REQ_FLUSH) { + if (rq->cmd_flags & REQ_FLUSH) + policy |= REQ_FSEQ_PREFLUSH; + if (blk_rq_sectors(rq)) + policy |= REQ_FSEQ_DATA; + if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA)) + policy |= REQ_FSEQ_POSTFLUSH; + } + return policy; +} + +static unsigned int blk_flush_cur_seq(struct request *rq) +{ + return 1 << ffz(rq->flush.seq); +} + +static void blk_flush_restore_request(struct request *rq) +{ + /* + * After flush data completion, @rq->bio is %NULL but we need to + * complete the bio again. @rq->biotail is guaranteed to equal the + * original @rq->bio. Restore it. + */ + rq->bio = rq->biotail; + + /* make @rq a normal request */ + rq->cmd_flags &= ~REQ_FLUSH_SEQ; + rq->end_io = NULL; +} + +/** + * blk_flush_complete_seq - complete flush sequence + * @rq: FLUSH/FUA request being sequenced + * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) + * @error: whether an error occurred + * + * @rq just completed @seq part of its flush sequence, record the + * completion and trigger the next step. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock) + * + * RETURNS: + * %true if requests were added to the dispatch queue, %false otherwise. + */ +static bool blk_flush_complete_seq(struct request *rq, unsigned int seq, + int error) +{ + struct request_queue *q = rq->q; + struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; + bool queued = false; + + BUG_ON(rq->flush.seq & seq); + rq->flush.seq |= seq; + + if (likely(!error)) + seq = blk_flush_cur_seq(rq); + else + seq = REQ_FSEQ_DONE; + + switch (seq) { + case REQ_FSEQ_PREFLUSH: + case REQ_FSEQ_POSTFLUSH: + /* queue for flush */ + if (list_empty(pending)) + q->flush_pending_since = jiffies; + list_move_tail(&rq->flush.list, pending); + break; + + case REQ_FSEQ_DATA: + list_move_tail(&rq->flush.list, &q->flush_data_in_flight); + list_add(&rq->queuelist, &q->queue_head); + queued = true; + break; + + case REQ_FSEQ_DONE: + /* + * @rq was previously adjusted by blk_flush_issue() for + * flush sequencing and may already have gone through the + * flush data request completion path. Restore @rq for + * normal completion and end it. + */ + BUG_ON(!list_empty(&rq->queuelist)); + list_del_init(&rq->flush.list); + blk_flush_restore_request(rq); + __blk_end_request_all(rq, error); + break; + + default: + BUG(); + } + + return blk_kick_flush(q) | queued; +} + +static void flush_end_io(struct request *flush_rq, int error) +{ + struct request_queue *q = flush_rq->q; + struct list_head *running = &q->flush_queue[q->flush_running_idx]; + bool was_empty = elv_queue_empty(q); + bool queued = false; + struct request *rq, *n; + + BUG_ON(q->flush_pending_idx == q->flush_running_idx); + + /* account completion of the flush request */ + q->flush_running_idx ^= 1; + elv_completed_request(q, flush_rq); + atomic_inc(&q->flush_tag); + /* and push the waiting requests to the next stage */ + list_for_each_entry_safe(rq, n, running, flush.list) { + unsigned int seq = blk_flush_cur_seq(rq); + + BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); + queued |= blk_flush_complete_seq(rq, seq, error); + } + + /* + * Kick the queue to avoid stall for two cases: + * 1. Moving a request silently to empty queue_head may stall the + * queue. + * 2. When flush request is running in non-queueable queue, the + * queue is hold. Restart the queue after flush request is finished + * to avoid stall. + * This function is called from request completion path and calling + * directly into request_fn may confuse the driver. Always use + * kblockd. + */ + if ((queued && was_empty) || q->flush_queue_delayed) + blk_run_queue_async(q); + q->flush_queue_delayed = 0; +} + +/** + * blk_kick_flush - consider issuing flush request + * @q: request_queue being kicked + * + * Flush related states of @q have changed, consider issuing flush request. + * Please read the comment at the top of this file for more info. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock) + * + * RETURNS: + * %true if flush was issued, %false otherwise. + */ +static bool blk_kick_flush(struct request_queue *q) +{ + struct list_head *pending = &q->flush_queue[q->flush_pending_idx]; + struct request *first_rq = + list_first_entry(pending, struct request, flush.list); + + /* C1 described at the top of this file */ + if (q->flush_pending_idx != q->flush_running_idx || list_empty(pending)) + return false; + + /* C2 and C3 */ + if (!list_empty(&q->flush_data_in_flight) && + time_before(jiffies, + q->flush_pending_since + FLUSH_PENDING_TIMEOUT)) + return false; + + /* + * Issue flush and toggle pending_idx. This makes pending_idx + * different from running_idx, which means flush is in flight. + */ + blk_rq_init(q, &q->flush_rq); + q->flush_rq.cmd_type = REQ_TYPE_FS; + q->flush_rq.cmd_flags = REQ_WRITE_FLUSH | REQ_FLUSH_SEQ; + + /* + * For compatibility with drivers using old semantics of providing + * a prepare_flush_fn and not parsing REQ_FLUSH flag. + */ + if (unlikely(q->prepare_flush_fn)) { + q->flush_rq.cmd_flags = REQ_HARDBARRIER | REQ_FLUSH_SEQ; + q->prepare_flush_fn(q, &q->flush_rq); + } + + q->flush_rq.rq_disk = first_rq->rq_disk; + q->flush_rq.end_io = flush_end_io; + + q->flush_pending_idx ^= 1; + atomic_inc(&q->flush_tag); + list_add_tail(&q->flush_rq.queuelist, &q->queue_head); + return true; +} + +static void flush_data_end_io(struct request *rq, int error) +{ + struct request_queue *q = rq->q; + bool was_empty = elv_queue_empty(q); + + /* + * After populating an empty queue, kick it to avoid stall. Read + * the comment in flush_end_io(). + */ + if (blk_flush_complete_seq(rq, REQ_FSEQ_DATA, error) && was_empty) + blk_run_queue_async(q); +} + +/** + * blk_insert_flush - insert a new FLUSH/FUA request + * @rq: request to insert + * + * To be called from elv_insert() for %ELEVATOR_INSERT_FLUSH insertions. + * @rq is being submitted. Analyze what needs to be done and put it on the + * right queue. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock) + */ +void blk_insert_flush(struct request *rq) +{ + struct request_queue *q = rq->q; + unsigned int fflags = q->flush_flags; /* may change, cache */ + unsigned int policy = blk_flush_policy(fflags, rq); + + BUG_ON(rq->end_io); + BUG_ON(!rq->bio || rq->bio != rq->biotail); + + /* + * @policy now records what operations need to be done. Adjust + * REQ_FLUSH and FUA for the driver. + */ + rq->cmd_flags &= ~REQ_FLUSH; + if (!(fflags & REQ_FUA)) + rq->cmd_flags &= ~REQ_FUA; + + /* + * If there's data but flush is not necessary, the request can be + * processed directly without going through flush machinery. Queue + * for normal execution. + */ + if ((policy & REQ_FSEQ_DATA) && + !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { + list_add_tail(&rq->queuelist, &q->queue_head); + return; + } + + /* + * @rq should go through flush machinery. Mark it part of flush + * sequence and submit for further processing. + */ + memset(&rq->flush, 0, sizeof(rq->flush)); + INIT_LIST_HEAD(&rq->flush.list); + rq->cmd_flags |= REQ_FLUSH_SEQ; + rq->end_io = flush_data_end_io; + + blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0); +} + +/** + * blk_abort_flushes - @q is being aborted, abort flush requests + * @q: request_queue being aborted + * + * To be called from elv_abort_queue(). @q is being aborted. Prepare all + * FLUSH/FUA requests for abortion. + * + * CONTEXT: + * spin_lock_irq(q->queue_lock) + */ +void blk_abort_flushes(struct request_queue *q) +{ + struct request *rq, *n; + int i; + + /* + * Requests in flight for data are already owned by the dispatch + * queue or the device driver. Just restore for normal completion. + */ + list_for_each_entry_safe(rq, n, &q->flush_data_in_flight, flush.list) { + list_del_init(&rq->flush.list); + blk_flush_restore_request(rq); + } + + /* + * We need to give away requests on flush queues. Restore for + * normal completion and put them on the dispatch queue. + */ + for (i = 0; i < ARRAY_SIZE(q->flush_queue); i++) { + list_for_each_entry_safe(rq, n, &q->flush_queue[i], + flush.list) { + list_del_init(&rq->flush.list); + blk_flush_restore_request(rq); + list_add_tail(&rq->queuelist, &q->queue_head); + } + } +} + +static void bio_end_flush(struct bio *bio, int err) +{ + if (err) + clear_bit(BIO_UPTODATE, &bio->bi_flags); + if (bio->bi_private) + complete(bio->bi_private); + bio_put(bio); +} + +/** + * __blkdev_issue_flush - queue a flush + * @bdev: blockdev to issue flush for + * @gfp_mask: memory allocation flags (for bio_alloc) + * @error_sector: error sector + * + * Description: + * Issue a flush for the block device in question. Caller can supply + * room for storing the error offset in case of a flush error, if they + * wish to. If WAIT flag is not passed then caller may check only what + * request was pushed in some internal queue for later handling. + */ +int __blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, + sector_t *error_sector) +{ + DECLARE_COMPLETION_ONSTACK(wait); + struct request_queue *q; + struct bio *bio; + int ret = 0; + + if (bdev->bd_disk == NULL) + return -ENXIO; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + /* + * some block devices may not have their queue correctly set up here + * (e.g. loop device without a backing file) and so issuing a flush + * here will panic. Ensure there is a request function before issuing + * the flush. + */ + if (!q->make_request_fn) + return -ENXIO; + + bio = bio_alloc(gfp_mask, 0); + bio->bi_end_io = bio_end_flush; + bio->bi_bdev = bdev; + bio->bi_private = &wait; + + bio_get(bio); + submit_bio(WRITE_FLUSH, bio); + wait_for_completion_io(&wait); + + /* + * The driver must store the error location in ->bi_sector, if + * it supports it. For non-stacked drivers, this should be + * copied from blk_rq_pos(rq). + */ + if (error_sector) + *error_sector = bio->bi_sector; + + if (!bio_flagged(bio, BIO_UPTODATE)) + ret = -EIO; + + bio_put(bio); + return ret; +} +EXPORT_SYMBOL(__blkdev_issue_flush); + +int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) +{ + return __blkdev_issue_flush(bdev, GFP_KERNEL, error_sector); +} +EXPORT_SYMBOL(blkdev_issue_flush); diff -uprN linux-2.6.32/block/blk.h linux-2.6.32.ovz/block/blk.h --- linux-2.6.32/block/blk.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/blk.h 2015-03-10 13:24:13.000000000 -0700 @@ -10,11 +10,17 @@ extern struct kmem_cache *blk_requestq_cachep; extern struct kobj_type blk_queue_ktype; +static inline void __blk_get_queue(struct request_queue *q) +{ + kobject_get(&q->kobj); +} + void init_request_from_bio(struct request *req, struct bio *bio); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); int blk_rq_append_bio(struct request_queue *q, struct request *rq, struct bio *bio); +void blk_drain_queue(struct request_queue *q, bool drain_all); void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); @@ -51,18 +57,40 @@ static inline void blk_clear_rq_complete */ #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) +void blk_insert_flush(struct request *rq); +void blk_abort_flushes(struct request_queue *q); + static inline struct request *__elv_next_request(struct request_queue *q) { struct request *rq; while (1) { - while (!list_empty(&q->queue_head)) { + if (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); - if (blk_do_ordered(q, &rq)) - return rq; + return rq; } - - if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) + /* + * Flush request is running and flush request isn't queueable + * in the drive, we can hold the queue till flush request is + * finished. Even we don't do this, driver can't dispatch next + * requests and will requeue them. And this can improve + * throughput too. For example, we have request flush1, write1, + * flush 2. flush1 is dispatched, then queue is hold, write1 + * isn't inserted to queue. After flush1 is finished, flush2 + * will be dispatched. Since disk cache is already clean, + * flush2 will be finished very soon, so looks like flush2 is + * folded to flush1. + * Since the queue is hold, a flag is set to indicate the queue + * should be restarted later. Please see flush_end_io() for + * details. + */ + if (q->flush_pending_idx != q->flush_running_idx && + !queue_flush_queueable(q)) { + q->flush_queue_delayed = 1; + return NULL; + } + if (unlikely(blk_queue_dead(q)) || + !q->elevator->ops->elevator_dispatch_fn(q, 0)) return NULL; } } @@ -105,6 +133,8 @@ int attempt_back_merge(struct request_qu int attempt_front_merge(struct request_queue *q, struct request *rq); void blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); +bool blk_rq_merge_ok(struct request *rq, struct bio *bio); +int blk_try_merge(struct request *rq, struct bio *bio); void blk_queue_congestion_threshold(struct request_queue *q); @@ -142,14 +172,19 @@ static inline int queue_congestion_off_t static inline int blk_cpu_to_group(int cpu) { + int group = NR_CPUS; #ifdef CONFIG_SCHED_MC const struct cpumask *mask = cpu_coregroup_mask(cpu); - return cpumask_first(mask); + group = cpumask_first(mask); #elif defined(CONFIG_SCHED_SMT) - return cpumask_first(topology_thread_cpumask(cpu)); + group = cpumask_first(topology_thread_cpumask(cpu)); #else return cpu; #endif + + if (likely(group < NR_CPUS)) + return group; + return cpu; } /* @@ -157,12 +192,30 @@ static inline int blk_cpu_to_group(int c * * a) it's attached to a gendisk, and * b) the queue had IO stats enabled when this request was started, and - * c) it's a file system request or a discard request + * c) it's a file system request */ static inline int blk_do_io_stat(struct request *rq) { - return rq->rq_disk && blk_rq_io_stat(rq) && - (blk_fs_request(rq) || blk_discard_rq(rq)); -} + return rq->rq_disk && + (rq->cmd_flags & REQ_IO_STAT) && + (rq->cmd_type == REQ_TYPE_FS); +} + +#ifdef CONFIG_BLK_DEV_THROTTLING +extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio); +extern void blk_throtl_drain(struct request_queue *q); +extern int blk_throtl_init(struct request_queue *q); +extern void blk_throtl_exit(struct request_queue *q); +extern void blk_throtl_release(struct request_queue *q); +#else /* CONFIG_BLK_DEV_THROTTLING */ +static inline bool blk_throtl_bio(struct request_queue *q, struct bio *bio) +{ + return false; +} +static inline void blk_throtl_drain(struct request_queue *q) { } +static inline int blk_throtl_init(struct request_queue *q) { return 0; } +static inline void blk_throtl_exit(struct request_queue *q) { } +static inline void blk_throtl_release(struct request_queue *q) { } +#endif /* CONFIG_BLK_DEV_THROTTLING */ -#endif +#endif /* BLK_INTERNAL_H */ diff -uprN linux-2.6.32/block/blk-integrity.c linux-2.6.32.ovz/block/blk-integrity.c --- linux-2.6.32/block/blk-integrity.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/blk-integrity.c 2015-03-10 13:23:56.000000000 -0700 @@ -29,6 +29,8 @@ static struct kmem_cache *integrity_cachep; +static const char *bi_unsupported_name = "unsupported"; + /** * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements * @rq: request with integrity metadata attached @@ -278,7 +280,7 @@ static struct attribute *integrity_attrs NULL, }; -static struct sysfs_ops integrity_ops = { +static const struct sysfs_ops integrity_ops = { .show = &integrity_attr_show, .store = &integrity_attr_store, }; @@ -306,6 +308,14 @@ static struct kobj_type integrity_ktype .release = blk_integrity_release, }; +bool blk_integrity_is_initialized(struct gendisk *disk) +{ + struct blk_integrity *bi = blk_get_integrity(disk); + + return (bi && bi->name && strcmp(bi->name, bi_unsupported_name) != 0); +} +EXPORT_SYMBOL(blk_integrity_is_initialized); + /** * blk_integrity_register - Register a gendisk as being integrity-capable * @disk: struct gendisk pointer to make integrity-aware @@ -321,9 +331,15 @@ static struct kobj_type integrity_ktype int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) { struct blk_integrity *bi; + static bool seen = false; BUG_ON(disk == NULL); + if (!seen) { + mark_tech_preview("DIF/DIX support", NULL); + seen = true; + } + if (disk->integrity == NULL) { bi = kmem_cache_alloc(integrity_cachep, GFP_KERNEL | __GFP_ZERO); @@ -355,7 +371,9 @@ int blk_integrity_register(struct gendis bi->get_tag_fn = template->get_tag_fn; bi->tag_size = template->tag_size; } else - bi->name = "unsupported"; + bi->name = bi_unsupported_name; + + disk->queue->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; return 0; } @@ -375,12 +393,13 @@ void blk_integrity_unregister(struct gen if (!disk || !disk->integrity) return; + disk->queue->backing_dev_info.capabilities &= ~BDI_CAP_STABLE_WRITES; + bi = disk->integrity; kobject_uevent(&bi->kobj, KOBJ_REMOVE); kobject_del(&bi->kobj); kobject_put(&bi->kobj); - kmem_cache_free(integrity_cachep, bi); disk->integrity = NULL; } EXPORT_SYMBOL(blk_integrity_unregister); diff -uprN linux-2.6.32/block/blk-ioc.c linux-2.6.32.ovz/block/blk-ioc.c --- linux-2.6.32/block/blk-ioc.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/blk-ioc.c 2015-06-23 04:16:16.000000000 -0700 @@ -43,7 +43,9 @@ int put_io_context(struct io_context *io ioc->aic->dtor(ioc->aic); cfq_dtor(ioc); rcu_read_unlock(); - +#ifdef CONFIG_BEANCOUNTERS + put_beancounter(ioc->ioc_ub); +#endif kmem_cache_free(iocontext_cachep, ioc); return 1; } @@ -66,23 +68,29 @@ static void cfq_exit(struct io_context * } /* Called by the exitting task */ -void exit_io_context(void) +void exit_io_context(struct task_struct *task) { struct io_context *ioc; - task_lock(current); - ioc = current->io_context; - current->io_context = NULL; - task_unlock(current); + task_lock(task); + ioc = task->io_context; + task->io_context = NULL; + task_unlock(task); + + ioc_task_unlink(ioc); +} +void ioc_task_unlink(struct io_context *ioc) +{ if (atomic_dec_and_test(&ioc->nr_tasks)) { if (ioc->aic && ioc->aic->exit) ioc->aic->exit(ioc->aic); cfq_exit(ioc); - put_io_context(ioc); } + put_io_context(ioc); } +EXPORT_SYMBOL(ioc_task_unlink); struct io_context *alloc_io_context(gfp_t gfp_flags, int node) { @@ -101,6 +109,12 @@ struct io_context *alloc_io_context(gfp_ INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); ret->ioc_data = NULL; +#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) + ret->cgroup_changed = 0; +#endif +#ifdef CONFIG_BEANCOUNTERS + ret->ioc_ub = get_beancounter(get_exec_ub_top()); +#endif } return ret; @@ -132,6 +146,7 @@ struct io_context *current_io_context(gf return ret; } +EXPORT_SYMBOL(current_io_context); /* * If the current task has no IO context then create one and initialise it. diff -uprN linux-2.6.32/block/blk-lib.c linux-2.6.32.ovz/block/blk-lib.c --- linux-2.6.32/block/blk-lib.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/block/blk-lib.c 2015-06-23 04:16:16.000000000 -0700 @@ -0,0 +1,198 @@ +/* + * Functions related to generic helpers functions + */ +#include +#include +#include +#include +#include +#include + +#include "blk.h" + +struct bio_batch { + atomic_t done; + unsigned long flags; + struct completion *wait; +}; + +static void bio_batch_end_io(struct bio *bio, int err) +{ + struct bio_batch *bb = bio->bi_private; + + if (err && (err != -EOPNOTSUPP)) + clear_bit(BIO_UPTODATE, &bb->flags); + if (atomic_dec_and_test(&bb->done)) + complete(bb->wait); + bio_put(bio); +} + +/** + * blkdev_issue_discard - queue a discard + * @bdev: blockdev to issue discard for + * @sector: start sector + * @nr_sects: number of sectors to discard + * @gfp_mask: memory allocation flags (for bio_alloc) + * @flags: BLKDEV_IFL_* flags to control behaviour + * + * Description: + * Issue a discard request for the sectors in question. + */ +int blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, int flags) +{ + DECLARE_COMPLETION_ONSTACK(wait); + struct request_queue *q = bdev_get_queue(bdev); + int type = (1 << BIO_RW) | (1 << BIO_RW_DISCARD); + sector_t max_discard_sectors; + sector_t granularity, alignment; + struct bio_batch bb; + struct bio *bio; + int ret = 0; + + /* + * DEPRECATED support for DISCARD_FL_BARRIER which will + * fail with -EOPNOTSUPP (due to implicit BIO_RW_BARRIER) + */ + if (flags & DISCARD_FL_BARRIER) + type = DISCARD_BARRIER; + + if (!q) + return -ENXIO; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + alignment = bdev_discard_alignment(bdev) >> 9; + alignment = sector_div(alignment, granularity); + + /* + * Ensure that max_discard_sectors is of the proper + * granularity, so that requests stay aligned after a split. + */ + max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); + sector_div(max_discard_sectors, granularity); + max_discard_sectors *= granularity; + if (unlikely(!max_discard_sectors)) { + /* Avoid infinite loop below. Being cautious never hurts. */ + return -EOPNOTSUPP; + } + + atomic_set(&bb.done, 1); + bb.flags = 1 << BIO_UPTODATE; + bb.wait = &wait; + + while (nr_sects) { + unsigned int req_sects; + sector_t end_sect, tmp; + + bio = bio_alloc(gfp_mask, 1); + if (!bio) { + ret = -ENOMEM; + break; + } + + req_sects = min_t(sector_t, nr_sects, max_discard_sectors); + + /* + * If splitting a request, and the next starting sector would be + * misaligned, stop the discard at the previous aligned sector. + */ + end_sect = sector + req_sects; + tmp = end_sect; + if (req_sects < nr_sects && + sector_div(tmp, granularity) != alignment) { + end_sect = end_sect - alignment; + sector_div(end_sect, granularity); + end_sect = end_sect * granularity + alignment; + req_sects = end_sect - sector; + } + + bio->bi_sector = sector; + bio->bi_end_io = bio_batch_end_io; + bio->bi_bdev = bdev; + bio->bi_private = &bb; + + bio->bi_size = req_sects << 9; + nr_sects -= req_sects; + sector = end_sect; + + atomic_inc(&bb.done); + submit_bio(type, bio); + } + + /* Wait for bios in-flight */ + if (!atomic_dec_and_test(&bb.done)) + wait_for_completion_io(&wait); + + if (!test_bit(BIO_UPTODATE, &bb.flags)) + ret = -EIO; + + return ret; +} +EXPORT_SYMBOL(blkdev_issue_discard); + +/** + * blkdev_issue_zeroout - generate number of zero filed write bios + * @bdev: blockdev to issue + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * + * Description: + * Generate and issue number of bios with zerofiled pages. + */ + +int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask) +{ + int ret; + struct bio *bio; + struct bio_batch bb; + unsigned int sz; + DECLARE_COMPLETION_ONSTACK(wait); + + atomic_set(&bb.done, 1); + bb.flags = 1 << BIO_UPTODATE; + bb.wait = &wait; + + ret = 0; + while (nr_sects != 0) { + bio = bio_alloc(gfp_mask, + min(nr_sects, (sector_t)BIO_MAX_PAGES)); + if (!bio) { + ret = -ENOMEM; + break; + } + + bio->bi_sector = sector; + bio->bi_bdev = bdev; + bio->bi_end_io = bio_batch_end_io; + bio->bi_private = &bb; + + while (nr_sects != 0) { + sz = min((sector_t) PAGE_SIZE >> 9 , nr_sects); + ret = bio_add_page(bio, ZERO_PAGE(0), sz << 9, 0); + nr_sects -= ret >> 9; + sector += ret >> 9; + if (ret < (sz << 9)) + break; + } + ret = 0; + atomic_inc(&bb.done); + submit_bio(WRITE, bio); + } + + /* Wait for bios in-flight */ + if (!atomic_dec_and_test(&bb.done)) + wait_for_completion_io(&wait); + + if (!test_bit(BIO_UPTODATE, &bb.flags)) + /* One of bios in the batch was completed with error.*/ + ret = -EIO; + + return ret; +} +EXPORT_SYMBOL(blkdev_issue_zeroout); diff -uprN linux-2.6.32/block/blk-map.c linux-2.6.32.ovz/block/blk-map.c --- linux-2.6.32/block/blk-map.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/blk-map.c 2015-03-10 13:21:48.000000000 -0700 @@ -201,6 +201,9 @@ int blk_rq_map_user_iov(struct request_q for (i = 0; i < iov_count; i++) { unsigned long uaddr = (unsigned long)iov[i].iov_base; + if (!iov[i].iov_len) + return -EINVAL; + if (uaddr & queue_dma_alignment(q)) { unaligned = 1; break; diff -uprN linux-2.6.32/block/blk-merge.c linux-2.6.32.ovz/block/blk-merge.c --- linux-2.6.32/block/blk-merge.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/blk-merge.c 2015-03-10 13:23:52.000000000 -0700 @@ -180,7 +180,7 @@ new_segment: } if (q->dma_drain_size && q->dma_drain_needed(rq)) { - if (rq->cmd_flags & REQ_RW) + if (rq->cmd_flags & REQ_WRITE) memset(q->dma_drain_buffer, 0, q->dma_drain_size); sg->page_link &= ~0x02; @@ -206,8 +206,7 @@ static inline int ll_new_hw_segment(stru { int nr_phys_segs = bio_phys_segments(q, bio); - if (req->nr_phys_segments + nr_phys_segs > queue_max_hw_segments(q) || - req->nr_phys_segments + nr_phys_segs > queue_max_phys_segments(q)) { + if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -225,14 +224,8 @@ static inline int ll_new_hw_segment(stru int ll_back_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { - unsigned short max_sectors; - - if (unlikely(blk_pc_request(req))) - max_sectors = queue_max_hw_sectors(q); - else - max_sectors = queue_max_sectors(q); - - if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) { + if (blk_rq_sectors(req) + bio_sectors(bio) > + blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -249,15 +242,8 @@ int ll_back_merge_fn(struct request_queu int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { - unsigned short max_sectors; - - if (unlikely(blk_pc_request(req))) - max_sectors = queue_max_hw_sectors(q); - else - max_sectors = queue_max_sectors(q); - - - if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) { + if (blk_rq_sectors(req) + bio_sectors(bio) > + blk_rq_get_max_sectors(req)) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -288,7 +274,8 @@ static int ll_merge_requests_fn(struct r /* * Will it become too large? */ - if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > queue_max_sectors(q)) + if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > + blk_rq_get_max_sectors(req)) return 0; total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; @@ -300,10 +287,7 @@ static int ll_merge_requests_fn(struct r total_phys_segments--; } - if (total_phys_segments > queue_max_phys_segments(q)) - return 0; - - if (total_phys_segments > queue_max_hw_segments(q)) + if (total_phys_segments > queue_max_segments(q)) return 0; /* Merge is OK... */ @@ -367,6 +351,12 @@ static int attempt_merge(struct request_ return 0; /* + * Don't merge file system requests and discard requests + */ + if ((req->cmd_flags & REQ_DISCARD) != (next->cmd_flags & REQ_DISCARD)) + return 0; + + /* * not contiguous */ if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next)) @@ -452,3 +442,36 @@ int attempt_front_merge(struct request_q return 0; } + +bool blk_rq_merge_ok(struct request *rq, struct bio *bio) +{ + if (!rq_mergeable(rq) || !bio_mergeable(bio)) + return false; + + /* don't merge file system requests and discard requests */ + if ((bio->bi_rw & BIO_DISCARD) != (rq->bio->bi_rw & BIO_DISCARD)) + return false; + + /* different data direction or already started, don't merge */ + if (bio_data_dir(bio) != rq_data_dir(rq)) + return false; + + /* must be same device and not a special request */ + if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) + return false; + + /* only merge integrity protected bio into ditto rq */ + if (bio_integrity(bio) != blk_integrity_rq(rq)) + return false; + + return true; +} + +int blk_try_merge(struct request *rq, struct bio *bio) +{ + if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_sector) + return ELEVATOR_BACK_MERGE; + else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_sector) + return ELEVATOR_FRONT_MERGE; + return ELEVATOR_NO_MERGE; +} diff -uprN linux-2.6.32/block/blk-settings.c linux-2.6.32.ovz/block/blk-settings.c --- linux-2.6.32/block/blk-settings.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/blk-settings.c 2015-03-10 13:24:15.000000000 -0700 @@ -8,6 +8,8 @@ #include #include /* for max_pfn/max_low_pfn */ #include +#include +#include #include "blk.h" @@ -34,6 +36,23 @@ void blk_queue_prep_rq(struct request_qu EXPORT_SYMBOL(blk_queue_prep_rq); /** + * blk_queue_unprep_rq - set an unprepare_request function for queue + * @q: queue + * @ufn: unprepare_request function + * + * It's possible for a queue to register an unprepare_request callback + * which is invoked before the request is finally completed. The goal + * of the function is to deallocate any data that was allocated in the + * prepare_request callback. + * + */ +void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn) +{ + q->unprep_rq_fn = ufn; +} +EXPORT_SYMBOL(blk_queue_unprep_rq); + +/** * blk_queue_merge_bvec - set a merge_bvec function for queue * @q: queue * @mbfn: merge_bvec_fn @@ -84,19 +103,19 @@ EXPORT_SYMBOL_GPL(blk_queue_lld_busy); * @lim: the queue_limits structure to reset * * Description: - * Returns a queue_limit struct to its default state. Can be used by - * stacking drivers like DM that stage table swaps and reuse an - * existing device queue. + * Returns a queue_limit struct to its default state. */ void blk_set_default_limits(struct queue_limits *lim) { - lim->max_phys_segments = MAX_PHYS_SEGMENTS; - lim->max_hw_segments = MAX_HW_SEGMENTS; + lim->max_segments = BLK_MAX_SEGMENTS; lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; - lim->max_segment_size = MAX_SEGMENT_SIZE; - lim->max_sectors = BLK_DEF_MAX_SECTORS; - lim->max_hw_sectors = INT_MAX; - lim->max_discard_sectors = SAFE_MAX_SECTORS; + lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; + lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + lim->max_discard_sectors = 0; + lim->discard_granularity = 0; + lim->discard_alignment = 0; + lim->discard_misaligned = 0; + lim->discard_zeroes_data = 0; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); lim->alignment_offset = 0; @@ -107,6 +126,26 @@ void blk_set_default_limits(struct queue EXPORT_SYMBOL(blk_set_default_limits); /** + * blk_set_stacking_limits - set default limits for stacking devices + * @lim: the queue_limits structure to reset + * + * Description: + * Returns a queue_limit struct to its default state. Should be used + * by stacking drivers like DM that have no internal limits. + */ +void blk_set_stacking_limits(struct queue_limits *lim) +{ + blk_set_default_limits(lim); + + /* Inherit limits from component devices */ + lim->discard_zeroes_data = 1; + lim->max_segments = USHORT_MAX; + lim->max_hw_sectors = UINT_MAX; + lim->max_sectors = UINT_MAX; +} +EXPORT_SYMBOL(blk_set_stacking_limits); + +/** * blk_queue_make_request - define an alternate make_request function for a device * @q: the request queue for the device to be affected * @mfn: the alternate make_request function @@ -141,7 +180,7 @@ void blk_queue_make_request(struct reque q->nr_batching = BLK_BATCH_REQ; q->unplug_thresh = 4; /* hmm */ - q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ + q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */ if (q->unplug_delay == 0) q->unplug_delay = 1; @@ -149,14 +188,6 @@ void blk_queue_make_request(struct reque q->unplug_timer.data = (unsigned long)q; blk_set_default_limits(&q->limits); - blk_queue_max_sectors(q, SAFE_MAX_SECTORS); - - /* - * If the caller didn't supply a lock, fall back to our embedded - * per-queue locks - */ - if (!q->queue_lock) - q->queue_lock = &q->__queue_lock; /* * by default assume old behaviour and bounce for any highmem page @@ -205,37 +236,46 @@ void blk_queue_bounce_limit(struct reque EXPORT_SYMBOL(blk_queue_bounce_limit); /** - * blk_queue_max_sectors - set max sectors for a request for this queue - * @q: the request queue for the device - * @max_sectors: max sectors in the usual 512b unit + * blk_limits_max_hw_sectors - set hard and soft limit of max sectors for request + * @limits: the queue limits + * @max_hw_sectors: max hardware sectors in the usual 512b unit * * Description: - * Enables a low level driver to set an upper limit on the size of - * received requests. + * Enables a low level driver to set a hard upper limit, + * max_hw_sectors, on the size of requests. max_hw_sectors is set by + * the device driver based upon the combined capabilities of I/O + * controller and storage device. + * + * max_sectors is a soft limit imposed by the block layer for + * filesystem type requests. This value can be overridden on a + * per-device basis in /sys/block//queue/max_sectors_kb. + * The soft limit can not exceed max_hw_sectors. **/ -void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) +void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_sectors) { - if ((max_sectors << 9) < PAGE_CACHE_SIZE) { - max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); + if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { + max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_sectors); + __func__, max_hw_sectors); } - if (BLK_DEF_MAX_SECTORS > max_sectors) - q->limits.max_hw_sectors = q->limits.max_sectors = max_sectors; - else { - q->limits.max_sectors = BLK_DEF_MAX_SECTORS; - q->limits.max_hw_sectors = max_sectors; - } + limits->max_hw_sectors = max_hw_sectors; + limits->max_sectors = min_t(unsigned int, max_hw_sectors, + BLK_DEF_MAX_SECTORS); } -EXPORT_SYMBOL(blk_queue_max_sectors); +EXPORT_SYMBOL(blk_limits_max_hw_sectors); -void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors) +/** + * blk_queue_max_hw_sectors - set max sectors for a request for this queue + * @q: the request queue for the device + * @max_hw_sectors: max hardware sectors in the usual 512b unit + * + * Description: + * See description for blk_limits_max_hw_sectors(). + **/ +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) { - if (BLK_DEF_MAX_SECTORS > max_sectors) - q->limits.max_hw_sectors = BLK_DEF_MAX_SECTORS; - else - q->limits.max_hw_sectors = max_sectors; + blk_limits_max_hw_sectors(&q->limits, max_hw_sectors); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -252,17 +292,15 @@ void blk_queue_max_discard_sectors(struc EXPORT_SYMBOL(blk_queue_max_discard_sectors); /** - * blk_queue_max_phys_segments - set max phys segments for a request for this queue + * blk_queue_max_segments - set max hw segments for a request for this queue * @q: the request queue for the device * @max_segments: max number of segments * * Description: * Enables a low level driver to set an upper limit on the number of - * physical data segments in a request. This would be the largest sized - * scatter list the driver could handle. + * hw data segments in a request. **/ -void blk_queue_max_phys_segments(struct request_queue *q, - unsigned short max_segments) +void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments) { if (!max_segments) { max_segments = 1; @@ -270,33 +308,9 @@ void blk_queue_max_phys_segments(struct __func__, max_segments); } - q->limits.max_phys_segments = max_segments; + q->limits.max_segments = max_segments; } -EXPORT_SYMBOL(blk_queue_max_phys_segments); - -/** - * blk_queue_max_hw_segments - set max hw segments for a request for this queue - * @q: the request queue for the device - * @max_segments: max number of segments - * - * Description: - * Enables a low level driver to set an upper limit on the number of - * hw data segments in a request. This would be the largest number of - * address/length pairs the host adapter can actually give at once - * to the device. - **/ -void blk_queue_max_hw_segments(struct request_queue *q, - unsigned short max_segments) -{ - if (!max_segments) { - max_segments = 1; - printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_segments); - } - - q->limits.max_hw_segments = max_segments; -} -EXPORT_SYMBOL(blk_queue_max_hw_segments); +EXPORT_SYMBOL(blk_queue_max_segments); /** * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg @@ -351,7 +365,7 @@ EXPORT_SYMBOL(blk_queue_logical_block_si * hardware can operate on without reverting to read-modify-write * operations. */ -void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) +void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) { q->limits.physical_block_size = size; @@ -463,11 +477,6 @@ void blk_queue_io_opt(struct request_que } EXPORT_SYMBOL(blk_queue_io_opt); -/* - * Returns the minimum that is _not_ zero, unless both are zero. - */ -#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) - /** * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers * @t: the stacking driver (top) @@ -490,18 +499,30 @@ EXPORT_SYMBOL(blk_queue_stack_limits); /** * blk_stack_limits - adjust queue_limits for stacked devices - * @t: the stacking driver limits (top) - * @b: the underlying queue limits (bottom) - * @offset: offset to beginning of data within component device - * - * Description: - * Merges two queue_limit structs. Returns 0 if alignment didn't - * change. Returns -1 if adding the bottom device caused - * misalignment. + * @t: the stacking driver limits (top device) + * @b: the underlying queue limits (bottom, component device) + * @start: first data sector within component device + * + * Description: + * This function is used by stacking drivers like MD and DM to ensure + * that all component devices have compatible block sizes and + * alignments. The stacking driver must provide a queue_limits + * struct (top) and then iteratively call the stacking function for + * all component (bottom) devices. The stacking function will + * attempt to combine the values and ensure proper alignment. + * + * Returns 0 if the top and bottom queue_limits are compatible. The + * top device's block sizes and alignment offsets may be adjusted to + * ensure alignment with the bottom device. If no compatible sizes + * and alignments exist, -1 is returned and the resulting top + * queue_limits will have the misaligned flag set to indicate that + * the alignment_offset is undefined. */ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, - sector_t offset) + sector_t start) { + unsigned int top, bottom, alignment, ret = 0; + t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); @@ -509,15 +530,31 @@ int blk_stack_limits(struct queue_limits t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); - t->max_phys_segments = min_not_zero(t->max_phys_segments, - b->max_phys_segments); - - t->max_hw_segments = min_not_zero(t->max_hw_segments, - b->max_hw_segments); + t->max_segments = min_not_zero(t->max_segments, b->max_segments); t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); + t->misaligned |= b->misaligned; + + alignment = queue_limit_alignment_offset(b, start); + + /* Bottom device has different alignment. Check that it is + * compatible with the current top alignment. + */ + if (t->alignment_offset != alignment) { + + top = max(t->physical_block_size, t->io_min) + + t->alignment_offset; + bottom = max(b->physical_block_size, b->io_min) + alignment; + + /* Verify that top and bottom intervals line up */ + if (max(top, bottom) % min(top, bottom)) { + t->misaligned = 1; + ret = -1; + } + } + t->logical_block_size = max(t->logical_block_size, b->logical_block_size); @@ -525,50 +562,99 @@ int blk_stack_limits(struct queue_limits b->physical_block_size); t->io_min = max(t->io_min, b->io_min); + t->io_opt = lcm(t->io_opt, b->io_opt); + t->no_cluster |= b->no_cluster; + t->discard_zeroes_data &= b->discard_zeroes_data; + + /* Physical block size a multiple of the logical block size? */ + if (t->physical_block_size & (t->logical_block_size - 1)) { + t->physical_block_size = t->logical_block_size; + t->misaligned = 1; + ret = -1; + } - /* Bottom device offset aligned? */ - if (offset && - (offset & (b->physical_block_size - 1)) != b->alignment_offset) { + /* Minimum I/O a multiple of the physical block size? */ + if (t->io_min & (t->physical_block_size - 1)) { + t->io_min = t->physical_block_size; t->misaligned = 1; - return -1; + ret = -1; } - /* If top has no alignment offset, inherit from bottom */ - if (!t->alignment_offset) - t->alignment_offset = - b->alignment_offset & (b->physical_block_size - 1); + /* Optimal I/O a multiple of the physical block size? */ + if (t->io_opt & (t->physical_block_size - 1)) { + t->io_opt = 0; + t->misaligned = 1; + ret = -1; + } - /* Top device aligned on logical block boundary? */ + /* Find lowest common alignment_offset */ + t->alignment_offset = lcm(t->alignment_offset, alignment) + % max(t->physical_block_size, t->io_min); + + /* Verify that new alignment_offset is on a logical block boundary */ if (t->alignment_offset & (t->logical_block_size - 1)) { t->misaligned = 1; - return -1; + ret = -1; } - /* Find lcm() of optimal I/O size */ - if (t->io_opt && b->io_opt) - t->io_opt = (t->io_opt * b->io_opt) / gcd(t->io_opt, b->io_opt); - else if (b->io_opt) - t->io_opt = b->io_opt; - - /* Verify that optimal I/O size is a multiple of io_min */ - if (t->io_min && t->io_opt % t->io_min) - return -1; + /* Discard alignment and granularity */ + if (b->discard_granularity) { + alignment = queue_limit_discard_alignment(b, start); - return 0; + if (t->discard_granularity != 0 && + t->discard_alignment != alignment) { + top = t->discard_granularity + t->discard_alignment; + bottom = b->discard_granularity + alignment; + + /* Verify that top and bottom intervals line up */ + if ((max(top, bottom) % min(top, bottom)) != 0) + t->discard_misaligned = 1; + } + + t->max_discard_sectors = min_not_zero(t->max_discard_sectors, + b->max_discard_sectors); + t->discard_granularity = max(t->discard_granularity, + b->discard_granularity); + t->discard_alignment = lcm(t->discard_alignment, alignment) % + t->discard_granularity; + } + + return ret; } EXPORT_SYMBOL(blk_stack_limits); /** + * bdev_stack_limits - adjust queue limits for stacked drivers + * @t: the stacking driver limits (top device) + * @bdev: the component block_device (bottom) + * @start: first data sector within component device + * + * Description: + * Merges queue limits for a top device and a block_device. Returns + * 0 if alignment didn't change. Returns -1 if adding the bottom + * device caused misalignment. + */ +int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, + sector_t start) +{ + struct request_queue *bq = bdev_get_queue(bdev); + + start += get_start_sect(bdev); + + return blk_stack_limits(t, &bq->limits, start); +} +EXPORT_SYMBOL(bdev_stack_limits); + +/** * disk_stack_limits - adjust queue limits for stacked drivers * @disk: MD/DM gendisk (top) * @bdev: the underlying block device (bottom) * @offset: offset to beginning of data within component device * * Description: - * Merges the limits for two queues. Returns 0 if alignment - * didn't change. Returns -1 if adding the bottom device caused - * misalignment. + * Merges the limits for a top level gendisk and a bottom level + * block_device. */ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset) @@ -576,9 +662,7 @@ void disk_stack_limits(struct gendisk *d struct request_queue *t = disk->queue; struct request_queue *b = bdev_get_queue(bdev); - offset += get_start_sect(bdev) << 9; - - if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) { + if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; disk_name(disk, 0, top); @@ -650,22 +734,19 @@ EXPORT_SYMBOL(blk_queue_update_dma_pad); * does is adjust the queue so that the buf is always appended * silently to the scatterlist. * - * Note: This routine adjusts max_hw_segments to make room for - * appending the drain buffer. If you call - * blk_queue_max_hw_segments() or blk_queue_max_phys_segments() after - * calling this routine, you must set the limit to one fewer than your - * device can support otherwise there won't be room for the drain - * buffer. + * Note: This routine adjusts max_hw_segments to make room for appending + * the drain buffer. If you call blk_queue_max_segments() after calling + * this routine, you must set the limit to one fewer than your device + * can support otherwise there won't be room for the drain buffer. */ int blk_queue_dma_drain(struct request_queue *q, dma_drain_needed_fn *dma_drain_needed, void *buf, unsigned int size) { - if (queue_max_hw_segments(q) < 2 || queue_max_phys_segments(q) < 2) + if (queue_max_segments(q) < 2) return -EINVAL; /* make room for appending the drain */ - blk_queue_max_hw_segments(q, queue_max_hw_segments(q) - 1); - blk_queue_max_phys_segments(q, queue_max_phys_segments(q) - 1); + blk_queue_max_segments(q, queue_max_segments(q) - 1); q->dma_drain_needed = dma_drain_needed; q->dma_drain_buffer = buf; q->dma_drain_size = size; @@ -730,6 +811,85 @@ void blk_queue_update_dma_alignment(stru } EXPORT_SYMBOL(blk_queue_update_dma_alignment); +/** + * blk_queue_flush - configure queue's cache flush capability + * @q: the request queue for the device + * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA + * + * Tell block layer cache flush capability of @q. If it supports + * flushing, REQ_FLUSH should be set. If it supports bypassing + * write cache for individual writes, REQ_FUA should be set. + */ +void blk_queue_flush(struct request_queue *q, unsigned int flush) +{ + WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA)); + + if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA))) + flush &= ~REQ_FUA; + + q->flush_flags = flush & (REQ_FLUSH | REQ_FUA); +} +EXPORT_SYMBOL_GPL(blk_queue_flush); + +/** + * blk_queue_ordered - does this queue support ordered writes + * @q: the request queue + * @ordered: one of QUEUE_ORDERED_* + * @prepare_flush_fn: rq setup helper for cache flush ordered writes + * + * Description: + * This is old method and has been retained for kABI compatibility. Any + * new driver should be using and sticking to semantics of blk_queue_flush(). + **/ +int blk_queue_ordered(struct request_queue *q, unsigned ordered, + prepare_flush_fn *prepare_flush_fn) +{ + if (!prepare_flush_fn && (ordered & (QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_POSTFLUSH))) { + printk(KERN_ERR "%s: prepare_flush_fn required\n", __func__); + return -EINVAL; + } + + /* + * Note: QUEUE_ORDERED_TAG* variants are not supported as it never + * worked reliably. + */ + + if (ordered != QUEUE_ORDERED_NONE && + ordered != QUEUE_ORDERED_DRAIN && + ordered != QUEUE_ORDERED_DRAIN_FLUSH && + ordered != QUEUE_ORDERED_DRAIN_FUA && + ordered != QUEUE_ORDERED_TAG && + ordered != QUEUE_ORDERED_TAG_FLUSH && + ordered != QUEUE_ORDERED_TAG_FUA) { + printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); + return -EINVAL; + } + + q->ordered = ordered; + q->next_ordered = ordered; + q->prepare_flush_fn = prepare_flush_fn; + + /* Map old queue values to new flush/fua capabilities */ + + if (ordered == QUEUE_ORDERED_DRAIN_FLUSH || + ordered == QUEUE_ORDERED_TAG_FLUSH) + q->flush_flags = REQ_FLUSH; + + if (ordered == QUEUE_ORDERED_DRAIN_FUA || + ordered == QUEUE_ORDERED_TAG_FUA) + q->flush_flags = (REQ_FLUSH | REQ_FUA); + + return 0; +} +EXPORT_SYMBOL(blk_queue_ordered); + +void blk_queue_flush_queueable(struct request_queue *q, bool queueable) +{ + q->flush_not_queueable = !queueable; +} +EXPORT_SYMBOL_GPL(blk_queue_flush_queueable); + static int __init blk_settings_init(void) { blk_max_low_pfn = max_low_pfn - 1; diff -uprN linux-2.6.32/block/blk-softirq.c linux-2.6.32.ovz/block/blk-softirq.c --- linux-2.6.32/block/blk-softirq.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/blk-softirq.c 2015-03-10 13:22:50.000000000 -0700 @@ -103,24 +103,35 @@ static struct notifier_block __cpuinitda void __blk_complete_request(struct request *req) { + int ccpu, cpu, group_cpu = NR_CPUS; struct request_queue *q = req->q; unsigned long flags; - int ccpu, cpu, group_cpu; BUG_ON(!q->softirq_done_fn); local_irq_save(flags); cpu = smp_processor_id(); - group_cpu = blk_cpu_to_group(cpu); /* * Select completion CPU */ - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1) + if (req->cpu != -1) { ccpu = req->cpu; - else + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) { + ccpu = blk_cpu_to_group(ccpu); + group_cpu = blk_cpu_to_group(cpu); + } + } else ccpu = cpu; + /* + * If current CPU and requested CPU ar in the same group, running + * sortirq in current CPU. One might concern this is just like + * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is + * running in interrupt handler, and currently I/O controller doesn't + * support multiple interrupts, so current CPU is unique actually. This + * avoids IPI sending from current CPU to the first CPU of a group. + */ if (ccpu == cpu || ccpu == group_cpu) { struct list_head *list; do_local: diff -uprN linux-2.6.32/block/blk-sysfs.c linux-2.6.32.ovz/block/blk-sysfs.c --- linux-2.6.32/block/blk-sysfs.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/blk-sysfs.c 2015-03-10 13:23:48.000000000 -0700 @@ -106,6 +106,19 @@ static ssize_t queue_max_sectors_show(st return queue_var_show(max_sectors_kb, (page)); } +static ssize_t queue_max_segments_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_max_segments(q), (page)); +} + +static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) +{ + if (test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags)) + return queue_var_show(queue_max_segment_size(q), (page)); + + return queue_var_show(PAGE_CACHE_SIZE, (page)); +} + static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) { return queue_var_show(queue_logical_block_size(q), page); @@ -126,6 +139,22 @@ static ssize_t queue_io_opt_show(struct return queue_var_show(queue_io_opt(q), page); } +static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.discard_granularity, page); +} + +static ssize_t queue_discard_max_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_discard_sectors << 9); +} + +static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_discard_zeroes_data(q), page); +} + static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { @@ -151,6 +180,31 @@ static ssize_t queue_max_hw_sectors_show return queue_var_show(max_hw_sectors_kb, (page)); } +static ssize_t +queue_show_unpriv_sgio(struct request_queue *q, char *page) +{ + int bit; + bit = test_bit(QUEUE_FLAG_UNPRIV_SGIO, &q->queue_flags); + return queue_var_show(bit, page); +} +static ssize_t +queue_store_unpriv_sgio(struct request_queue *q, const char *page, size_t count) +{ + unsigned long val; + ssize_t ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + ret = queue_var_store(&val, page, count); + spin_lock_irq(q->queue_lock); + if (val) + queue_flag_set(QUEUE_FLAG_UNPRIV_SGIO, q); + else + queue_flag_clear(QUEUE_FLAG_UNPRIV_SGIO, q); + spin_unlock_irq(q->queue_lock); + return ret; +} static ssize_t queue_nonrot_show(struct request_queue *q, char *page) { return queue_var_show(!blk_queue_nonrot(q), page); @@ -196,8 +250,9 @@ static ssize_t queue_nomerges_store(stru static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page) { bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags); + bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags); - return queue_var_show(set, page); + return queue_var_show(set << force, page); } static ssize_t @@ -209,15 +264,42 @@ queue_rq_affinity_store(struct request_q ret = queue_var_store(&val, page, count); spin_lock_irq(q->queue_lock); - if (val) + if (val == 2) { queue_flag_set(QUEUE_FLAG_SAME_COMP, q); - else - queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); + queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); + } else if (val == 1) { + queue_flag_set(QUEUE_FLAG_SAME_COMP, q); + queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); + } else if (val == 0) { + queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); + queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); + } spin_unlock_irq(q->queue_lock); #endif return ret; } +static ssize_t queue_random_show(struct request_queue *q, char *page) +{ + return queue_var_show(blk_queue_add_random(q), page); +} + +static ssize_t queue_random_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned long val; + ssize_t ret = queue_var_store(&val, page, count); + + spin_lock_irq(q->queue_lock); + if (val) + queue_flag_set(QUEUE_FLAG_ADD_RANDOM, q); + else + queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, q); + spin_unlock_irq(q->queue_lock); + + return ret; +} + static ssize_t queue_iostats_show(struct request_queue *q, char *page) { return queue_var_show(blk_queue_io_stat(q), page); @@ -262,6 +344,16 @@ static struct queue_sysfs_entry queue_ma .show = queue_max_hw_sectors_show, }; +static struct queue_sysfs_entry queue_max_segments_entry = { + .attr = {.name = "max_segments", .mode = S_IRUGO }, + .show = queue_max_segments_show, +}; + +static struct queue_sysfs_entry queue_max_segment_size_entry = { + .attr = {.name = "max_segment_size", .mode = S_IRUGO }, + .show = queue_max_segment_size_show, +}; + static struct queue_sysfs_entry queue_iosched_entry = { .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, .show = elv_iosched_show, @@ -293,12 +385,33 @@ static struct queue_sysfs_entry queue_io .show = queue_io_opt_show, }; +static struct queue_sysfs_entry queue_discard_granularity_entry = { + .attr = {.name = "discard_granularity", .mode = S_IRUGO }, + .show = queue_discard_granularity_show, +}; + +static struct queue_sysfs_entry queue_discard_max_entry = { + .attr = {.name = "discard_max_bytes", .mode = S_IRUGO }, + .show = queue_discard_max_show, +}; + +static struct queue_sysfs_entry queue_discard_zeroes_data_entry = { + .attr = {.name = "discard_zeroes_data", .mode = S_IRUGO }, + .show = queue_discard_zeroes_data_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, .show = queue_nonrot_show, .store = queue_nonrot_store, }; +static struct queue_sysfs_entry queue_unpriv_sgio_entry = { + .attr = {.name = "unpriv_sgio", .mode = S_IRUGO | S_IWUSR }, + .show = queue_show_unpriv_sgio, + .store = queue_store_unpriv_sgio, +}; + static struct queue_sysfs_entry queue_nomerges_entry = { .attr = {.name = "nomerges", .mode = S_IRUGO | S_IWUSR }, .show = queue_nomerges_show, @@ -317,21 +430,34 @@ static struct queue_sysfs_entry queue_io .store = queue_iostats_store, }; +static struct queue_sysfs_entry queue_random_entry = { + .attr = {.name = "add_random", .mode = S_IRUGO | S_IWUSR }, + .show = queue_random_show, + .store = queue_random_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, + &queue_max_segments_entry.attr, + &queue_max_segment_size_entry.attr, &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, &queue_physical_block_size_entry.attr, &queue_io_min_entry.attr, &queue_io_opt_entry.attr, + &queue_discard_granularity_entry.attr, + &queue_discard_max_entry.attr, + &queue_discard_zeroes_data_entry.attr, + &queue_unpriv_sgio_entry.attr, &queue_nonrot_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, + &queue_random_entry.attr, NULL, }; @@ -348,7 +474,7 @@ queue_attr_show(struct kobject *kobj, st if (!entry->show) return -EIO; mutex_lock(&q->sysfs_lock); - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { + if (blk_queue_dead(q)) { mutex_unlock(&q->sysfs_lock); return -ENOENT; } @@ -370,7 +496,7 @@ queue_attr_store(struct kobject *kobj, s q = container_of(kobj, struct request_queue, kobj); mutex_lock(&q->sysfs_lock); - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { + if (blk_queue_dead(q)) { mutex_unlock(&q->sysfs_lock); return -ENOENT; } @@ -402,19 +528,25 @@ static void blk_release_queue(struct kob blk_sync_queue(q); + if (q->elevator) + elevator_exit(q->elevator); + + blk_throtl_exit(q); + if (rl->rq_pool) mempool_destroy(rl->rq_pool); if (q->queue_tags) __blk_queue_free_tags(q); + blk_throtl_release(q); blk_trace_shutdown(q); bdi_destroy(&q->backing_dev_info); kmem_cache_free(blk_requestq_cachep, q); } -static struct sysfs_ops queue_sysfs_ops = { +static const struct sysfs_ops queue_sysfs_ops = { .show = queue_attr_show, .store = queue_attr_store, }; diff -uprN linux-2.6.32/block/blk-throttle.c linux-2.6.32.ovz/block/blk-throttle.c --- linux-2.6.32/block/blk-throttle.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/block/blk-throttle.c 2015-06-23 04:16:16.000000000 -0700 @@ -0,0 +1,1331 @@ +/* + * Interface for controlling IO bandwidth on a request queue + * + * Copyright (C) 2010 Vivek Goyal + */ + +#include +#include +#include +#include +#include +#include "blk-cgroup.h" +#include "blk.h" + +/* Max dispatch from a group in 1 round */ +static int throtl_grp_quantum = 8; + +/* Total max dispatch from all groups in one round */ +static int throtl_quantum = 32; + +/* Throttling is performed over 100ms slice and after that slice is renewed */ +static unsigned long throtl_slice = HZ/10; /* 100 ms */ + +/* A workqueue to queue throttle related work */ +static struct workqueue_struct *kthrotld_workqueue; +static void throtl_schedule_delayed_work(struct throtl_data *td, + unsigned long delay); + +struct throtl_rb_root { + struct rb_root rb; + struct rb_node *left; + unsigned int count; + unsigned long min_disptime; +}; + +#define THROTL_RB_ROOT (struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \ + .count = 0, .min_disptime = 0} + +#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) + +struct throtl_grp { + /* List of throtl groups on the request queue*/ + struct hlist_node tg_node; + + /* active throtl group service_tree member */ + struct rb_node rb_node; + + /* + * Dispatch time in jiffies. This is the estimated time when group + * will unthrottle and is ready to dispatch more bio. It is used as + * key to sort active groups in service tree. + */ + unsigned long disptime; + + struct blkio_group blkg; + atomic_t ref; + unsigned int flags; + + /* Two lists for READ and WRITE */ + struct bio_list bio_lists[2]; + + /* Number of queued bios on READ and WRITE lists */ + unsigned int nr_queued[2]; + + /* bytes per second rate limits */ + uint64_t bps[2]; + + /* IOPS limits */ + unsigned int iops[2]; + + /* Number of bytes disptached in current slice */ + uint64_t bytes_disp[2]; + /* Number of bio's dispatched in current slice */ + unsigned int io_disp[2]; + + /* When did we start a new slice */ + unsigned long slice_start[2]; + unsigned long slice_end[2]; + + /* Some throttle limits got updated for the group */ + int limits_changed; + + struct rcu_head rcu_head; +}; + +struct throtl_data +{ + /* List of throtl groups */ + struct hlist_head tg_list; + + /* service tree for active throtl groups */ + struct throtl_rb_root tg_service_tree; + + struct throtl_grp *root_tg; + struct request_queue *queue; + + /* Total Number of queued bios on READ and WRITE lists */ + unsigned int nr_queued[2]; + + /* + * number of total undestroyed groups + */ + unsigned int nr_undestroyed_grps; + + /* Work for dispatching throttled bios */ + struct delayed_work throtl_work; + + int limits_changed; +}; + +enum tg_state_flags { + THROTL_TG_FLAG_on_rr = 0, /* on round-robin busy list */ +}; + +#define THROTL_TG_FNS(name) \ +static inline void throtl_mark_tg_##name(struct throtl_grp *tg) \ +{ \ + (tg)->flags |= (1 << THROTL_TG_FLAG_##name); \ +} \ +static inline void throtl_clear_tg_##name(struct throtl_grp *tg) \ +{ \ + (tg)->flags &= ~(1 << THROTL_TG_FLAG_##name); \ +} \ +static inline int throtl_tg_##name(const struct throtl_grp *tg) \ +{ \ + return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0; \ +} + +THROTL_TG_FNS(on_rr); + +#define throtl_log_tg(td, tg, fmt, args...) \ + blk_add_trace_msg((td)->queue, "throtl %s " fmt, \ + blkg_path(&(tg)->blkg), ##args); \ + +#define throtl_log(td, fmt, args...) \ + blk_add_trace_msg((td)->queue, "throtl " fmt, ##args) + +static inline struct throtl_grp *tg_of_blkg(struct blkio_group *blkg) +{ + if (blkg) + return container_of(blkg, struct throtl_grp, blkg); + + return NULL; +} + +static inline int total_nr_queued(struct throtl_data *td) +{ + return (td->nr_queued[0] + td->nr_queued[1]); +} + +static inline struct throtl_grp *throtl_ref_get_tg(struct throtl_grp *tg) +{ + atomic_inc(&tg->ref); + return tg; +} + +static void throtl_free_tg(struct rcu_head *head) +{ + struct throtl_grp *tg; + + tg = container_of(head, struct throtl_grp, rcu_head); + blkio_free_blkg_stats(&tg->blkg); + kfree(tg); +} + +static void throtl_put_tg(struct throtl_grp *tg) +{ + BUG_ON(atomic_read(&tg->ref) <= 0); + if (!atomic_dec_and_test(&tg->ref)) + return; + + /* + * A group is freed in rcu manner. But having an rcu lock does not + * mean that one can access all the fields of blkg and assume these + * are valid. For example, don't try to follow throtl_data and + * request queue links. + * + * Having a reference to blkg under an rcu allows acess to only + * values local to groups like group stats and group rate limits + */ + call_rcu(&tg->rcu_head, throtl_free_tg); +} + +static void throtl_init_group(struct throtl_grp *tg) +{ + INIT_HLIST_NODE(&tg->tg_node); + RB_CLEAR_NODE(&tg->rb_node); + bio_list_init(&tg->bio_lists[0]); + bio_list_init(&tg->bio_lists[1]); + tg->limits_changed = false; + + /* Practically unlimited BW */ + tg->bps[0] = tg->bps[1] = -1; + tg->iops[0] = tg->iops[1] = -1; + + /* + * Take the initial reference that will be released on destroy + * This can be thought of a joint reference by cgroup and + * request queue which will be dropped by either request queue + * exit or cgroup deletion path depending on who is exiting first. + */ + atomic_set(&tg->ref, 1); +} + +/* Should be called with rcu read lock held (needed for blkcg) */ +static void +throtl_add_group_to_td_list(struct throtl_data *td, struct throtl_grp *tg) +{ + hlist_add_head(&tg->tg_node, &td->tg_list); + td->nr_undestroyed_grps++; +} + +static void +__throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) +{ + struct backing_dev_info *bdi = &td->queue->backing_dev_info; + unsigned int major, minor; + + if (!tg || tg->blkg.dev) + return; + + /* + * Fill in device details for a group which might not have been + * filled at group creation time as queue was being instantiated + * and driver had not attached a device yet + */ + if (bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + tg->blkg.dev = MKDEV(major, minor); + } +} + +/* + * Should be called with without queue lock held. Here queue lock will be + * taken rarely. It will be taken only once during life time of a group + * if need be + */ +static void +throtl_tg_fill_dev_details(struct throtl_data *td, struct throtl_grp *tg) +{ + if (!tg || tg->blkg.dev) + return; + + spin_lock_irq(td->queue->queue_lock); + __throtl_tg_fill_dev_details(td, tg); + spin_unlock_irq(td->queue->queue_lock); +} + +static void throtl_init_add_tg_lists(struct throtl_data *td, + struct throtl_grp *tg, struct blkio_cgroup *blkcg) +{ + __throtl_tg_fill_dev_details(td, tg); + + /* Add group onto cgroup list */ + blkiocg_add_blkio_group(blkcg, &tg->blkg, (void *)td, + tg->blkg.dev, BLKIO_POLICY_THROTL); + + tg->bps[READ] = blkcg_get_read_bps(blkcg, tg->blkg.dev); + tg->bps[WRITE] = blkcg_get_write_bps(blkcg, tg->blkg.dev); + tg->iops[READ] = blkcg_get_read_iops(blkcg, tg->blkg.dev); + tg->iops[WRITE] = blkcg_get_write_iops(blkcg, tg->blkg.dev); + + throtl_add_group_to_td_list(td, tg); +} + +/* Should be called without queue lock and outside of rcu period */ +static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td) +{ + struct throtl_grp *tg = NULL; + int ret; + + tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node); + if (!tg) + return NULL; + + ret = blkio_alloc_blkg_stats(&tg->blkg); + + if (ret) { + kfree(tg); + return NULL; + } + + throtl_init_group(tg); + return tg; +} + +static struct +throtl_grp *throtl_find_tg(struct throtl_data *td, struct blkio_cgroup *blkcg) +{ + struct throtl_grp *tg = NULL; + void *key = td; + + /* + * This is the common case when there are no blkio cgroups. + * Avoid lookup in this case + */ + if (blkcg == &blkio_root_cgroup) + tg = td->root_tg; + else + tg = tg_of_blkg(blkiocg_lookup_group(blkcg, key)); + + __throtl_tg_fill_dev_details(td, tg); + return tg; +} + +static struct throtl_grp * throtl_get_tg(struct throtl_data *td) +{ + struct throtl_grp *tg = NULL, *__tg = NULL; + struct blkio_cgroup *blkcg; + struct request_queue *q = td->queue; + + /* no throttling for dead queue */ + if (unlikely(blk_queue_dead(q))) + return NULL; + + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + tg = throtl_find_tg(td, blkcg); + if (tg) { + rcu_read_unlock(); + return tg; + } + + /* + * Need to allocate a group. Allocation of group also needs allocation + * of per cpu stats which in-turn takes a mutex() and can block. Hence + * we need to drop rcu lock and queue_lock before we call alloc. + */ + rcu_read_unlock(); + spin_unlock_irq(q->queue_lock); + + tg = throtl_alloc_tg(td); + + /* Group allocated and queue is still alive. take the lock */ + spin_lock_irq(q->queue_lock); + + /* Make sure @q is still alive */ + if (unlikely(blk_queue_dead(q))) { + throtl_free_tg(&tg->rcu_head); + return NULL; + } + + /* + * Initialize the new group. After sleeping, read the blkcg again. + */ + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + + /* + * If some other thread already allocated the group while we were + * not holding queue lock, free up the group + */ + __tg = throtl_find_tg(td, blkcg); + + if (__tg) { + throtl_free_tg(&tg->rcu_head); + rcu_read_unlock(); + return __tg; + } + + /* Group allocation failed. Account the IO to root group */ + if (!tg) { + tg = td->root_tg; + return tg; + } + + throtl_init_add_tg_lists(td, tg, blkcg); + rcu_read_unlock(); + return tg; +} + +static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root) +{ + /* Service tree is empty */ + if (!root->count) + return NULL; + + if (!root->left) + root->left = rb_first(&root->rb); + + if (root->left) + return rb_entry_tg(root->left); + + return NULL; +} + +static void rb_erase_init(struct rb_node *n, struct rb_root *root) +{ + rb_erase(n, root); + RB_CLEAR_NODE(n); +} + +static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root) +{ + if (root->left == n) + root->left = NULL; + rb_erase_init(n, &root->rb); + --root->count; +} + +static void update_min_dispatch_time(struct throtl_rb_root *st) +{ + struct throtl_grp *tg; + + tg = throtl_rb_first(st); + if (!tg) + return; + + st->min_disptime = tg->disptime; +} + +static void +tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg) +{ + struct rb_node **node = &st->rb.rb_node; + struct rb_node *parent = NULL; + struct throtl_grp *__tg; + unsigned long key = tg->disptime; + int left = 1; + + while (*node != NULL) { + parent = *node; + __tg = rb_entry_tg(parent); + + if (time_before(key, __tg->disptime)) + node = &parent->rb_left; + else { + node = &parent->rb_right; + left = 0; + } + } + + if (left) + st->left = &tg->rb_node; + + rb_link_node(&tg->rb_node, parent, node); + rb_insert_color(&tg->rb_node, &st->rb); +} + +static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + struct throtl_rb_root *st = &td->tg_service_tree; + + tg_service_tree_add(st, tg); + throtl_mark_tg_on_rr(tg); + st->count++; +} + +static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + if (!throtl_tg_on_rr(tg)) + __throtl_enqueue_tg(td, tg); +} + +static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + throtl_rb_erase(&tg->rb_node, &td->tg_service_tree); + throtl_clear_tg_on_rr(tg); +} + +static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + if (throtl_tg_on_rr(tg)) + __throtl_dequeue_tg(td, tg); +} + +static void throtl_schedule_next_dispatch(struct throtl_data *td) +{ + struct throtl_rb_root *st = &td->tg_service_tree; + + /* + * If there are more bios pending, schedule more work. + */ + if (!total_nr_queued(td)) + return; + + BUG_ON(!st->count); + + update_min_dispatch_time(st); + + if (time_before_eq(st->min_disptime, jiffies)) + throtl_schedule_delayed_work(td, 0); + else + throtl_schedule_delayed_work(td, (st->min_disptime - jiffies)); +} + +static inline void +throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; + tg->slice_start[rw] = jiffies; + tg->slice_end[rw] = jiffies + throtl_slice; + throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); +} + +static inline void throtl_set_slice_end(struct throtl_data *td, + struct throtl_grp *tg, bool rw, unsigned long jiffy_end) +{ + tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); +} + +static inline void throtl_extend_slice(struct throtl_data *td, + struct throtl_grp *tg, bool rw, unsigned long jiffy_end) +{ + tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); + throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); +} + +/* Determine if previously allocated or extended slice is complete or not */ +static bool +throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) + return 0; + + return 1; +} + +/* Trim the used slices and adjust slice start accordingly */ +static inline void +throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw) +{ + unsigned long nr_slices, time_elapsed, io_trim; + u64 bytes_trim, tmp; + + BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); + + /* + * If bps are unlimited (-1), then time slice don't get + * renewed. Don't try to trim the slice if slice is used. A new + * slice will start when appropriate. + */ + if (throtl_slice_used(td, tg, rw)) + return; + + /* + * A bio has been dispatched. Also adjust slice_end. It might happen + * that initially cgroup limit was very low resulting in high + * slice_end, but later limit was bumped up and bio was dispached + * sooner, then we need to reduce slice_end. A high bogus slice_end + * is bad because it does not allow new slice to start. + */ + + throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice); + + time_elapsed = jiffies - tg->slice_start[rw]; + + nr_slices = time_elapsed / throtl_slice; + + if (!nr_slices) + return; + tmp = tg->bps[rw] * throtl_slice * nr_slices; + do_div(tmp, HZ); + bytes_trim = tmp; + + io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; + + if (!bytes_trim && !io_trim) + return; + + if (tg->bytes_disp[rw] >= bytes_trim) + tg->bytes_disp[rw] -= bytes_trim; + else + tg->bytes_disp[rw] = 0; + + if (tg->io_disp[rw] >= io_trim) + tg->io_disp[rw] -= io_trim; + else + tg->io_disp[rw] = 0; + + tg->slice_start[rw] += nr_slices * throtl_slice; + + throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu" + " start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, + tg->slice_start[rw], tg->slice_end[rw], jiffies); +} + +static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + unsigned int io_allowed; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + u64 tmp; + + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; + + /* Slice has just started. Consider one slice interval */ + if (!jiffy_elapsed) + jiffy_elapsed_rnd = throtl_slice; + + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + + /* + * jiffy_elapsed_rnd should not be a big value as minimum iops can be + * 1 then at max jiffy elapsed should be equivalent of 1 second as we + * will allow dispatch after 1 second and after that slice should + * have been trimmed. + */ + + tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; + do_div(tmp, HZ); + + if (tmp > UINT_MAX) + io_allowed = UINT_MAX; + else + io_allowed = tmp; + + if (tg->io_disp[rw] + 1 <= io_allowed) { + if (wait) + *wait = 0; + return 1; + } + + /* Calc approx time to dispatch */ + jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; + + if (jiffy_wait > jiffy_elapsed) + jiffy_wait = jiffy_wait - jiffy_elapsed; + else + jiffy_wait = 1; + + if (wait) + *wait = jiffy_wait; + return 0; +} + +static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + u64 bytes_allowed, extra_bytes, tmp; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; + + /* Slice has just started. Consider one slice interval */ + if (!jiffy_elapsed) + jiffy_elapsed_rnd = throtl_slice; + + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); + + tmp = tg->bps[rw] * jiffy_elapsed_rnd; + do_div(tmp, HZ); + bytes_allowed = tmp; + + if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) { + if (wait) + *wait = 0; + return 1; + } + + /* Calc approx time to dispatch */ + extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed; + jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); + + if (!jiffy_wait) + jiffy_wait = 1; + + /* + * This wait time is without taking into consideration the rounding + * up we did. Add that time also. + */ + jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); + if (wait) + *wait = jiffy_wait; + return 0; +} + +static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) { + if (tg->bps[rw] == -1 && tg->iops[rw] == -1) + return 1; + return 0; +} + +/* + * Returns whether one can dispatch a bio or not. Also returns approx number + * of jiffies to wait before this bio is with-in IO rate and can be dispatched + */ +static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; + + /* + * Currently whole state machine of group depends on first bio + * queued in the group bio list. So one should not be calling + * this function with a different bio if there are other bios + * queued. + */ + BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw])); + + /* If tg->bps = -1, then BW is unlimited */ + if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { + if (wait) + *wait = 0; + return 1; + } + + /* + * If previous slice expired, start a new one otherwise renew/extend + * existing slice to make sure it is at least throtl_slice interval + * long since now. + */ + if (throtl_slice_used(td, tg, rw)) + throtl_start_new_slice(td, tg, rw); + else { + if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) + throtl_extend_slice(td, tg, rw, jiffies + throtl_slice); + } + + if (tg_with_in_bps_limit(td, tg, bio, &bps_wait) + && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) { + if (wait) + *wait = 0; + return 1; + } + + max_wait = max(bps_wait, iops_wait); + + if (wait) + *wait = max_wait; + + if (time_before(tg->slice_end[rw], jiffies + max_wait)) + throtl_extend_slice(td, tg, rw, jiffies + max_wait); + + return 0; +} + +static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) +{ + bool rw = bio_data_dir(bio); + bool sync = bio->bi_rw & REQ_SYNC; + + /* Charge the bio to the group */ + tg->bytes_disp[rw] += bio->bi_size; + tg->io_disp[rw]++; + + blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, rw, sync); +} + +static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg, + struct bio *bio) +{ + bool rw = bio_data_dir(bio); + + bio_list_add(&tg->bio_lists[rw], bio); + /* Take a bio reference on tg */ + throtl_ref_get_tg(tg); + tg->nr_queued[rw]++; + td->nr_queued[rw]++; + throtl_enqueue_tg(td, tg); +} + +static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg) +{ + unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; + struct bio *bio; + + if ((bio = bio_list_peek(&tg->bio_lists[READ]))) + tg_may_dispatch(td, tg, bio, &read_wait); + + if ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) + tg_may_dispatch(td, tg, bio, &write_wait); + + min_wait = min(read_wait, write_wait); + disptime = jiffies + min_wait; + + /* Update dispatch time */ + throtl_dequeue_tg(td, tg); + tg->disptime = disptime; + throtl_enqueue_tg(td, tg); +} + +static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg, + bool rw, struct bio_list *bl) +{ + struct bio *bio; + + bio = bio_list_pop(&tg->bio_lists[rw]); + tg->nr_queued[rw]--; + /* Drop bio reference on tg */ + throtl_put_tg(tg); + + BUG_ON(td->nr_queued[rw] <= 0); + td->nr_queued[rw]--; + + throtl_charge_bio(tg, bio); + bio_list_add(bl, bio); + bio->bi_rw |= (1 << BIO_RW_THROTTLED); + + throtl_trim_slice(td, tg, rw); +} + +static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg, + struct bio_list *bl) +{ + unsigned int nr_reads = 0, nr_writes = 0; + unsigned int max_nr_reads = throtl_grp_quantum*3/4; + unsigned int max_nr_writes = throtl_grp_quantum - nr_reads; + struct bio *bio; + + /* Try to dispatch 75% READS and 25% WRITES */ + + while ((bio = bio_list_peek(&tg->bio_lists[READ])) + && tg_may_dispatch(td, tg, bio, NULL)) { + + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + nr_reads++; + + if (nr_reads >= max_nr_reads) + break; + } + + while ((bio = bio_list_peek(&tg->bio_lists[WRITE])) + && tg_may_dispatch(td, tg, bio, NULL)) { + + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl); + nr_writes++; + + if (nr_writes >= max_nr_writes) + break; + } + + return nr_reads + nr_writes; +} + +static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl) +{ + unsigned int nr_disp = 0; + struct throtl_grp *tg; + struct throtl_rb_root *st = &td->tg_service_tree; + + while (1) { + tg = throtl_rb_first(st); + + if (!tg) + break; + + if (time_before(jiffies, tg->disptime)) + break; + + throtl_dequeue_tg(td, tg); + + nr_disp += throtl_dispatch_tg(td, tg, bl); + + if (tg->nr_queued[0] || tg->nr_queued[1]) { + tg_update_disptime(td, tg); + throtl_enqueue_tg(td, tg); + } + + if (nr_disp >= throtl_quantum) + break; + } + + return nr_disp; +} + +static void throtl_process_limit_change(struct throtl_data *td) +{ + struct throtl_grp *tg; + struct hlist_node *pos, *n; + int ret; + + if (!td->limits_changed) + return; + + ret = xchg(&td->limits_changed, false); + + throtl_log(td, "limits changed"); + + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { + if (!tg->limits_changed) + continue; + + if (!xchg(&tg->limits_changed, false)) + continue; + + throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu" + " riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE], + tg->iops[READ], tg->iops[WRITE]); + + /* + * Restart the slices for both READ and WRITES. It + * might happen that a group's limit are dropped + * suddenly and we don't want to account recently + * dispatched IO with new low rate + */ + throtl_start_new_slice(td, tg, 0); + throtl_start_new_slice(td, tg, 1); + + if (throtl_tg_on_rr(tg)) + tg_update_disptime(td, tg); + } +} + +/* Dispatch throttled bios. Should be called without queue lock held. */ +static int throtl_dispatch(struct request_queue *q) +{ + struct throtl_data *td = q->td; + unsigned int nr_disp = 0; + struct bio_list bio_list_on_stack; + struct bio *bio; + + spin_lock_irq(q->queue_lock); + + throtl_process_limit_change(td); + + if (!total_nr_queued(td)) + goto out; + + bio_list_init(&bio_list_on_stack); + + throtl_log(td, "dispatch nr_queued=%lu read=%u write=%u", + total_nr_queued(td), td->nr_queued[READ], + td->nr_queued[WRITE]); + + nr_disp = throtl_select_dispatch(td, &bio_list_on_stack); + + if (nr_disp) + throtl_log(td, "bios disp=%u", nr_disp); + + throtl_schedule_next_dispatch(td); +out: + spin_unlock_irq(q->queue_lock); + + /* + * If we dispatched some requests, unplug the queue to make sure + * immediate dispatch + */ + if (nr_disp) { + while((bio = bio_list_pop(&bio_list_on_stack))) + generic_make_request(bio); + blk_unplug(q); + } + return nr_disp; +} + +void blk_throtl_work(struct work_struct *work) +{ + struct throtl_data *td = container_of(work, struct throtl_data, + throtl_work.work); + struct request_queue *q = td->queue; + + throtl_dispatch(q); +} + +/* Call with queue lock held */ +static void +throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay) +{ + + struct delayed_work *dwork = &td->throtl_work; + + /* schedule work if limits changed even if no bio is queued */ + if (total_nr_queued(td) > 0 || td->limits_changed) { + /* + * We might have a work scheduled to be executed in future. + * Cancel that and schedule a new one. + */ + __cancel_delayed_work(dwork); + queue_delayed_work(kthrotld_workqueue, dwork, delay); + throtl_log(td, "schedule work. delay=%lu jiffies=%lu", + delay, jiffies); + } +} + +static void +throtl_destroy_tg(struct throtl_data *td, struct throtl_grp *tg) +{ + /* Something wrong if we are trying to remove same group twice */ + BUG_ON(hlist_unhashed(&tg->tg_node)); + + hlist_del_init(&tg->tg_node); + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + throtl_put_tg(tg); + td->nr_undestroyed_grps--; +} + +static void throtl_release_tgs(struct throtl_data *td) +{ + struct hlist_node *pos, *n; + struct throtl_grp *tg; + + hlist_for_each_entry_safe(tg, pos, n, &td->tg_list, tg_node) { + /* + * If cgroup removal path got to blk_group first and removed + * it from cgroup list, then it will take care of destroying + * cfqg also. + */ + if (!blkiocg_del_blkio_group(&tg->blkg)) + throtl_destroy_tg(td, tg); + } +} + +/* + * Blk cgroup controller notification saying that blkio_group object is being + * delinked as associated cgroup object is going away. That also means that + * no new IO will come in this group. So get rid of this group as soon as + * any pending IO in the group is finished. + * + * This function is called under rcu_read_lock(). key is the rcu protected + * pointer. That means "key" is a valid throtl_data pointer as long as we are + * rcu read lock. + * + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means + * it should not be NULL as even if queue was going away, cgroup deltion + * path got to it first. + */ +void throtl_unlink_blkio_group(void *key, struct blkio_group *blkg) +{ + unsigned long flags; + struct throtl_data *td = key; + + spin_lock_irqsave(td->queue->queue_lock, flags); + throtl_destroy_tg(td, tg_of_blkg(blkg)); + spin_unlock_irqrestore(td->queue->queue_lock, flags); +} + +static void throtl_update_blkio_group_common(struct throtl_data *td, + struct throtl_grp *tg) +{ + int ret; + + ret = xchg(&tg->limits_changed, true); + ret = xchg(&td->limits_changed, true); + /* Schedule a work now to process the limit change */ + throtl_schedule_delayed_work(td, 0); +} + +/* + * For all update functions, key should be a valid pointer because these + * update functions are called under blkcg_lock, that means, blkg is + * valid and in turn key is valid. queue exit path can not race becuase + * of blkcg_lock + * + * Can not take queue lock in update functions as queue lock under blkcg_lock + * is not allowed. Under other paths we take blkcg_lock under queue_lock. + */ +static void throtl_update_blkio_group_read_bps(void *key, + struct blkio_group *blkg, u64 read_bps) +{ + struct throtl_data *td = key; + struct throtl_grp *tg = tg_of_blkg(blkg); + + tg->bps[READ] = read_bps; + throtl_update_blkio_group_common(td, tg); +} + +static void throtl_update_blkio_group_write_bps(void *key, + struct blkio_group *blkg, u64 write_bps) +{ + struct throtl_data *td = key; + struct throtl_grp *tg = tg_of_blkg(blkg); + + tg->bps[WRITE] = write_bps; + throtl_update_blkio_group_common(td, tg); +} + +static void throtl_update_blkio_group_read_iops(void *key, + struct blkio_group *blkg, unsigned int read_iops) +{ + struct throtl_data *td = key; + struct throtl_grp *tg = tg_of_blkg(blkg); + + tg->iops[READ] = read_iops; + throtl_update_blkio_group_common(td, tg); +} + +static void throtl_update_blkio_group_write_iops(void *key, + struct blkio_group *blkg, unsigned int write_iops) +{ + struct throtl_data *td = key; + struct throtl_grp *tg = tg_of_blkg(blkg); + + tg->iops[WRITE] = write_iops; + throtl_update_blkio_group_common(td, tg); +} + +static void throtl_shutdown_wq(struct request_queue *q) +{ + struct throtl_data *td = q->td; + + cancel_delayed_work_sync(&td->throtl_work); +} + +static struct blkio_policy_type blkio_policy_throtl = { + .ops = { + .blkio_unlink_group_fn = throtl_unlink_blkio_group, + .blkio_update_group_read_bps_fn = + throtl_update_blkio_group_read_bps, + .blkio_update_group_write_bps_fn = + throtl_update_blkio_group_write_bps, + .blkio_update_group_read_iops_fn = + throtl_update_blkio_group_read_iops, + .blkio_update_group_write_iops_fn = + throtl_update_blkio_group_write_iops, + }, + .plid = BLKIO_POLICY_THROTL, +}; + +bool blk_throtl_bio(struct request_queue *q, struct bio *bio) +{ + struct throtl_data *td = q->td; + struct throtl_grp *tg; + bool rw = bio_data_dir(bio), update_disptime = true; + struct blkio_cgroup *blkcg; + bool throttled = false; + + if (bio_rw_flagged(bio, BIO_RW_THROTTLED)) { + bio->bi_rw &= ~(1 << BIO_RW_THROTTLED); + goto out; + } + + /* + * A throtl_grp pointer retrieved under rcu can be used to access + * basic fields like stats and io rates. If a group has no rules, + * just update the dispatch stats in lockless manner and return. + */ + + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + tg = throtl_find_tg(td, blkcg); + if (tg) { + throtl_tg_fill_dev_details(td, tg); + + if (tg_no_rule_group(tg, rw)) { + blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size, + rw, bio->bi_rw & REQ_SYNC); + rcu_read_unlock(); + goto out; + } + } + rcu_read_unlock(); + + /* + * Either group has not been allocated yet or it is not an unlimited + * IO group + */ + spin_lock_irq(q->queue_lock); + tg = throtl_get_tg(td); + if (unlikely(!tg)) + goto out_unlock; + + if (tg->nr_queued[rw]) { + /* + * There is already another bio queued in same dir. No + * need to update dispatch time. + */ + update_disptime = false; + goto queue_bio; + + } + + /* Bio is with-in rate limit of group */ + if (tg_may_dispatch(td, tg, bio, NULL)) { + throtl_charge_bio(tg, bio); + + /* + * We need to trim slice even when bios are not being queued + * otherwise it might happen that a bio is not queued for + * a long time and slice keeps on extending and trim is not + * called for a long time. Now if limits are reduced suddenly + * we take into account all the IO dispatched so far at new + * low rate and * newly queued IO gets a really long dispatch + * time. + * + * So keep on trimming slice even if bio is not queued. + */ + throtl_trim_slice(td, tg, rw); + goto out_unlock; + } + +queue_bio: + throtl_log_tg(td, tg, "[%c] bio. bdisp=%u sz=%u bps=%llu" + " iodisp=%u iops=%u queued=%d/%d", + rw == READ ? 'R' : 'W', + tg->bytes_disp[rw], bio->bi_size, tg->bps[rw], + tg->io_disp[rw], tg->iops[rw], + tg->nr_queued[READ], tg->nr_queued[WRITE]); + + throtl_add_bio_tg(q->td, tg, bio); + throttled = true; + + if (update_disptime) { + tg_update_disptime(td, tg); + throtl_schedule_next_dispatch(td); + } + +out_unlock: + spin_unlock_irq(q->queue_lock); +out: + return throttled; +} + +/** + * blk_throtl_drain - drain throttled bios + * @q: request_queue to drain throttled bios for + * + * Dispatch all currently throttled bios on @q through ->make_request_fn(). + */ +void blk_throtl_drain(struct request_queue *q) + __releases(q->queue_lock) __acquires(q->queue_lock) +{ + struct throtl_data *td = q->td; + struct throtl_rb_root *st = &td->tg_service_tree; + struct throtl_grp *tg; + struct bio_list bl; + struct bio *bio; + + WARN_ON_ONCE(!queue_is_locked(q)); + + bio_list_init(&bl); + + while ((tg = throtl_rb_first(st))) { + throtl_dequeue_tg(td, tg); + + while ((bio = bio_list_peek(&tg->bio_lists[READ]))) + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl); + while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))) + tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl); + } + spin_unlock_irq(q->queue_lock); + + while ((bio = bio_list_pop(&bl))) + generic_make_request(bio); + + spin_lock_irq(q->queue_lock); +} + +int blk_throtl_init(struct request_queue *q) +{ + struct throtl_data *td; + struct throtl_grp *tg; + + td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); + if (!td) + return -ENOMEM; + + INIT_HLIST_HEAD(&td->tg_list); + td->tg_service_tree = THROTL_RB_ROOT; + td->limits_changed = false; + INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work); + + /* alloc and Init root group. */ + td->queue = q; + tg = throtl_alloc_tg(td); + + if (!tg) { + kfree(td); + return -ENOMEM; + } + + td->root_tg = tg; + + rcu_read_lock(); + throtl_init_add_tg_lists(td, tg, &blkio_root_cgroup); + rcu_read_unlock(); + + /* Attach throtl data to request queue */ + q->td = td; + return 0; +} + +void blk_throtl_exit(struct request_queue *q) +{ + struct throtl_data *td = q->td; + bool wait = false; + + BUG_ON(!td); + + throtl_shutdown_wq(q); + + spin_lock_irq(q->queue_lock); + throtl_release_tgs(td); + + /* If there are other groups */ + if (td->nr_undestroyed_grps > 0) + wait = true; + + spin_unlock_irq(q->queue_lock); + + /* + * Wait for tg->blkg->key accessors to exit their grace periods. + * Do this wait only if there are other undestroyed groups out + * there (other than root group). This can happen if cgroup deletion + * path claimed the responsibility of cleaning up a group before + * queue cleanup code get to the group. + * + * Do not call synchronize_rcu() unconditionally as there are drivers + * which create/delete request queue hundreds of times during scan/boot + * and synchronize_rcu() can take significant time and slow down boot. + */ + if (wait) + synchronize_rcu(); + + /* + * Just being safe to make sure after previous flush if some body did + * update limits through cgroup and another work got queued, cancel + * it. + */ + throtl_shutdown_wq(q); +} + +void blk_throtl_release(struct request_queue *q) +{ + kfree(q->td); +} + +static int __init throtl_init(void) +{ + kthrotld_workqueue = create_workqueue("kthrotld"); + if (!kthrotld_workqueue) + panic("Failed to create kthrotld\n"); + + blkio_policy_register(&blkio_policy_throtl); + return 0; +} + +module_init(throtl_init); diff -uprN linux-2.6.32/block/blk-timeout.c linux-2.6.32.ovz/block/blk-timeout.c --- linux-2.6.32/block/blk-timeout.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/blk-timeout.c 2015-03-10 13:24:04.000000000 -0700 @@ -87,8 +87,8 @@ static void blk_rq_timed_out(struct requ __blk_complete_request(req); break; case BLK_EH_RESET_TIMER: - blk_clear_rq_complete(req); blk_add_timer(req); + blk_clear_rq_complete(req); break; case BLK_EH_NOT_HANDLED: /* @@ -109,6 +109,7 @@ void blk_rq_timed_out_timer(unsigned lon struct request_queue *q = (struct request_queue *) data; unsigned long flags, next = 0; struct request *rq, *tmp; + int next_set = 0; spin_lock_irqsave(q->queue_lock, flags); @@ -122,16 +123,13 @@ void blk_rq_timed_out_timer(unsigned lon if (blk_mark_rq_complete(rq)) continue; blk_rq_timed_out(rq); - } else if (!next || time_after(next, rq->deadline)) + } else if (!next_set || time_after(next, rq->deadline)) { next = rq->deadline; + next_set = 1; + } } - /* - * next can never be 0 here with the list non-empty, since we always - * bump ->deadline to 1 so we can detect if the timer was ever added - * or not. See comment in blk_add_timer() - */ - if (next) + if (next_set) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); @@ -172,7 +170,6 @@ void blk_add_timer(struct request *req) return; BUG_ON(!list_empty(&req->timeout_list)); - BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); /* * Some LLDs, like scsi, peek at the timeout to prevent a diff -uprN linux-2.6.32/block/bsg.c linux-2.6.32.ovz/block/bsg.c --- linux-2.6.32/block/bsg.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/bsg.c 2015-03-10 13:24:13.000000000 -0700 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -186,7 +187,7 @@ static int blk_fill_sgv4_hdr_rq(struct r return -EFAULT; if (hdr->subprotocol == BSG_SUB_PROTOCOL_SCSI_CMD) { - if (blk_verify_command(rq->cmd, has_write_perm)) + if (blk_verify_command(q, rq->cmd, has_write_perm)) return -EPERM; } else if (!capable(CAP_SYS_RAWIO)) return -EPERM; @@ -197,7 +198,7 @@ static int blk_fill_sgv4_hdr_rq(struct r rq->cmd_len = hdr->request_len; rq->cmd_type = REQ_TYPE_BLOCK_PC; - rq->timeout = (hdr->timeout * HZ) / 1000; + rq->timeout = msecs_to_jiffies(hdr->timeout); if (!rq->timeout) rq->timeout = q->sg_timeout; if (!rq->timeout) @@ -249,6 +250,14 @@ bsg_map_hdr(struct bsg_device *bd, struc int ret, rw; unsigned int dxfer_len; void *dxferp = NULL; + struct bsg_class_device *bcd = &q->bsg_dev; + + /* if the LLD has been removed then the bsg_unregister_queue will + * eventually be called and the class_dev was freed, so we can no + * longer use this request_queue. Return no such address. + */ + if (!bcd->class_dev) + return ERR_PTR(-ENXIO); dprintk("map hdr %llx/%u %llx/%u\n", (unsigned long long) hdr->dout_xferp, hdr->dout_xfer_len, (unsigned long long) hdr->din_xferp, @@ -263,7 +272,7 @@ bsg_map_hdr(struct bsg_device *bd, struc */ rq = blk_get_request(q, rw, GFP_KERNEL); if (!rq) - return ERR_PTR(-ENOMEM); + return ERR_PTR(-ENODEV); ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm); if (ret) goto out; @@ -276,7 +285,7 @@ bsg_map_hdr(struct bsg_device *bd, struc next_rq = blk_get_request(q, READ, GFP_KERNEL); if (!next_rq) { - ret = -ENOMEM; + ret = -ENODEV; goto out; } rq->next_rq = next_rq; @@ -760,12 +769,10 @@ static struct bsg_device *bsg_add_device struct file *file) { struct bsg_device *bd; - int ret; #ifdef BSG_DEBUG unsigned char buf[32]; #endif - ret = blk_get_queue(rq); - if (ret) + if (!blk_get_request_queue(rq)) return ERR_PTR(-ENXIO); bd = bsg_alloc_device(); diff -uprN linux-2.6.32/block/bsg-lib.c linux-2.6.32.ovz/block/bsg-lib.c --- linux-2.6.32/block/bsg-lib.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/block/bsg-lib.c 2015-03-10 13:22:36.000000000 -0700 @@ -0,0 +1,296 @@ +/* + * BSG helper library + * + * Copyright (C) 2008 James Smart, Emulex Corporation + * Copyright (C) 2011 Red Hat, Inc. All rights reserved. + * Copyright (C) 2011 Mike Christie + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#include +#include +#include +#include +#include +#include + +/** + * bsg_destroy_job - routine to teardown/delete a bsg job + * @job: bsg_job that is to be torn down + */ +static void bsg_destroy_job(struct bsg_job *job) +{ + put_device(job->dev); /* release reference for the request */ + + kfree(job->request_payload.sg_list); + kfree(job->reply_payload.sg_list); + kfree(job); +} + +/** + * bsg_job_done - completion routine for bsg requests + * @job: bsg_job that is complete + * @result: job reply result + * @reply_payload_rcv_len: length of payload recvd + * + * The LLD should call this when the bsg job has completed. + */ +void bsg_job_done(struct bsg_job *job, int result, + unsigned int reply_payload_rcv_len) +{ + struct request *req = job->req; + struct request *rsp = req->next_rq; + int err; + + err = job->req->errors = result; + if (err < 0) + /* we're only returning the result field in the reply */ + job->req->sense_len = sizeof(u32); + else + job->req->sense_len = job->reply_len; + /* we assume all request payload was transferred, residual == 0 */ + req->resid_len = 0; + + if (rsp) { + WARN_ON(reply_payload_rcv_len > rsp->resid_len); + + /* set reply (bidi) residual */ + rsp->resid_len -= min(reply_payload_rcv_len, rsp->resid_len); + } + blk_complete_request(req); +} +EXPORT_SYMBOL_GPL(bsg_job_done); + +/** + * bsg_softirq_done - softirq done routine for destroying the bsg requests + * @rq: BSG request that holds the job to be destroyed + */ +static void bsg_softirq_done(struct request *rq) +{ + struct bsg_job *job = rq->special; + + blk_end_request_all(rq, rq->errors); + bsg_destroy_job(job); +} + +static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) +{ + size_t sz = (sizeof(struct scatterlist) * req->nr_phys_segments); + + BUG_ON(!req->nr_phys_segments); + + buf->sg_list = kzalloc(sz, GFP_KERNEL); + if (!buf->sg_list) + return -ENOMEM; + sg_init_table(buf->sg_list, req->nr_phys_segments); + buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list); + buf->payload_len = blk_rq_bytes(req); + return 0; +} + +/** + * bsg_create_job - create the bsg_job structure for the bsg request + * @dev: device that is being sent the bsg request + * @req: BSG request that needs a job structure + * @lld_bsg_job_size: size of lld space + */ +static int bsg_create_job(struct device *dev, struct request *req, + int lld_bsg_job_size) +{ + struct request *rsp = req->next_rq; + struct bsg_job *job; + int ret; + + BUG_ON(req->special); + + job = kzalloc(sizeof(struct bsg_job) + lld_bsg_job_size, GFP_KERNEL); + if (!job) + return -ENOMEM; + + req->special = job; + job->req = req; + if (lld_bsg_job_size) + job->dd_data = (void *)&job[1]; + job->request = req->cmd; + job->request_len = req->cmd_len; + job->reply = req->sense; + job->reply_len = SCSI_SENSE_BUFFERSIZE; /* Size of sense buffer + * allocated */ + if (req->bio) { + ret = bsg_map_buffer(&job->request_payload, req); + if (ret) + goto failjob_rls_job; + } + if (rsp && rsp->bio) { + ret = bsg_map_buffer(&job->reply_payload, rsp); + if (ret) + goto failjob_rls_rqst_payload; + } + job->dev = dev; + /* take a reference for the request */ + get_device(job->dev); + return 0; + +failjob_rls_rqst_payload: + kfree(job->request_payload.sg_list); +failjob_rls_job: + kfree(job); + return -ENOMEM; +} + +/* + * bsg_goose_queue - restart queue in case it was stopped + * @q: request q to be restarted + */ +void bsg_goose_queue(struct request_queue *q) +{ + if (!q) + return; + + blk_run_queue_async(q); +} +EXPORT_SYMBOL_GPL(bsg_goose_queue); + +/** + * bsg_request_fn - generic handler for bsg requests + * @q: request queue to manage + * @bsg_job_fn: function to process job. + * @lld_bsg_job_size: size of lld space + * + * On error the create_bsg_job function should return a -Exyz error value + * that will be set to the req->errors. + * + * Drivers/subsys should pass this to the queue init function. + */ +void bsg_request_fn(struct request_queue *q, bsg_job_fn *bsg_job_fn, + int lld_bsg_job_size) +{ + struct device *dev = q->queuedata; + struct request *req; + struct bsg_job *job; + int ret; + + if (!get_device(dev)) + return; + + while (1) { + req = blk_fetch_request(q); + if (!req) + break; + spin_unlock_irq(q->queue_lock); + + ret = bsg_create_job(dev, req, lld_bsg_job_size); + if (ret) { + req->errors = ret; + blk_end_request_all(req, ret); + spin_lock_irq(q->queue_lock); + continue; + } + + job = req->special; + ret = bsg_job_fn(job); + spin_lock_irq(q->queue_lock); + if (ret) + break; + } + + spin_unlock_irq(q->queue_lock); + put_device(dev); + spin_lock_irq(q->queue_lock); +} +EXPORT_SYMBOL_GPL(bsg_request_fn); + +/** + * bsg_setup_queue - Create and add the bsg hooks so we can receive requests + * @dev: device to attach bsg device to + * @q: request queue setup by caller + * @name: device to give bsg device + * + * The caller should have setup the reuqest queue with bsg_request_fn + * as the request_fn. + */ +int bsg_setup_queue(struct device *dev, struct request_queue *q, char *name) +{ + int ret; + + q->queuedata = dev; + queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q); + blk_queue_softirq_done(q, bsg_softirq_done); + blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); + + ret = bsg_register_queue(q, dev, name, NULL); + if (ret) { + printk(KERN_ERR "%s: bsg interface failed to " + "initialize - register queue\n", dev->kobj.name); + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(bsg_setup_queue); + +/** + * bsg_remove_queue - Deletes the bsg dev from the q + * @q: the request_queue that is to be torn down. + * + * Notes: + * Before unregistering the queue empty any requests that are blocked + */ +void bsg_remove_queue(struct request_queue *q) +{ + struct request *req; /* block request */ + int counts; /* totals for request_list count and starved */ + + if (!q) + return; + + /* Stop taking in new requests */ + spin_lock_irq(q->queue_lock); + blk_stop_queue(q); + + /* drain all requests in the queue */ + while (1) { + /* need the lock to fetch a request + * this may fetch the same reqeust as the previous pass + */ + req = blk_fetch_request(q); + /* save requests in use and starved */ + counts = q->rq.count[0] + q->rq.count[1] + + q->rq.starved[0] + q->rq.starved[1]; + spin_unlock_irq(q->queue_lock); + /* any requests still outstanding? */ + if (counts == 0) + break; + + /* This may be the same req as the previous iteration, + * always send the blk_end_request_all after a prefetch. + * It is not okay to not end the request because the + * prefetch started the request. + */ + if (req) { + /* return -ENXIO to indicate that this queue is + * going away + */ + req->errors = -ENXIO; + blk_end_request_all(req, -ENXIO); + } + + msleep(200); /* allow bsg to possibly finish */ + spin_lock_irq(q->queue_lock); + } + bsg_unregister_queue(q); +} +EXPORT_SYMBOL_GPL(bsg_remove_queue); diff -uprN linux-2.6.32/block/cfq.h linux-2.6.32.ovz/block/cfq.h --- linux-2.6.32/block/cfq.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/block/cfq.h 2015-03-10 13:21:50.000000000 -0700 @@ -0,0 +1,115 @@ +#ifndef _CFQ_H +#define _CFQ_H +#include "blk-cgroup.h" + +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync) +{ + blkiocg_update_io_add_stats(blkg, curr_blkg, direction, sync); +} + +static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) +{ + blkiocg_update_dequeue_stats(blkg, dequeue); +} + +static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time) +{ + blkiocg_update_timeslice_used(blkg, time); +} + +static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) +{ + blkiocg_set_start_empty_time(blkg); +} + +static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) +{ + blkiocg_update_io_remove_stats(blkg, direction, sync); +} + +static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, + bool direction, bool sync) +{ + blkiocg_update_io_merged_stats(blkg, direction, sync); +} + +static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg) +{ + blkiocg_update_idle_time_stats(blkg); +} + +static inline void +cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) +{ + blkiocg_update_avg_queue_size_stats(blkg); +} + +static inline void +cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) +{ + blkiocg_update_set_idle_time_stats(blkg); +} + +static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) +{ + blkiocg_update_dispatch_stats(blkg, bytes, direction, sync); +} + +static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) +{ + blkiocg_update_completion_stats(blkg, start_time, io_start_time, + direction, sync); +} + +static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev) { + blkiocg_add_blkio_group(blkcg, blkg, key, dev, BLKIO_POLICY_PROP); +} + +static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + return blkiocg_del_blkio_group(blkg); +} + +#else /* CFQ_GROUP_IOSCHED */ +static inline void cfq_blkiocg_update_io_add_stats(struct blkio_group *blkg, + struct blkio_group *curr_blkg, bool direction, bool sync) {} + +static inline void cfq_blkiocg_update_dequeue_stats(struct blkio_group *blkg, + unsigned long dequeue) {} + +static inline void cfq_blkiocg_update_timeslice_used(struct blkio_group *blkg, + unsigned long time) {} +static inline void cfq_blkiocg_set_start_empty_time(struct blkio_group *blkg) {} +static inline void cfq_blkiocg_update_io_remove_stats(struct blkio_group *blkg, + bool direction, bool sync) {} +static inline void cfq_blkiocg_update_io_merged_stats(struct blkio_group *blkg, + bool direction, bool sync) {} +static inline void cfq_blkiocg_update_idle_time_stats(struct blkio_group *blkg) +{ +} +static inline void +cfq_blkiocg_update_avg_queue_size_stats(struct blkio_group *blkg) {} + +static inline void +cfq_blkiocg_update_set_idle_time_stats(struct blkio_group *blkg) {} + +static inline void cfq_blkiocg_update_dispatch_stats(struct blkio_group *blkg, + uint64_t bytes, bool direction, bool sync) {} +static inline void cfq_blkiocg_update_completion_stats(struct blkio_group *blkg, uint64_t start_time, uint64_t io_start_time, bool direction, bool sync) {} + +static inline void cfq_blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, + struct blkio_group *blkg, void *key, dev_t dev) {} +static inline int cfq_blkiocg_del_blkio_group(struct blkio_group *blkg) +{ + return 0; +} + +#endif /* CFQ_GROUP_IOSCHED */ +#endif diff -uprN linux-2.6.32/block/cfq-iosched.c linux-2.6.32.ovz/block/cfq-iosched.c --- linux-2.6.32/block/cfq-iosched.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/cfq-iosched.c 2015-06-23 04:16:16.000000000 -0700 @@ -9,15 +9,19 @@ #include #include #include +#include #include #include #include +#include +#include +#include "cfq.h" /* * tunables */ /* max queue in one round of service */ -static const int cfq_quantum = 4; +static const int cfq_quantum = 8; static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; /* maximum backwards seek, in KiB */ static const int cfq_back_max = 16 * 1024; @@ -27,6 +31,10 @@ static const int cfq_slice_sync = HZ / 1 static int cfq_slice_async = HZ / 25; static const int cfq_slice_async_rq = 2; static int cfq_slice_idle = HZ / 125; +static int cfq_group_idle = HZ / 125; +static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ +static const int cfq_hist_divisor = 4; +static int cfq_fast_slow_expiration_rate = 10; /* 10 seconds */ /* * offset from end of service tree @@ -40,10 +48,23 @@ static int cfq_slice_idle = HZ / 125; #define CFQ_SLICE_SCALE (5) #define CFQ_HW_QUEUE_MIN (5) +#define CFQ_SERVICE_SHIFT 12 + +#define CFQQ_SEEK_THR (sector_t)(8 * 100) +#define CFQQ_CLOSE_THR (sector_t)(8 * 1024) +#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) +#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) + +#define CFQD_IDLE_AUTODETECT(cfqd) \ + ((cfqd)->hw_tag == 1 && !(cfqd)->cfq_enable_idle_for_deep) +#define CFQD_DISK_LOOKS_FAST(cfqd) \ + (CFQD_IDLE_AUTODETECT(cfqd) && cfqd->cfq_disk_looks_fast > cfqd->cfq_disk_looks_slow) +#define CFQQ_DEEP_THR 4 #define RQ_CIC(rq) \ - ((struct cfq_io_context *) (rq)->elevator_private) -#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) + ((struct cfq_io_context *) (rq)->elevator_private[0]) +#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private[1]) +#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private[2]) static struct kmem_cache *cfq_pool; static struct kmem_cache *cfq_ioc_pool; @@ -57,6 +78,7 @@ static DEFINE_SPINLOCK(ioc_gone_lock); #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) #define sample_valid(samples) ((samples) > 80) +#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) /* * Most of our rbtree usage is for sorting with min extraction, so @@ -67,15 +89,20 @@ static DEFINE_SPINLOCK(ioc_gone_lock); struct cfq_rb_root { struct rb_root rb; struct rb_node *left; + unsigned count; + unsigned total_weight; + u64 min_vdisktime; + struct rb_node *active; }; -#define CFQ_RB_ROOT (struct cfq_rb_root) { RB_ROOT, NULL, } +#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ + .count = 0, .min_vdisktime = 0, } /* * Per process-grouping structure */ struct cfq_queue { /* reference count */ - atomic_t ref; + int ref; /* various state flags, see below */ unsigned int flags; /* parent cfq_data */ @@ -99,9 +126,14 @@ struct cfq_queue { /* fifo list of requests in sort_list */ struct list_head fifo; + /* time when queue got scheduled in to dispatch first request. */ + unsigned long dispatch_start; + unsigned int allocated_slice; + unsigned int slice_dispatch; + /* time when first request from queue completed and slice started. */ + unsigned long slice_start; unsigned long slice_end; long slice_resid; - unsigned int slice_dispatch; /* pending metadata requests */ int meta_pending; @@ -113,6 +145,75 @@ struct cfq_queue { unsigned short ioprio_class, org_ioprio_class; pid_t pid; + + u32 seek_history; + sector_t last_request_pos; + + struct cfq_rb_root *service_tree; + struct cfq_queue *new_cfqq; + struct cfq_group *cfqg; + struct cfq_group *orig_cfqg; + /* Number of sectors dispatched from queue in single dispatch round */ + unsigned long nr_sectors; + + /* When fisrst dispatch in dispatch round happened */ + unsigned long first_dispatch; + /* Number of dispatch happened since first dispatch + 1 */ + int n_dispatched; +}; + +/* + * First index in the service_trees. + * IDLE is handled separately, so it has negative index + */ +enum wl_prio_t { + BE_WORKLOAD = 0, + RT_WORKLOAD = 1, + IDLE_WORKLOAD = 2, +}; + +/* + * Second index in the service_trees. + */ +enum wl_type_t { + ASYNC_WORKLOAD = 0, + SYNC_NOIDLE_WORKLOAD = 1, + SYNC_WORKLOAD = 2 +}; + +/* This is per cgroup per device grouping structure */ +struct cfq_group { + /* group service_tree member */ + struct rb_node rb_node; + + /* group service_tree key */ + u64 vdisktime; + unsigned int weight; + unsigned int new_weight; + bool needs_update; + + /* number of cfqq currently on this group */ + int nr_cfqq; + + /* Per group busy queus average. Useful for workload slice calc. */ + unsigned int busy_queues_avg[2]; + /* + * rr lists of queues with requests, onle rr for each priority class. + * Counts are embedded in the cfq_rb_root + */ + struct cfq_rb_root service_trees[2][3]; + struct cfq_rb_root service_tree_idle; + + unsigned long saved_workload_slice; + enum wl_type_t saved_workload; + enum wl_prio_t saved_serving_prio; + struct blkio_group blkg; +#ifdef CONFIG_CFQ_GROUP_IOSCHED + struct hlist_node cfqd_node; + int ref; +#endif + /* number of requests that are on the dispatch list or inside driver */ + int dispatched; }; /* @@ -120,11 +221,17 @@ struct cfq_queue { */ struct cfq_data { struct request_queue *queue; + /* Root service tree for cfq_groups */ + struct cfq_rb_root grp_service_tree; + struct cfq_group root_group; /* - * rr list of queues with requests and the count of them + * The priority currently being served */ - struct cfq_rb_root service_tree; + enum wl_prio_t serving_prio; + enum wl_type_t serving_type; + unsigned long workload_expires; + struct cfq_group *serving_group; /* * Each priority tree is sorted by next_request position. These @@ -135,16 +242,22 @@ struct cfq_data { unsigned int busy_queues; - int rq_in_driver[2]; - int sync_flight; + int rq_in_driver; + int rq_in_flight[2]; /* * queue-depth detection */ int rq_queued; int hw_tag; - int hw_tag_samples; - int rq_in_driver_peak; + /* + * hw_tag can be + * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) + * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) + * 0 => no NCQ + */ + int hw_tag_est_depth; + unsigned int hw_tag_samples; /* * idle window management @@ -173,7 +286,9 @@ struct cfq_data { unsigned int cfq_slice[2]; unsigned int cfq_slice_async_rq; unsigned int cfq_slice_idle; + unsigned int cfq_group_idle; unsigned int cfq_latency; + unsigned int cfq_group_isolation; struct list_head cic_list; @@ -182,9 +297,115 @@ struct cfq_data { */ struct cfq_queue oom_cfqq; - unsigned long last_end_sync_rq; + unsigned long last_delayed_sync; + + /* List of cfq groups being managed on this device*/ + struct hlist_head cfqg_list; + + /* Number of groups which are on blkcg->blkg_list */ + unsigned int nr_blkcg_linked_grps; + + /* + * Revert to former behaviour ensuring fairness between + * seeky and seeky&deep tasks sacrificing overall + * performance. Also, disables fast/slow prediction. + */ + int cfq_enable_idle_for_deep; + + /* + * # times disk claimed as fast and slow correspondingly + */ + int cfq_disk_looks_fast; + int cfq_disk_looks_slow; + + /* + * when fast/slow fields were updated last time + */ + unsigned long cfq_disk_last_updated; + + /* + * If no events for fast/slow prediction happen in this + * time-frame (measured in seconds), fast/slow counters + * are divided by two. + * + * Zero or negative value turns expiration off. + */ + int cfq_fast_slow_expiration_rate; + + /* + * Sum of vectors: + * service_trees[0][0].count, ..., cfqg->service_tree_idle.count> + * for all cfqg-s + */ + unsigned st_counts[sizeof(((struct cfq_group *)NULL)->service_trees) / + sizeof(((struct cfq_group *)NULL)->service_trees[0][0]) + 1]; + + /* average */ + unsigned long cfq_avg_queued[3]; + unsigned long cfq_avg_indriver[3]; + + /* when (in jiffies) to update averages next time */ + unsigned long cfq_calc_load_update; + + /* last values seen */ + int cfq_queued_last; + int cfq_indriver_last; }; +#define EXP_ARR_SIZ 12 +/* + * EXP_ARR[i][j] == ((EXP_I / FIXED_1) ^ (2^j)) * FIXED_1 + * where I == 1 for i=0, I == 5 for i=1, I == 15 for i=2 + */ +const u16 EXP_ARR[3][EXP_ARR_SIZ] = { + { EXP_1, 1733, 1466, 1050, 539, 142, 10 }, + { EXP_5, 1981, 1915, 1791, 1567, 1199, 701, 240, 28 }, + { EXP_15, 2026, 2004, 1962, 1879, 1723, 1451, 1028, 516, 130, 8 } +}; + +/* + * update averages every 5sec/60 + */ +#define CFQD_LOAD_FREQ max(LOAD_FREQ / 60, 1) + +/* + * Assumption: + * cfqq->service_tree always points to cfqq->cfqg->service_tree[i][j] + * for some 'i' and 'j'. Exception are service_tree_idle and + * required according isolation logic in cfq_service_tree_add + */ +static inline int cfq_get_idx_by_st(struct cfq_queue *cfqq, struct cfq_group *cfqg) +{ + int max_idx = sizeof(((struct cfq_data *)NULL)->st_counts) / + sizeof(((struct cfq_data *)NULL)->st_counts[0]) - 1; + int idx; + + if (unlikely(cfqq->service_tree == &cfqg->service_tree_idle)) + return max_idx; + + idx = cfqq->service_tree - &cfqg->service_trees[0][0]; + BUG_ON(idx < 0); + BUG_ON(idx >= max_idx); + return idx; +} + +static void cfq_update_stats(struct cfq_data *cfqd, int indrv_delta); + +static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); + +static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, + enum wl_prio_t prio, + enum wl_type_t type) +{ + if (!cfqg) + return NULL; + + if (prio == IDLE_WORKLOAD) + return &cfqg->service_tree_idle; + + return &cfqg->service_trees[prio][type]; +} + enum cfqq_state_flags { CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ @@ -195,8 +416,10 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ CFQ_CFQQ_FLAG_sync, /* synchronous queue */ - CFQ_CFQQ_FLAG_coop, /* has done a coop jump of the queue */ - CFQ_CFQQ_FLAG_coop_preempt, /* coop preempt */ + CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ + CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */ + CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ + CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ }; #define CFQ_CFQQ_FNS(name) \ @@ -223,25 +446,99 @@ CFQ_CFQQ_FNS(prio_changed); CFQ_CFQQ_FNS(slice_new); CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); -CFQ_CFQQ_FNS(coop_preempt); +CFQ_CFQQ_FNS(split_coop); +CFQ_CFQQ_FNS(deep); +CFQ_CFQQ_FNS(wait_busy); #undef CFQ_CFQQ_FNS +#ifdef CONFIG_CFQ_GROUP_IOSCHED +#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ + cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ + blkg_path(&(cfqq)->cfqg->blkg), ##args); + +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ + blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ + blkg_path(&(cfqg)->blkg), ##args); \ + +#else #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) +#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); +#endif #define cfq_log(cfqd, fmt, args...) \ blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) +/* Traverses through cfq group service trees */ +#define for_each_cfqg_st(cfqg, i, j, st) \ + for (i = 0; i <= IDLE_WORKLOAD; i++) \ + for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\ + : &cfqg->service_tree_idle; \ + (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ + (i == IDLE_WORKLOAD && j == 0); \ + j++, st = i < IDLE_WORKLOAD ? \ + &cfqg->service_trees[i][j]: NULL) \ + + +static inline bool iops_mode(struct cfq_data *cfqd) +{ + /* + * If we are not idling on queues and it is a NCQ drive, parallel + * execution of requests is on and measuring time is not possible + * in most of the cases until and unless we drive shallower queue + * depths and that becomes a performance bottleneck. In such cases + * switch to start providing fairness in terms of number of IOs. + */ + if (!cfqd->cfq_slice_idle && cfqd->hw_tag) + return true; + else + return false; +} + +static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) +{ + if (cfq_class_idle(cfqq)) + return IDLE_WORKLOAD; + if (cfq_class_rt(cfqq)) + return RT_WORKLOAD; + return BE_WORKLOAD; +} + + +static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) +{ + if (!cfq_cfqq_sync(cfqq)) + return ASYNC_WORKLOAD; + if (!cfq_cfqq_idle_window(cfqq)) + return SYNC_NOIDLE_WORKLOAD; + return SYNC_WORKLOAD; +} + +static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, + struct cfq_data *cfqd, + struct cfq_group *cfqg) +{ + if (wl == IDLE_WORKLOAD) + return cfqg->service_tree_idle.count; + + return cfqg->service_trees[wl][ASYNC_WORKLOAD].count + + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count + + cfqg->service_trees[wl][SYNC_WORKLOAD].count; +} + +static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, + struct cfq_group *cfqg) +{ + return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count + + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; +} + static void cfq_dispatch_insert(struct request_queue *, struct request *); static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, struct io_context *, gfp_t); static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, struct io_context *); -static inline int rq_in_driver(struct cfq_data *cfqd) -{ - return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1]; -} - static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, bool is_sync) { @@ -279,7 +576,7 @@ static int cfq_queue_empty(struct reques { struct cfq_data *cfqd = q->elevator->elevator_data; - return !cfqd->busy_queues; + return !cfqd->rq_queued; } /* @@ -303,10 +600,130 @@ cfq_prio_to_slice(struct cfq_data *cfqd, return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); } +static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) +{ + u64 d = delta << CFQ_SERVICE_SHIFT; + + d = d * BLKIO_WEIGHT_DEFAULT; + do_div(d, cfqg->weight); + return d; +} + +static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) +{ + s64 delta = (s64)(vdisktime - min_vdisktime); + if (delta > 0) + min_vdisktime = vdisktime; + + return min_vdisktime; +} + +static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) +{ + s64 delta = (s64)(vdisktime - min_vdisktime); + if (delta < 0) + min_vdisktime = vdisktime; + + return min_vdisktime; +} + +static void update_min_vdisktime(struct cfq_rb_root *st) +{ + u64 vdisktime = st->min_vdisktime; + struct cfq_group *cfqg; + + if (st->active) { + cfqg = rb_entry_cfqg(st->active); + vdisktime = cfqg->vdisktime; + } + + if (st->left) { + cfqg = rb_entry_cfqg(st->left); + vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); + } + + st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); +} + +/* + * get averaged number of queues of RT/BE priority. + * average is updated, with a formula that gives more weight to higher numbers, + * to quickly follows sudden increases and decrease slowly + */ + +static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, + struct cfq_group *cfqg, bool rt) +{ + unsigned min_q, max_q; + unsigned mult = cfq_hist_divisor - 1; + unsigned round = cfq_hist_divisor / 2; + unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); + + min_q = min(cfqg->busy_queues_avg[rt], busy); + max_q = max(cfqg->busy_queues_avg[rt], busy); + cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / + cfq_hist_divisor; + return cfqg->busy_queues_avg[rt]; +} + +static inline u64 +cfq_group_vslice(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + u64 vslice; + + /* FIXME no group slices in iops mode? */ + if (iops_mode(cfqd)) + return 0; + + /* + * Equal to cfq_scale_slice(cfq_group_slice(cfqd, cfqg), cfqg). + * Add group weight beacuse it currently not in service tree. + */ + vslice = (u64)cfq_target_latency << CFQ_SERVICE_SHIFT; + vslice *= BLKIO_WEIGHT_DEFAULT; + do_div(vslice, st->total_weight + cfqg->weight); + return vslice; +} + +static inline unsigned +cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + + return cfq_target_latency * cfqg->weight / st->total_weight; +} + static inline void cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - cfqq->slice_end = cfq_prio_to_slice(cfqd, cfqq) + jiffies; + unsigned slice = cfq_prio_to_slice(cfqd, cfqq); + if (cfqd->cfq_latency) { + /* + * interested queues (we consider only the ones with the same + * priority class in the cfq group) + */ + unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, + cfq_class_rt(cfqq)); + unsigned sync_slice = cfqd->cfq_slice[1]; + unsigned expect_latency = sync_slice * iq; + unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg); + + if (expect_latency > group_slice) { + unsigned base_low_slice = 2 * cfqd->cfq_slice_idle; + /* scale low_slice according to IO priority + * and sync vs async */ + unsigned low_slice = + min(slice, base_low_slice * slice / sync_slice); + /* the adapted slice value is scaled to fit all iqs + * into the target latency */ + slice = max(slice * group_slice / expect_latency, + low_slice); + } + } + cfqq->slice_start = jiffies; + cfqq->slice_end = jiffies + slice; + cfqq->allocated_slice = slice; cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); } @@ -331,9 +748,9 @@ static inline bool cfq_slice_used(struct * behind the head is penalized and only allowed to a certain extent. */ static struct request * -cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) +cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) { - sector_t last, s1, s2, d1 = 0, d2 = 0; + sector_t s1, s2, d1 = 0, d2 = 0; unsigned long back_max; #define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ @@ -348,16 +765,15 @@ cfq_choose_req(struct cfq_data *cfqd, st return rq1; else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) return rq2; - if (rq_is_meta(rq1) && !rq_is_meta(rq2)) + if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) return rq1; - else if (rq_is_meta(rq2) && !rq_is_meta(rq1)) + else if ((rq2->cmd_flags & REQ_META) && + !(rq1->cmd_flags & REQ_META)) return rq2; s1 = blk_rq_pos(rq1); s2 = blk_rq_pos(rq2); - last = cfqd->last_position; - /* * by definition, 1KiB is 2 sectors */ @@ -425,67 +841,542 @@ cfq_choose_req(struct cfq_data *cfqd, st */ static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) { + /* Service tree is empty */ + if (!root->count) + return NULL; + if (!root->left) root->left = rb_first(&root->rb); if (root->left) return rb_entry(root->left, struct cfq_queue, rb_node); - return NULL; + return NULL; +} + +static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) +{ + if (!root->left) + root->left = rb_first(&root->rb); + + if (root->left) + return rb_entry_cfqg(root->left); + + return NULL; +} + +static void rb_erase_init(struct rb_node *n, struct rb_root *root) +{ + rb_erase(n, root); + RB_CLEAR_NODE(n); +} + +static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) +{ + if (root->left == n) + root->left = NULL; + rb_erase_init(n, &root->rb); + --root->count; +} + +/* + * would be nice to take fifo expire time into account as well + */ +static struct request * +cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct request *last) +{ + struct rb_node *rbnext = rb_next(&last->rb_node); + struct rb_node *rbprev = rb_prev(&last->rb_node); + struct request *next = NULL, *prev = NULL; + + BUG_ON(RB_EMPTY_NODE(&last->rb_node)); + + if (rbprev) + prev = rb_entry_rq(rbprev); + + if (rbnext) + next = rb_entry_rq(rbnext); + else { + rbnext = rb_first(&cfqq->sort_list); + if (rbnext && rbnext != &last->rb_node) + next = rb_entry_rq(rbnext); + } + + return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); +} + +static unsigned long cfq_slice_offset(struct cfq_data *cfqd, + struct cfq_queue *cfqq) +{ + /* + * just an approximation, should be ok. + */ + return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - + cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); +} + +static inline s64 +cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + return cfqg->vdisktime - st->min_vdisktime; +} + +static void +__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + struct rb_node **node = &st->rb.rb_node; + struct rb_node *parent = NULL; + struct cfq_group *__cfqg; + s64 key = cfqg_key(st, cfqg); + int left = 1; + + while (*node != NULL) { + parent = *node; + __cfqg = rb_entry_cfqg(parent); + + if (key < cfqg_key(st, __cfqg)) + node = &parent->rb_left; + else { + node = &parent->rb_right; + left = 0; + } + } + + if (left) + st->left = &cfqg->rb_node; + + rb_link_node(&cfqg->rb_node, parent, node); + rb_insert_color(&cfqg->rb_node, &st->rb); +} + +static void +cfq_update_group_weight(struct cfq_group *cfqg) +{ + BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); + if (cfqg->needs_update) { + cfqg->weight = cfqg->new_weight; + cfqg->needs_update = false; + } +} + +static void +cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); + + cfq_update_group_weight(cfqg); + __cfq_group_service_tree_add(st, cfqg); + st->total_weight += cfqg->weight; +} + +static void +cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + struct cfq_group *__cfqg; + struct rb_node *n; + + cfqg->nr_cfqq++; + if (!RB_EMPTY_NODE(&cfqg->rb_node)) + return; + + /* + * Bump vdisktime to be greater or equal min_vdisktime. + */ + cfqg->vdisktime = max_vdisktime(cfqg->vdisktime, st->min_vdisktime); + + /* + * Put the group at the end, but save one slice from unused time. + */ + n = rb_last(&st->rb); + if (n) { + __cfqg = rb_entry_cfqg(n); + cfqg->vdisktime = max_vdisktime(cfqg->vdisktime, + __cfqg->vdisktime - + cfq_group_vslice(cfqd, cfqg)); + } + cfq_group_service_tree_add(st, cfqg); +} + +static void +cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) +{ + st->total_weight -= cfqg->weight; + if (!RB_EMPTY_NODE(&cfqg->rb_node)) + cfq_rb_erase(&cfqg->rb_node, st); +} + +static void +cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + + if (st->active == &cfqg->rb_node) + st->active = NULL; + + BUG_ON(cfqg->nr_cfqq < 1); + cfqg->nr_cfqq--; + + /* If there are other cfq queues under this group, don't delete it */ + if (cfqg->nr_cfqq) + return; + + cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); + cfq_group_service_tree_del(st, cfqg); + cfqg->saved_workload_slice = 0; + cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1); +} + +static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) +{ + unsigned int slice_used; + + /* + * Queue got expired before even a single request completed or + * got expired immediately after first request completion. + */ + if (!cfqq->slice_start || cfqq->slice_start == jiffies) { + /* + * Also charge the seek time incurred to the group, otherwise + * if there are mutiple queues in the group, each can dispatch + * a single request on seeky media and cause lots of seek time + * and group will never know it. + */ + slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start), + 1); + } else { + slice_used = jiffies - cfqq->slice_start; + if (slice_used > cfqq->allocated_slice) + slice_used = cfqq->allocated_slice; + } + + return slice_used; +} + +static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, + struct cfq_queue *cfqq) +{ + struct cfq_rb_root *st = &cfqd->grp_service_tree; + unsigned int used_sl, charge; + int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) + - cfqg->service_tree_idle.count; + + BUG_ON(nr_sync < 0); + used_sl = charge = cfq_cfqq_slice_usage(cfqq); + + if (iops_mode(cfqd)) + charge = cfqq->slice_dispatch; + else if (!cfq_cfqq_sync(cfqq) && !nr_sync) + charge = cfqq->allocated_slice; + + /* Can't update vdisktime while group is on service tree */ + cfq_group_service_tree_del(st, cfqg); + cfqg->vdisktime += cfq_scale_slice(charge, cfqg); + /* If a new weight was requested, update now, off tree */ + cfq_group_service_tree_add(st, cfqg); + + /* This group is being expired. Save the context */ + if (time_after(cfqd->workload_expires, jiffies)) { + cfqg->saved_workload_slice = cfqd->workload_expires + - jiffies; + cfqg->saved_workload = cfqd->serving_type; + cfqg->saved_serving_prio = cfqd->serving_prio; + } else + cfqg->saved_workload_slice = 0; + + cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, + st->min_vdisktime); + cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u" + " sect=%u", used_sl, cfqq->slice_dispatch, charge, + iops_mode(cfqd), cfqq->nr_sectors); + cfq_blkiocg_update_timeslice_used(&cfqg->blkg, used_sl); + cfq_blkiocg_set_start_empty_time(&cfqg->blkg); +} + +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) +{ + if (blkg) + return container_of(blkg, struct cfq_group, blkg); + return NULL; +} + +void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg, + unsigned int weight) +{ + struct cfq_group *cfqg = cfqg_of_blkg(blkg); + cfqg->new_weight = weight; + cfqg->needs_update = true; +} + +static void cfq_init_add_cfqg_lists(struct cfq_data *cfqd, + struct cfq_group *cfqg, struct blkio_cgroup *blkcg) +{ + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; + unsigned int major, minor; + + /* + * Add group onto cgroup list. It might happen that bdi->dev is + * not initialized yet. Initialize this new group without major + * and minor info and this info will be filled in once a new thread + * comes for IO. + */ + if (bdi->dev) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, + (void *)cfqd, MKDEV(major, minor)); + } else + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, + (void *)cfqd, 0); + + cfqd->nr_blkcg_linked_grps++; + cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev); + + /* Add group on cfqd list */ + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); + if (!cfqg->blkg.dev_name && cfqd->queue->kobj.parent) + cfqg->blkg.dev_name = kstrdup(kobject_name( + cfqd->queue->kobj.parent), GFP_ATOMIC); +} + +/* + * Should be called from sleepable context. No request queue lock as per + * cpu stats are allocated dynamically and alloc_percpu needs to be called + * from sleepable context. + */ +static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd) +{ + struct cfq_group *cfqg = NULL; + int i, j, ret; + struct cfq_rb_root *st; + + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); + if (!cfqg) + return NULL; + + for_each_cfqg_st(cfqg, i, j, st) + *st = CFQ_RB_ROOT; + RB_CLEAR_NODE(&cfqg->rb_node); + + /* + * Take the initial reference that will be released on destroy + * This can be thought of a joint reference by cgroup and + * elevator which will be dropped by either elevator exit + * or cgroup deletion path depending on who is exiting first. + */ + cfqg->ref = 1; + + ret = blkio_alloc_blkg_stats(&cfqg->blkg); + if (ret) { + kfree(cfqg); + return NULL; + } + + return cfqg; +} + +static void cfq_free_cfqg(struct cfq_group *cfqg) +{ + blkio_free_blkg_stats(&cfqg->blkg); + kfree(cfqg->blkg.dev_name); + kfree(cfqg); +} + +static struct cfq_group * +cfq_find_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg) +{ + struct cfq_group *cfqg = NULL; + void *key = cfqd; + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; + unsigned int major, minor; + + /* + * This is the common case when there are no blkio cgroups. + * Avoid lookup in this case + */ + if (blkcg == &blkio_root_cgroup) + cfqg = &cfqd->root_group; + else + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); + + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); + cfqg->blkg.dev = MKDEV(major, minor); + } + + return cfqg; +} + +/* + * Search for the cfq group current task belongs to. request_queue lock must + * be held. + */ +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) +{ + struct blkio_cgroup *blkcg; + struct cfq_group *cfqg = NULL, *__cfqg = NULL; + struct request_queue *q = cfqd->queue; + + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + cfqg = cfq_find_cfqg(cfqd, blkcg); + if (cfqg) { + rcu_read_unlock(); + return cfqg; + } + + /* + * Need to allocate a group. Allocation of group also needs allocation + * of per cpu stats which in-turn takes a mutex() and can block. Hence + * we need to drop rcu lock and queue_lock before we call alloc. + * + * Not taking any queue reference here and assuming that queue is + * around by the time we return. CFQ queue allocation code does + * the same. It might be racy though. + */ + + rcu_read_unlock(); + spin_unlock_irq(q->queue_lock); + + cfqg = cfq_alloc_cfqg(cfqd); + + spin_lock_irq(q->queue_lock); + + rcu_read_lock(); + blkcg = task_blkio_cgroup(current); + + /* + * If some other thread already allocated the group while we were + * not holding queue lock, free up the group + */ + __cfqg = cfq_find_cfqg(cfqd, blkcg); + + if (__cfqg) { + cfq_free_cfqg(cfqg); + rcu_read_unlock(); + return __cfqg; + } + + if (!cfqg) + cfqg = &cfqd->root_group; + + cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg); + rcu_read_unlock(); + return cfqg; +} + +static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) +{ + cfqg->ref++; + return cfqg; +} + +static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) +{ + /* Currently, all async queues are mapped to root group */ + if (!cfq_cfqq_sync(cfqq)) + cfqg = &cfqq->cfqd->root_group; + + cfqq->cfqg = cfqg; + /* cfqq reference on cfqg */ + cfqq->cfqg->ref++; +} + +static void cfq_put_cfqg(struct cfq_group *cfqg) +{ + struct cfq_rb_root *st; + int i, j; + + BUG_ON(cfqg->ref <= 0); + cfqg->ref--; + if (cfqg->ref) + return; + for_each_cfqg_st(cfqg, i, j, st) + BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); + cfq_free_cfqg(cfqg); } -static void rb_erase_init(struct rb_node *n, struct rb_root *root) +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) { - rb_erase(n, root); - RB_CLEAR_NODE(n); + /* Something wrong if we are trying to remove same group twice */ + BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); + + hlist_del_init(&cfqg->cfqd_node); + + BUG_ON(cfqd->nr_blkcg_linked_grps <= 0); + cfqd->nr_blkcg_linked_grps--; + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + cfq_put_cfqg(cfqg); } -static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) +static void cfq_release_cfq_groups(struct cfq_data *cfqd) { - if (root->left == n) - root->left = NULL; - rb_erase_init(n, &root->rb); + struct hlist_node *pos, *n; + struct cfq_group *cfqg; + + hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { + /* + * If cgroup removal path got to blk_group first and removed + * it from cgroup list, then it will take care of destroying + * cfqg also. + */ + if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg)) + cfq_destroy_cfqg(cfqd, cfqg); + } } /* - * would be nice to take fifo expire time into account as well + * Blk cgroup controller notification saying that blkio_group object is being + * delinked as associated cgroup object is going away. That also means that + * no new IO will come in this group. So get rid of this group as soon as + * any pending IO in the group is finished. + * + * This function is called under rcu_read_lock(). key is the rcu protected + * pointer. That means "key" is a valid cfq_data pointer as long as we are rcu + * read lock. + * + * "key" was fetched from blkio_group under blkio_cgroup->lock. That means + * it should not be NULL as even if elevator was exiting, cgroup deltion + * path got to it first. */ -static struct request * -cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct request *last) +void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) { - struct rb_node *rbnext = rb_next(&last->rb_node); - struct rb_node *rbprev = rb_prev(&last->rb_node); - struct request *next = NULL, *prev = NULL; - - BUG_ON(RB_EMPTY_NODE(&last->rb_node)); - - if (rbprev) - prev = rb_entry_rq(rbprev); + unsigned long flags; + struct cfq_data *cfqd = key; - if (rbnext) - next = rb_entry_rq(rbnext); - else { - rbnext = rb_first(&cfqq->sort_list); - if (rbnext && rbnext != &last->rb_node) - next = rb_entry_rq(rbnext); - } + spin_lock_irqsave(cfqd->queue->queue_lock, flags); + cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); + spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); +} - return cfq_choose_req(cfqd, next, prev); +#else /* GROUP_IOSCHED */ +static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd) +{ + return &cfqd->root_group; } -static unsigned long cfq_slice_offset(struct cfq_data *cfqd, - struct cfq_queue *cfqq) +static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg) { - /* - * just an approximation, should be ok. - */ - return (cfqd->busy_queues - 1) * (cfq_prio_slice(cfqd, 1, 0) - - cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); + return NULL; +} + +static inline void +cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { + cfqq->cfqg = cfqg; } +static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} +static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} + +#endif /* GROUP_IOSCHED */ + /* - * The cfqd->service_tree holds all pending cfq_queue's that have + * The cfqd->service_trees holds all pending cfq_queue's that have * requests waiting to be processed. It is sorted in the order that * we will service the queues. */ @@ -495,11 +1386,43 @@ static void cfq_service_tree_add(struct struct rb_node **p, *parent; struct cfq_queue *__cfqq; unsigned long rb_key; + struct cfq_rb_root *service_tree; int left; + int new_cfqq = 1; + int group_changed = 0; + struct cfq_group *orig_group = cfqq->cfqg; + +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (!cfqd->cfq_group_isolation + && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD + && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { + /* Move this cfq to root group */ + cfq_log_cfqq(cfqd, cfqq, "moving to root group"); + if (!RB_EMPTY_NODE(&cfqq->rb_node)) + cfq_group_notify_queue_del(cfqd, cfqq->cfqg); + cfqq->orig_cfqg = cfqq->cfqg; + cfqq->cfqg = &cfqd->root_group; + cfqd->root_group.ref++; + group_changed = 1; + } else if (!cfqd->cfq_group_isolation + && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { + /* cfqq is sequential now needs to go to its original group */ + BUG_ON(cfqq->cfqg != &cfqd->root_group); + if (!RB_EMPTY_NODE(&cfqq->rb_node)) + cfq_group_notify_queue_del(cfqd, cfqq->cfqg); + cfq_put_cfqg(cfqq->cfqg); + cfqq->cfqg = cfqq->orig_cfqg; + cfqq->orig_cfqg = NULL; + group_changed = 1; + cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); + } +#endif + service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), + cfqq_type(cfqq)); if (cfq_class_idle(cfqq)) { rb_key = CFQ_IDLE_DELAY; - parent = rb_last(&cfqd->service_tree.rb); + parent = rb_last(&service_tree->rb); if (parent && parent != &cfqq->rb_node) { __cfqq = rb_entry(parent, struct cfq_queue, rb_node); rb_key += __cfqq->rb_key; @@ -517,23 +1440,28 @@ static void cfq_service_tree_add(struct cfqq->slice_resid = 0; } else { rb_key = -HZ; - __cfqq = cfq_rb_first(&cfqd->service_tree); + __cfqq = cfq_rb_first(service_tree); rb_key += __cfqq ? __cfqq->rb_key : jiffies; } if (!RB_EMPTY_NODE(&cfqq->rb_node)) { + new_cfqq = 0; /* * same position, nothing more to do */ - if (rb_key == cfqq->rb_key) + if (rb_key == cfqq->rb_key && + cfqq->service_tree == service_tree) return; - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); + cfqd->st_counts[cfq_get_idx_by_st(cfqq, orig_group)]--; + cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); + cfqq->service_tree = NULL; } left = 1; parent = NULL; - p = &cfqd->service_tree.rb.rb_node; + cfqq->service_tree = service_tree; + p = &service_tree->rb.rb_node; while (*p) { struct rb_node **n; @@ -541,35 +1469,29 @@ static void cfq_service_tree_add(struct __cfqq = rb_entry(parent, struct cfq_queue, rb_node); /* - * sort RT queues first, we always want to give - * preference to them. IDLE queues goes to the back. - * after that, sort on the next service time. + * sort by key, that represents service time. */ - if (cfq_class_rt(cfqq) > cfq_class_rt(__cfqq)) - n = &(*p)->rb_left; - else if (cfq_class_rt(cfqq) < cfq_class_rt(__cfqq)) - n = &(*p)->rb_right; - else if (cfq_class_idle(cfqq) < cfq_class_idle(__cfqq)) - n = &(*p)->rb_left; - else if (cfq_class_idle(cfqq) > cfq_class_idle(__cfqq)) - n = &(*p)->rb_right; - else if (time_before(rb_key, __cfqq->rb_key)) + if (time_before(rb_key, __cfqq->rb_key)) n = &(*p)->rb_left; - else + else { n = &(*p)->rb_right; - - if (n == &(*p)->rb_right) left = 0; + } p = n; } if (left) - cfqd->service_tree.left = &cfqq->rb_node; + service_tree->left = &cfqq->rb_node; cfqq->rb_key = rb_key; rb_link_node(&cfqq->rb_node, parent, p); - rb_insert_color(&cfqq->rb_node, &cfqd->service_tree.rb); + rb_insert_color(&cfqq->rb_node, &service_tree->rb); + service_tree->count++; + cfqd->st_counts[cfq_get_idx_by_st(cfqq, cfqq->cfqg)]++; + if ((add_front || !new_cfqq) && !group_changed) + return; + cfq_group_notify_queue_add(cfqd, cfqq->cfqg); } static struct cfq_queue * @@ -671,13 +1593,17 @@ static void cfq_del_cfqq_rr(struct cfq_d BUG_ON(!cfq_cfqq_on_rr(cfqq)); cfq_clear_cfqq_on_rr(cfqq); - if (!RB_EMPTY_NODE(&cfqq->rb_node)) - cfq_rb_erase(&cfqq->rb_node, &cfqd->service_tree); + if (!RB_EMPTY_NODE(&cfqq->rb_node)) { + cfqd->st_counts[cfq_get_idx_by_st(cfqq, cfqq->cfqg)]--; + cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); + cfqq->service_tree = NULL; + } if (cfqq->p_root) { rb_erase(&cfqq->p_node, cfqq->p_root); cfqq->p_root = NULL; } + cfq_group_notify_queue_del(cfqd, cfqq->cfqg); BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; } @@ -688,7 +1614,6 @@ static void cfq_del_cfqq_rr(struct cfq_d static void cfq_del_rq_rb(struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); - struct cfq_data *cfqd = cfqq->cfqd; const int sync = rq_is_sync(rq); BUG_ON(!cfqq->queued[sync]); @@ -696,24 +1621,28 @@ static void cfq_del_rq_rb(struct request elv_rb_del(&cfqq->sort_list, rq); - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) - cfq_del_cfqq_rr(cfqd, cfqq); + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { + /* + * Queue will be deleted from service tree when we actually + * expire it later. Right now just remove it from prio tree + * as it is empty. + */ + if (cfqq->p_root) { + rb_erase(&cfqq->p_node, cfqq->p_root); + cfqq->p_root = NULL; + } + } } static void cfq_add_rq_rb(struct request *rq) { struct cfq_queue *cfqq = RQ_CFQQ(rq); struct cfq_data *cfqd = cfqq->cfqd; - struct request *__alias, *prev; + struct request *prev; cfqq->queued[rq_is_sync(rq)]++; - /* - * looks a little odd, but the first insert might return an alias. - * if that happens, put the alias on the dispatch list - */ - while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL) - cfq_dispatch_insert(cfqd->queue, __alias); + elv_rb_add(&cfqq->sort_list, rq); if (!cfq_cfqq_on_rr(cfqq)) cfq_add_cfqq_rr(cfqd, cfqq); @@ -722,7 +1651,7 @@ static void cfq_add_rq_rb(struct request * check if this request is a better next-serve candidate */ prev = cfqq->next_rq; - cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq); + cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position); /* * adjust priority tree position, if ->next_rq changes @@ -737,7 +1666,12 @@ static void cfq_reposition_rq_rb(struct { elv_rb_del(&cfqq->sort_list, rq); cfqq->queued[rq_is_sync(rq)]--; + cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, + rq_data_dir(rq), rq_is_sync(rq)); cfq_add_rq_rb(rq); + cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, + &cfqq->cfqd->serving_group->blkg, rq_data_dir(rq), + rq_is_sync(rq)); } static struct request * @@ -765,9 +1699,10 @@ static void cfq_activate_request(struct { struct cfq_data *cfqd = q->elevator->elevator_data; - cfqd->rq_in_driver[rq_is_sync(rq)]++; + cfq_update_stats(cfqd, 1); + cfqd->rq_in_driver++; cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", - rq_in_driver(cfqd)); + cfqd->rq_in_driver); cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); } @@ -775,12 +1710,12 @@ static void cfq_activate_request(struct static void cfq_deactivate_request(struct request_queue *q, struct request *rq) { struct cfq_data *cfqd = q->elevator->elevator_data; - const int sync = rq_is_sync(rq); - WARN_ON(!cfqd->rq_in_driver[sync]); - cfqd->rq_in_driver[sync]--; + WARN_ON(!cfqd->rq_in_driver); + cfq_update_stats(cfqd, -1); + cfqd->rq_in_driver--; cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", - rq_in_driver(cfqd)); + cfqd->rq_in_driver); } static void cfq_remove_request(struct request *rq) @@ -794,7 +1729,9 @@ static void cfq_remove_request(struct re cfq_del_rq_rb(rq); cfqq->cfqd->rq_queued--; - if (rq_is_meta(rq)) { + cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, + rq_data_dir(rq), rq_is_sync(rq)); + if (rq->cmd_flags & REQ_META) { WARN_ON(!cfqq->meta_pending); cfqq->meta_pending--; } @@ -825,10 +1762,21 @@ static void cfq_merged_request(struct re } } +static void cfq_bio_merged(struct request_queue *q, struct request *req, + struct bio *bio) +{ + cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, + bio_data_dir(bio), cfq_bio_sync(bio)); + + if (get_exec_ub_top() != (RQ_CFQG(req))->blkg.blk_ub) + ub_writeback_io(0, bio_sectors(bio)); +} + static void cfq_merged_requests(struct request_queue *q, struct request *rq, struct request *next) { + struct cfq_queue *cfqq = RQ_CFQQ(rq); /* * reposition in fifo if next is older than rq */ @@ -838,7 +1786,11 @@ cfq_merged_requests(struct request_queue rq_set_fifo_time(rq, rq_fifo_time(next)); } + if (cfqq->next_rq == next) + cfqq->next_rq = rq; cfq_remove_request(next); + cfq_blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, + rq_data_dir(next), rq_is_sync(next)); } static int cfq_allow_merge(struct request_queue *q, struct request *rq, @@ -866,13 +1818,31 @@ static int cfq_allow_merge(struct reques return cfqq == RQ_CFQQ(rq); } +static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + del_timer(&cfqd->idle_slice_timer); + cfq_blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg); +} + static void __cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { if (cfqq) { - cfq_log_cfqq(cfqd, cfqq, "set_active"); + cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", + cfqd->serving_prio, cfqd->serving_type); + cfq_blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg); + cfqq->slice_start = 0; + cfqq->dispatch_start = jiffies; + cfqq->allocated_slice = 0; cfqq->slice_end = 0; cfqq->slice_dispatch = 0; + cfqq->nr_sectors = 0; + + cfqq->first_dispatch = 0; + cfqq->n_dispatched = 0; + if (cfqq->queued[0] + cfqq->queued[1] >= CFQQ_DEEP_THR && + CFQD_IDLE_AUTODETECT(cfqd)) + cfq_mark_cfqq_deep(cfqq); cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_must_dispatch(cfqq); @@ -880,7 +1850,7 @@ static void __cfq_set_active_queue(struc cfq_clear_cfqq_fifo_expire(cfqq); cfq_mark_cfqq_slice_new(cfqq); - del_timer(&cfqd->idle_slice_timer); + cfq_del_timer(cfqd, cfqq); } cfqd->active_queue = cfqq; @@ -895,10 +1865,26 @@ __cfq_slice_expired(struct cfq_data *cfq { cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); + if (cfq_cfqq_deep(cfqq) && CFQD_IDLE_AUTODETECT(cfqd)) { + cfqq->first_dispatch = 0; + cfqq->n_dispatched = 0; + cfq_clear_cfqq_deep(cfqq); + } + if (cfq_cfqq_wait_request(cfqq)) - del_timer(&cfqd->idle_slice_timer); + cfq_del_timer(cfqd, cfqq); cfq_clear_cfqq_wait_request(cfqq); + cfq_clear_cfqq_wait_busy(cfqq); + + /* + * If this cfqq is shared between multiple processes, check to + * make sure that those processes are still issuing I/Os within + * the mean seek distance. If not, it may be time to break the + * queues apart again. + */ + if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq)) + cfq_mark_cfqq_split_coop(cfqq); /* * store what was left of this slice, if the queue idled/timed out @@ -908,11 +1894,19 @@ __cfq_slice_expired(struct cfq_data *cfq cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); } + cfq_group_served(cfqd, cfqq->cfqg, cfqq); + + if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) + cfq_del_cfqq_rr(cfqd, cfqq); + cfq_resort_rr_list(cfqd, cfqq); if (cfqq == cfqd->active_queue) cfqd->active_queue = NULL; + if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) + cfqd->grp_service_tree.active = NULL; + if (cfqd->active_cic) { put_io_context(cfqd->active_cic->ioc); cfqd->active_cic = NULL; @@ -933,10 +1927,39 @@ static inline void cfq_slice_expired(str */ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) { - if (RB_EMPTY_ROOT(&cfqd->service_tree.rb)) + struct cfq_rb_root *service_tree = + service_tree_for(cfqd->serving_group, cfqd->serving_prio, + cfqd->serving_type); + + if (!cfqd->rq_queued) + return NULL; + + /* There is nothing to dispatch */ + if (!service_tree) + return NULL; + if (RB_EMPTY_ROOT(&service_tree->rb)) + return NULL; + return cfq_rb_first(service_tree); +} + +static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) +{ + struct cfq_group *cfqg; + struct cfq_queue *cfqq; + int i, j; + struct cfq_rb_root *st; + + if (!cfqd->rq_queued) + return NULL; + + cfqg = cfq_get_next_cfqg(cfqd); + if (!cfqg) return NULL; - return cfq_rb_first(&cfqd->service_tree); + for_each_cfqg_st(cfqg, i, j, st) + if ((cfqq = cfq_rb_first(st)) != NULL) + return cfqq; + return NULL; } /* @@ -945,14 +1968,8 @@ static struct cfq_queue *cfq_get_next_qu static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) { - if (!cfqq) { + if (!cfqq) cfqq = cfq_get_next_queue(cfqd); - if (cfqq && !cfq_cfqq_coop_preempt(cfqq)) - cfq_clear_cfqq_coop(cfqq); - } - - if (cfqq) - cfq_clear_cfqq_coop_preempt(cfqq); __cfq_set_active_queue(cfqd, cfqq); return cfqq; @@ -967,18 +1984,10 @@ static inline sector_t cfq_dist_from_las return cfqd->last_position - blk_rq_pos(rq); } -#define CIC_SEEK_THR 8 * 1024 -#define CIC_SEEKY(cic) ((cic)->seek_mean > CIC_SEEK_THR) - -static inline int cfq_rq_close(struct cfq_data *cfqd, struct request *rq) +static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, + struct request *rq) { - struct cfq_io_context *cic = cfqd->active_cic; - sector_t sdist = cic->seek_mean; - - if (!sample_valid(cic->seek_samples)) - sdist = CIC_SEEK_THR; - - return cfq_dist_from_last(cfqd, rq) <= sdist; + return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR; } static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, @@ -1005,7 +2014,7 @@ static struct cfq_queue *cfqq_close(stru * will contain the closest sector. */ __cfqq = rb_entry(parent, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, __cfqq->next_rq)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) return __cfqq; if (blk_rq_pos(__cfqq->next_rq) < sector) @@ -1016,7 +2025,7 @@ static struct cfq_queue *cfqq_close(stru return NULL; __cfqq = rb_entry(node, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, __cfqq->next_rq)) + if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) return __cfqq; return NULL; @@ -1033,16 +2042,21 @@ static struct cfq_queue *cfqq_close(stru * assumption. */ static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, - struct cfq_queue *cur_cfqq, - bool probe) + struct cfq_queue *cur_cfqq) { struct cfq_queue *cfqq; + if (cfq_class_idle(cur_cfqq)) + return NULL; + if (!cfq_cfqq_sync(cur_cfqq)) + return NULL; + if (CFQQ_SEEKY(cur_cfqq)) + return NULL; + /* - * A valid cfq_io_context is necessary to compare requests against - * the seek_mean of the current cfqq. + * Don't search priority tree if it's the only queue in the group. */ - if (!cfqd->active_cic) + if (cur_cfqq->cfqg->nr_cfqq == 1) return NULL; /* @@ -1054,19 +2068,77 @@ static struct cfq_queue *cfq_close_coope if (!cfqq) return NULL; - if (cfq_cfqq_coop(cfqq)) + /* If new queue belongs to different cfq_group, don't choose it */ + if (cur_cfqq->cfqg != cfqq->cfqg) + return NULL; + + /* + * It only makes sense to merge sync queues. + */ + if (!cfq_cfqq_sync(cfqq)) + return NULL; + if (CFQQ_SEEKY(cfqq)) + return NULL; + + /* + * Do not merge queues of different priority classes + */ + if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq)) return NULL; - if (!probe) - cfq_mark_cfqq_coop(cfqq); return cfqq; } +/* + * Determine whether we should enforce idle window for this queue. + */ + +static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + enum wl_prio_t prio = cfqq_prio(cfqq); + struct cfq_rb_root *service_tree = cfqq->service_tree; + unsigned count; + + BUG_ON(!service_tree); + BUG_ON(!service_tree->count); + + /* We never do for idle class queues. */ + if (prio == IDLE_WORKLOAD) + return false; + + /* Don't idle if slice idling is disabled by the user */ + if (cfqd->cfq_slice_idle == 0) + return false; + + /* We do for queues that were marked with idle window flag. */ + if (cfq_cfqq_idle_window(cfqq) && + !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)) + return true; + + /* + * Otherwise, we do only if they are the last ones + * in their service tree. + * + * If disk is fast enough, we should be last in this type + * of service tree among all cfq-groups as well. + */ + if (CFQD_DISK_LOOKS_FAST(cfqd)) + count = cfqd->st_counts[cfq_get_idx_by_st(cfqq, cfqq->cfqg)]; + else + count = service_tree->count; + + if (count == 1 && cfq_cfqq_sync(cfqq)) + return 1; + cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", + service_tree->count); + return 0; +} + static void cfq_arm_slice_timer(struct cfq_data *cfqd) { struct cfq_queue *cfqq = cfqd->active_queue; struct cfq_io_context *cic; - unsigned long sl; + unsigned long sl, group_idle = 0; /* * SSD device without seek penalty, disable idling. But only do so @@ -1082,13 +2154,18 @@ static void cfq_arm_slice_timer(struct c /* * idle is disabled, either manually or by past process history */ - if (!cfqd->cfq_slice_idle || !cfq_cfqq_idle_window(cfqq)) - return; + if (!cfq_should_idle(cfqd, cfqq)) { + /* no queue idling. Check for group idling */ + if (cfqd->cfq_group_idle) + group_idle = cfqd->cfq_group_idle; + else + return; + } /* - * still requests with the driver, don't idle + * still active requests from this queue, don't idle */ - if (rq_in_driver(cfqd)) + if (cfqq->dispatched) return; /* @@ -1104,22 +2181,27 @@ static void cfq_arm_slice_timer(struct c * time slice. */ if (sample_valid(cic->ttime_samples) && - (cfqq->slice_end - jiffies < cic->ttime_mean)) + (cfqq->slice_end - jiffies < cic->ttime_mean)) { + cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d", + cic->ttime_mean); + return; + } + + /* There are other queues in the group, don't do group idle */ + if (group_idle && cfqq->cfqg->nr_cfqq > 1) return; cfq_mark_cfqq_wait_request(cfqq); - /* - * we don't want to idle for seeks, but we do want to allow - * fair distribution of slice time for a process doing back-to-back - * seeks. so allow a little bit of time for him to submit a new rq - */ - sl = cfqd->cfq_slice_idle; - if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) - sl = min(sl, msecs_to_jiffies(CFQ_MIN_TT)); + if (group_idle) + sl = cfqd->cfq_group_idle; + else + sl = cfqd->cfq_slice_idle; mod_timer(&cfqd->idle_slice_timer, jiffies + sl); - cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); + cfq_blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg); + cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl, + group_idle ? 1 : 0); } /* @@ -1130,48 +2212,246 @@ static void cfq_dispatch_insert(struct r struct cfq_data *cfqd = q->elevator->elevator_data; struct cfq_queue *cfqq = RQ_CFQQ(rq); - cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); + cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); + + if (cfq_cfqq_deep(cfqq) && CFQD_IDLE_AUTODETECT(cfqd)) { + cfqq->n_dispatched++; + if (!cfqq->first_dispatch) + cfqq->first_dispatch = jiffies; + } + + cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); + cfq_remove_request(rq); + cfqq->dispatched++; + (RQ_CFQG(rq))->dispatched++; + elv_dispatch_sort(q, rq); + + cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; + cfqq->nr_sectors += blk_rq_sectors(rq); + cfq_blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq), + rq_data_dir(rq), rq_is_sync(rq)); +} + +/* + * return expired entry, or NULL to just start from scratch in rbtree + */ +static struct request *cfq_check_fifo(struct cfq_queue *cfqq) +{ + struct request *rq = NULL; + + if (cfq_cfqq_fifo_expire(cfqq)) + return NULL; + + cfq_mark_cfqq_fifo_expire(cfqq); + + if (list_empty(&cfqq->fifo)) + return NULL; + + rq = rq_entry_fifo(cfqq->fifo.next); + if (time_before(jiffies, rq_fifo_time(rq))) + rq = NULL; + + cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); + return rq; +} + +static inline int +cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + const int base_rq = cfqd->cfq_slice_async_rq; + + WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); + + return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); +} + +/* + * Must be called with the queue_lock held. + */ +static int cfqq_process_refs(struct cfq_queue *cfqq) +{ + int process_refs, io_refs; + + io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; + process_refs = cfqq->ref - io_refs; + BUG_ON(process_refs < 0); + return process_refs; +} + +static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) +{ + int process_refs, new_process_refs; + struct cfq_queue *__cfqq; + + /* + * If there are no process references on the new_cfqq, then it is + * unsafe to follow the ->new_cfqq chain as other cfqq's in the + * chain may have dropped their last reference (not just their + * last process reference). + */ + if (!cfqq_process_refs(new_cfqq)) + return; + + /* Avoid a circular list and skip interim queue merges */ + while ((__cfqq = new_cfqq->new_cfqq)) { + if (__cfqq == cfqq) + return; + new_cfqq = __cfqq; + } + + process_refs = cfqq_process_refs(cfqq); + new_process_refs = cfqq_process_refs(new_cfqq); + /* + * If the process for the cfqq has gone away, there is no + * sense in merging the queues. + */ + if (process_refs == 0 || new_process_refs == 0) + return; + + /* + * Merge in the direction of the lesser amount of work. + */ + if (new_process_refs >= process_refs) { + cfqq->new_cfqq = new_cfqq; + new_cfqq->ref += process_refs; + } else { + new_cfqq->new_cfqq = cfqq; + cfqq->ref += new_process_refs; + } +} + +static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, + struct cfq_group *cfqg, enum wl_prio_t prio) +{ + struct cfq_queue *queue; + int i; + bool key_valid = false; + unsigned long lowest_key = 0; + enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; + + for (i = 0; i <= SYNC_WORKLOAD; ++i) { + /* select the one with lowest rb_key */ + queue = cfq_rb_first(service_tree_for(cfqg, prio, i)); + if (queue && + (!key_valid || time_before(queue->rb_key, lowest_key))) { + lowest_key = queue->rb_key; + cur_best = i; + key_valid = true; + } + } + + return cur_best; +} + +static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) +{ + unsigned slice; + unsigned count; + struct cfq_rb_root *st; + unsigned group_slice; + + if (!cfqg) { + cfqd->serving_prio = IDLE_WORKLOAD; + cfqd->workload_expires = jiffies + 1; + return; + } + + /* Choose next priority. RT > BE > IDLE */ + if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) + cfqd->serving_prio = RT_WORKLOAD; + else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) + cfqd->serving_prio = BE_WORKLOAD; + else { + cfqd->serving_prio = IDLE_WORKLOAD; + cfqd->workload_expires = jiffies + 1; + return; + } + + /* + * For RT and BE, we have to choose also the type + * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload + * expiration time + */ + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); + count = st->count; + + /* + * check workload expiration, and that we still have other queues ready + */ + if (count && !time_after(jiffies, cfqd->workload_expires)) + return; + + /* otherwise select new workload type */ + cfqd->serving_type = + cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); + st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); + count = st->count; + + /* + * the workload slice is computed as a fraction of target latency + * proportional to the number of queues in that workload, over + * all the queues in the same priority class + */ + group_slice = cfq_group_slice(cfqd, cfqg); + + slice = group_slice * count / + max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], + cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); - cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); - cfq_remove_request(rq); - cfqq->dispatched++; - elv_dispatch_sort(q, rq); + if (cfqd->serving_type == ASYNC_WORKLOAD) { + unsigned int tmp; + + /* + * Async queues are currently system wide. Just taking + * proportion of queues with-in same group will lead to higher + * async ratio system wide as generally root group is going + * to have higher weight. A more accurate thing would be to + * calculate system wide asnc/sync ratio. + */ + tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg); + tmp = tmp/cfqd->busy_queues; + slice = min_t(unsigned, slice, tmp); + + /* async workload slice is scaled down according to + * the sync/async slice ratio. */ + slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1]; + } else + /* sync workload slice is at least 2 * cfq_slice_idle */ + slice = max(slice, 2 * cfqd->cfq_slice_idle); - if (cfq_cfqq_sync(cfqq)) - cfqd->sync_flight++; + slice = max_t(unsigned, slice, CFQ_MIN_TT); + cfq_log(cfqd, "workload slice:%d", slice); + cfqd->workload_expires = jiffies + slice; } -/* - * return expired entry, or NULL to just start from scratch in rbtree - */ -static struct request *cfq_check_fifo(struct cfq_queue *cfqq) +static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) { - struct request *rq = NULL; - - if (cfq_cfqq_fifo_expire(cfqq)) - return NULL; - - cfq_mark_cfqq_fifo_expire(cfqq); + struct cfq_rb_root *st = &cfqd->grp_service_tree; + struct cfq_group *cfqg; - if (list_empty(&cfqq->fifo)) + if (RB_EMPTY_ROOT(&st->rb)) return NULL; - - rq = rq_entry_fifo(cfqq->fifo.next); - if (time_before(jiffies, rq_fifo_time(rq))) - rq = NULL; - - cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); - return rq; + cfqg = cfq_rb_first_group(st); + st->active = &cfqg->rb_node; + update_min_vdisktime(st); + return cfqg; } -static inline int -cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) +static void cfq_choose_cfqg(struct cfq_data *cfqd) { - const int base_rq = cfqd->cfq_slice_async_rq; + struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); - WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); + cfqd->serving_group = cfqg; - return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); + /* Restore the workload type data */ + if (cfqg->saved_workload_slice) { + cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; + cfqd->serving_type = cfqg->saved_workload; + cfqd->serving_prio = cfqg->saved_serving_prio; + } else + cfqd->workload_expires = jiffies - 1; + + choose_service_tree(cfqd, cfqg); } /* @@ -1186,36 +2466,110 @@ static struct cfq_queue *cfq_select_queu if (!cfqq) goto new_queue; + if (!cfqd->rq_queued) + return NULL; + /* - * The active queue has run out of time, expire it and select new. + * We were waiting for group to get backlogged. Expire the queue */ - if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) + if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) goto expire; /* + * The active queue has run out of time, expire it and select new. + */ + if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) { + /* + * If slice had not expired at the completion of last request + * we might not have turned on wait_busy flag. Don't expire + * the queue yet. Allow the group to get backlogged. + * + * The very fact that we have used the slice, that means we + * have been idling all along on this queue and it should be + * ok to wait for this request to complete. + */ + if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list) + && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { + cfqq = NULL; + goto keep_queue; + } else + goto check_group_idle; + } + + if (CFQD_IDLE_AUTODETECT(cfqd) && cfq_cfqq_deep(cfqq) && + cfqq->n_dispatched >= CFQQ_DEEP_THR) { + if (cfqq->first_dispatch == jiffies) + cfqd->cfq_disk_looks_fast++; + else + cfqd->cfq_disk_looks_slow++; + + cfqq->first_dispatch = 0; + cfqq->n_dispatched = 0; + cfq_clear_cfqq_deep(cfqq); + cfqd->cfq_disk_last_updated = jiffies; + } + + /* * The active queue has requests and isn't expired, allow it to * dispatch. */ if (!RB_EMPTY_ROOT(&cfqq->sort_list)) goto keep_queue; + if (CFQD_IDLE_AUTODETECT(cfqd)) { + if (cfq_cfqq_deep(cfqq)) { + cfqq->first_dispatch = 0; + cfqq->n_dispatched = 0; + cfq_clear_cfqq_deep(cfqq); + } + + if ((cfqd->cfq_disk_last_updated && + cfqd->cfq_fast_slow_expiration_rate > 0 && + jiffies - cfqd->cfq_disk_last_updated > + HZ * cfqd->cfq_fast_slow_expiration_rate) || + cfqd->cfq_disk_looks_fast > 128 || + cfqd->cfq_disk_looks_slow > 128) { + cfqd->cfq_disk_looks_fast >>= 1; + cfqd->cfq_disk_looks_slow >>= 1; + cfqd->cfq_disk_last_updated = jiffies; + } + } + /* * If another queue has a request waiting within our mean seek * distance, let it run. The expire code will check for close * cooperators and put the close queue at the front of the service - * tree. + * tree. If possible, merge the expiring queue with the new cfqq. */ - new_cfqq = cfq_close_cooperator(cfqd, cfqq, 0); - if (new_cfqq) + new_cfqq = cfq_close_cooperator(cfqd, cfqq); + if (new_cfqq) { + if (!cfqq->new_cfqq) + cfq_setup_merge(cfqq, new_cfqq); goto expire; + } /* * No requests pending. If the active queue still has requests in * flight or is idling for a new request, allow either of these * conditions to happen (or time out) before selecting a new queue. */ - if (timer_pending(&cfqd->idle_slice_timer) || - (cfqq->dispatched && cfq_cfqq_idle_window(cfqq))) { + if (timer_pending(&cfqd->idle_slice_timer)) { + cfqq = NULL; + goto keep_queue; + } + + if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { + cfqq = NULL; + goto keep_queue; + } + + /* + * If group idle is enabled and there are requests dispatched from + * this group, wait for requests to complete. + */ +check_group_idle: + if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 + && cfqq->cfqg->dispatched && !CFQD_DISK_LOOKS_FAST(cfqd)) { cfqq = NULL; goto keep_queue; } @@ -1223,6 +2577,13 @@ static struct cfq_queue *cfq_select_queu expire: cfq_slice_expired(cfqd, 0); new_queue: + /* + * Current queue expired. Check if we have to switch to a new + * service tree + */ + if (!new_cfqq) + cfq_choose_cfqg(cfqd); + cfqq = cfq_set_active_queue(cfqd, new_cfqq); keep_queue: return cfqq; @@ -1238,6 +2599,9 @@ static int __cfq_forced_dispatch_cfqq(st } BUG_ON(!list_empty(&cfqq->fifo)); + + /* By default cfqq is not expired if it is empty. Do it explicitly */ + __cfq_slice_expired(cfqq->cfqd, cfqq, 0); return dispatched; } @@ -1250,10 +2614,12 @@ static int cfq_forced_dispatch(struct cf struct cfq_queue *cfqq; int dispatched = 0; - while ((cfqq = cfq_rb_first(&cfqd->service_tree)) != NULL) - dispatched += __cfq_forced_dispatch_cfqq(cfqq); - + /* Expire the timeslice of the current active queue first */ cfq_slice_expired(cfqd, 0); + while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) { + __cfq_set_active_queue(cfqd, cfqq); + dispatched += __cfq_forced_dispatch_cfqq(cfqq); + } BUG_ON(cfqd->busy_queues); @@ -1261,6 +2627,19 @@ static int cfq_forced_dispatch(struct cf return dispatched; } +static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, + struct cfq_queue *cfqq) +{ + /* the queue hasn't finished any request, can't estimate */ + if (cfq_cfqq_slice_new(cfqq)) + return 1; + if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, + cfqq->slice_end)) + return 1; + + return 0; +} + static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned int max_dispatch; @@ -1268,16 +2647,16 @@ static bool cfq_may_dispatch(struct cfq_ /* * Drain async requests before we start sync IO */ - if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC]) + if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC]) return false; /* * If this is an async queue and we have sync IO in flight, let it wait */ - if (cfqd->sync_flight && !cfq_cfqq_sync(cfqq)) + if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq)) return false; - max_dispatch = cfqd->cfq_quantum; + max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1); if (cfq_class_idle(cfqq)) max_dispatch = 1; @@ -1294,13 +2673,22 @@ static bool cfq_may_dispatch(struct cfq_ /* * We have other queues, don't allow more IO from this one */ - if (cfqd->busy_queues > 1) + if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) return false; /* - * Sole queue user, allow bigger slice + * Sole queue user, no limit */ - max_dispatch *= 4; + if (cfqd->busy_queues == 1) + max_dispatch = -1; + else + /* + * Normally we start throttling cfqq when cfq_quantum/2 + * requests have been dispatched. But we can drive + * deeper queue depths at the beginning of slice + * subjected to upper limit of cfq_quantum. + * */ + max_dispatch = cfqd->cfq_quantum; } /* @@ -1309,7 +2697,7 @@ static bool cfq_may_dispatch(struct cfq_ * based on the last sync IO we serviced */ if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) { - unsigned long last_sync = jiffies - cfqd->last_end_sync_rq; + unsigned long last_sync = jiffies - cfqd->last_delayed_sync; unsigned int depth; depth = last_sync / cfqd->cfq_slice[1]; @@ -1407,28 +2795,36 @@ static int cfq_dispatch_requests(struct * task holds one reference to the queue, dropped when task exits. each rq * in-flight on this queue also holds a reference, dropped when rq is freed. * + * Each cfq queue took a reference on the parent group. Drop it now. * queue lock must be held here. */ static void cfq_put_queue(struct cfq_queue *cfqq) { struct cfq_data *cfqd = cfqq->cfqd; + struct cfq_group *cfqg, *orig_cfqg; - BUG_ON(atomic_read(&cfqq->ref) <= 0); + BUG_ON(cfqq->ref <= 0); - if (!atomic_dec_and_test(&cfqq->ref)) + cfqq->ref--; + if (cfqq->ref) return; cfq_log_cfqq(cfqd, cfqq, "put_queue"); BUG_ON(rb_first(&cfqq->sort_list)); BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); - BUG_ON(cfq_cfqq_on_rr(cfqq)); + cfqg = cfqq->cfqg; + orig_cfqg = cfqq->orig_cfqg; if (unlikely(cfqd->active_queue == cfqq)) { __cfq_slice_expired(cfqd, cfqq, 0); cfq_schedule_dispatch(cfqd); } + BUG_ON(cfq_cfqq_on_rr(cfqq)); kmem_cache_free(cfq_pool, cfqq); + cfq_put_cfqg(cfqg); + if (orig_cfqg) + cfq_put_cfqg(orig_cfqg); } /* @@ -1518,11 +2914,29 @@ static void cfq_free_io_context(struct i static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) { + struct cfq_queue *__cfqq, *next; + if (unlikely(cfqq == cfqd->active_queue)) { __cfq_slice_expired(cfqd, cfqq, 0); cfq_schedule_dispatch(cfqd); } + /* + * If this queue was scheduled to merge with another queue, be + * sure to drop the reference taken on that queue (and others in + * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs. + */ + __cfqq = cfqq->new_cfqq; + while (__cfqq) { + if (__cfqq == cfqq) { + WARN(1, "cfqq->new_cfqq loop detected\n"); + break; + } + next = __cfqq->new_cfqq; + cfq_put_queue(__cfqq); + __cfqq = next; + } + cfq_put_queue(cfqq); } @@ -1540,8 +2954,11 @@ static void __cfq_exit_single_io_context cic->dead_key = (unsigned long) cic->key; cic->key = NULL; - if (ioc->ioc_data == cic) + if (rcu_dereference(ioc->ioc_data) == cic) { + spin_lock(&ioc->lock); rcu_assign_pointer(ioc->ioc_data, NULL); + spin_unlock(&ioc->lock); + } if (cic->cfqq[BLK_RW_ASYNC]) { cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); @@ -1690,7 +3107,7 @@ static void cfq_init_cfqq(struct cfq_dat RB_CLEAR_NODE(&cfqq->p_node); INIT_LIST_HEAD(&cfqq->fifo); - atomic_set(&cfqq->ref, 0); + cfqq->ref = 0; cfqq->cfqd = cfqd; cfq_mark_cfqq_prio_changed(cfqq); @@ -1703,14 +3120,51 @@ static void cfq_init_cfqq(struct cfq_dat cfqq->pid = pid; } +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) +{ + struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); + struct cfq_data *cfqd = cic->key; + unsigned long flags; + struct request_queue *q; + + if (unlikely(!cfqd)) + return; + + q = cfqd->queue; + + spin_lock_irqsave(q->queue_lock, flags); + + if (sync_cfqq) { + /* + * Drop reference to sync queue. A new sync queue will be + * assigned in new group upon arrival of a fresh request. + */ + cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); + cic_set_cfqq(cic, NULL, 1); + cfq_put_queue(sync_cfqq); + } + + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void cfq_ioc_set_cgroup(struct io_context *ioc) +{ + call_for_each_cic(ioc, changed_cgroup); + ioc->cgroup_changed = 0; +} +#endif /* CONFIG_CFQ_GROUP_IOSCHED */ + static struct cfq_queue * cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, gfp_t gfp_mask) { struct cfq_queue *cfqq, *new_cfqq = NULL; struct cfq_io_context *cic; + struct cfq_group *cfqg; retry: + cfqg = cfq_get_cfqg(cfqd); cic = cfq_cic_lookup(cfqd, ioc); /* cic always exists here */ cfqq = cic_to_cfqq(cic, is_sync); @@ -1741,6 +3195,7 @@ retry: if (cfqq) { cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); cfq_init_prio_data(cfqq, ioc); + cfq_link_cfqq_cfqg(cfqq, cfqg); cfq_log_cfqq(cfqd, cfqq, "alloced"); } else cfqq = &cfqd->oom_cfqq; @@ -1788,11 +3243,11 @@ cfq_get_queue(struct cfq_data *cfqd, boo * pin the queue now that it's allocated, scheduler exit will prune it */ if (!is_sync && !(*async_cfqq)) { - atomic_inc(&cfqq->ref); + cfqq->ref++; *async_cfqq = cfqq; } - atomic_inc(&cfqq->ref); + cfqq->ref++; return cfqq; } @@ -1872,7 +3327,7 @@ static int cfq_cic_link(struct cfq_data unsigned long flags; int ret; - ret = radix_tree_preload(gfp_mask); + ret = radix_tree_maybe_preload(gfp_mask); if (!ret) { cic->ioc = ioc; cic->key = cfqd; @@ -1893,7 +3348,7 @@ static int cfq_cic_link(struct cfq_data } } - if (ret) + if (ret && ret != -EEXIST) printk(KERN_ERR "cfq: cic link failed!\n"); return ret; @@ -1909,6 +3364,7 @@ cfq_get_io_context(struct cfq_data *cfqd { struct io_context *ioc = NULL; struct cfq_io_context *cic; + int ret; might_sleep_if(gfp_mask & __GFP_WAIT); @@ -1916,6 +3372,7 @@ cfq_get_io_context(struct cfq_data *cfqd if (!ioc) return NULL; +retry: cic = cfq_cic_lookup(cfqd, ioc); if (cic) goto out; @@ -1924,7 +3381,12 @@ cfq_get_io_context(struct cfq_data *cfqd if (cic == NULL) goto err; - if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) + ret = cfq_cic_link(cfqd, ioc, cic, gfp_mask); + if (ret == -EEXIST) { + /* someone has linked cic to ioc already */ + cfq_cic_free(cic); + goto retry; + } else if (ret) goto err_free; out: @@ -1932,6 +3394,10 @@ out: if (unlikely(ioc->ioprio_changed)) cfq_ioc_set_ioprio(ioc); +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (unlikely(ioc->cgroup_changed)) + cfq_ioc_set_cgroup(ioc); +#endif return cic; err_free: cfq_cic_free(cic); @@ -1952,33 +3418,23 @@ cfq_update_io_thinktime(struct cfq_data } static void -cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, +cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct request *rq) { - sector_t sdist; - u64 total; - - if (!cic->last_request_pos) - sdist = 0; - else if (cic->last_request_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - cic->last_request_pos; - else - sdist = cic->last_request_pos - blk_rq_pos(rq); + sector_t sdist = 0; + sector_t n_sec = blk_rq_sectors(rq); + if (cfqq->last_request_pos) { + if (cfqq->last_request_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - cfqq->last_request_pos; + else + sdist = cfqq->last_request_pos - blk_rq_pos(rq); + } - /* - * Don't allow the seek distance to get too large from the - * odd fragment, pagein, etc - */ - if (cic->seek_samples <= 60) /* second&third seek */ - sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*1024); + cfqq->seek_history <<= 1; + if (blk_queue_nonrot(cfqd->queue)) + cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT); else - sdist = min(sdist, (cic->seek_mean * 4) + 2*1024*64); - - cic->seek_samples = (7*cic->seek_samples + 256) / 8; - cic->seek_total = (7*cic->seek_total + (u64)256*sdist) / 8; - total = cic->seek_total + (cic->seek_samples/2); - do_div(total, cic->seek_samples); - cic->seek_mean = (sector_t)total; + cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); } /* @@ -1999,14 +3455,20 @@ cfq_update_idle_window(struct cfq_data * enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); - if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || - (!cfqd->cfq_latency && cfqd->hw_tag && CIC_SEEKY(cic))) + if (cfqq->queued[0] + cfqq->queued[1] >= CFQQ_DEEP_THR && + (!CFQD_IDLE_AUTODETECT(cfqd) || cfq_cfqq_slice_new(cfqq))) + cfq_mark_cfqq_deep(cfqq); + + if (CFQD_DISK_LOOKS_FAST(cfqd)) + enable_idle = 0; + else if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE)) + enable_idle = 0; + else if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || + ((!cfq_cfqq_deep(cfqq) || CFQD_IDLE_AUTODETECT(cfqd)) + && CFQQ_SEEKY(cfqq))) enable_idle = 0; else if (sample_valid(cic->ttime_samples)) { - unsigned int slice_idle = cfqd->cfq_slice_idle; - if (sample_valid(cic->seek_samples) && CIC_SEEKY(cic)) - slice_idle = msecs_to_jiffies(CFQ_MIN_TT); - if (cic->ttime_mean > slice_idle) + if (cic->ttime_mean > cfqd->cfq_slice_idle) enable_idle = 0; else enable_idle = 1; @@ -2035,9 +3497,6 @@ cfq_should_preempt(struct cfq_data *cfqd if (!cfqq) return false; - if (cfq_slice_used(cfqq)) - return true; - if (cfq_class_idle(new_cfqq)) return false; @@ -2051,11 +3510,24 @@ cfq_should_preempt(struct cfq_data *cfqd if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) return true; + /* Allow preemption only if we are idling on sync-noidle tree */ + if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && + cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && + new_cfqq->service_tree->count == 1 + (new_cfqq->cfqg == cfqq->cfqg) && + RB_EMPTY_ROOT(&cfqq->sort_list)) + return true; + + if (new_cfqq->cfqg != cfqq->cfqg) + return false; + + if (cfq_slice_used(cfqq)) + return true; + /* * So both queues are sync. Let the new request get disk time if * it's a metadata request and the current queue is doing regular IO. */ - if (rq_is_meta(rq) && !cfqq->meta_pending) + if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending) return true; /* @@ -2071,16 +3543,8 @@ cfq_should_preempt(struct cfq_data *cfqd * if this request is as-good as one we would expect from the * current cfqq, let it preempt */ - if (cfq_rq_close(cfqd, rq) && (!cfq_cfqq_coop(new_cfqq) || - cfqd->busy_queues == 1)) { - /* - * Mark new queue coop_preempt, so its coop flag will not be - * cleared when new queue gets scheduled at the very first time - */ - cfq_mark_cfqq_coop_preempt(new_cfqq); - cfq_mark_cfqq_coop(new_cfqq); + if (cfq_rq_close(cfqd, cfqq, rq)) return true; - } return false; } @@ -2117,14 +3581,14 @@ cfq_rq_enqueued(struct cfq_data *cfqd, s struct cfq_io_context *cic = RQ_CIC(rq); cfqd->rq_queued++; - if (rq_is_meta(rq)) + if (rq->cmd_flags & REQ_META) cfqq->meta_pending++; cfq_update_io_thinktime(cfqd, cic); - cfq_update_io_seektime(cfqd, cic, rq); + cfq_update_io_seektime(cfqd, cfqq, rq); cfq_update_idle_window(cfqd, cfqq, cic); - cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); if (cfqq == cfqd->active_queue) { /* @@ -2140,10 +3604,14 @@ cfq_rq_enqueued(struct cfq_data *cfqd, s if (cfq_cfqq_wait_request(cfqq)) { if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || cfqd->busy_queues > 1) { - del_timer(&cfqd->idle_slice_timer); - __blk_run_queue(cfqd->queue); + cfq_del_timer(cfqd, cfqq); + cfq_clear_cfqq_wait_request(cfqq); + __blk_run_queue(cfqd->queue); + } else { + cfq_blkiocg_update_idle_time_stats( + &cfqq->cfqg->blkg); + cfq_mark_cfqq_must_dispatch(cfqq); } - cfq_mark_cfqq_must_dispatch(cfqq); } } else if (cfq_should_preempt(cfqd, cfqq, rq)) { /* @@ -2165,10 +3633,17 @@ static void cfq_insert_request(struct re cfq_log_cfqq(cfqd, cfqq, "insert_request"); cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); - cfq_add_rq_rb(rq); - rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); list_add_tail(&rq->queuelist, &cfqq->fifo); + cfq_add_rq_rb(rq); + cfq_blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg, + &cfqd->serving_group->blkg, rq_data_dir(rq), + rq_is_sync(rq)); + + if (get_exec_ub_top() != (RQ_CFQG(rq))->blkg.blk_ub) + ub_writeback_io(1, blk_rq_sectors(rq)); + + virtinfo_notifier_call_irq(VITYPE_IO, VIRTINFO_IO_OP_ACCOUNT, NULL); cfq_rq_enqueued(cfqd, cfqq, rq); } @@ -2179,23 +3654,102 @@ static void cfq_insert_request(struct re */ static void cfq_update_hw_tag(struct cfq_data *cfqd) { - if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak) - cfqd->rq_in_driver_peak = rq_in_driver(cfqd); + struct cfq_queue *cfqq = cfqd->active_queue; + + if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth) + cfqd->hw_tag_est_depth = cfqd->rq_in_driver; + + if (cfqd->hw_tag == 1) + return; if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && - rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN) + cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) + return; + + /* + * If active queue hasn't enough requests and can idle, cfq might not + * dispatch sufficient requests to hardware. Don't zero hw_tag in this + * case + */ + if (cfqq && cfq_cfqq_idle_window(cfqq) && + cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < + CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN) return; if (cfqd->hw_tag_samples++ < 50) return; - if (cfqd->rq_in_driver_peak >= CFQ_HW_QUEUE_MIN) + if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) cfqd->hw_tag = 1; else cfqd->hw_tag = 0; +} + +static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) +{ + struct cfq_io_context *cic = cfqd->active_cic; + + /* If there are other queues in the group, don't wait */ + if (cfqq->cfqg->nr_cfqq > 1) + return false; + + if (cfq_slice_used(cfqq)) + return true; + + /* if slice left is less than think time, wait busy */ + if (cic && sample_valid(cic->ttime_samples) + && (cfqq->slice_end - jiffies < cic->ttime_mean)) + return true; + + /* + * If think times is less than a jiffy than ttime_mean=0 and above + * will not be true. It might happen that slice has not expired yet + * but will expire soon (4-5 ns) during select_queue(). To cover the + * case where think time is less than a jiffy, mark the queue wait + * busy if only 1 jiffy is left in the slice. + */ + if (cfqq->slice_end - jiffies == 1) + return true; + + return false; +} + +static void +avg_calc_load(struct cfq_data *cfqd, int exp_idx, + unsigned long queued, unsigned long indriver) +{ + queued <<= FSHIFT; + indriver <<= FSHIFT; + + CALC_LOAD(cfqd->cfq_avg_queued[0], EXP_ARR[0][exp_idx], queued); + CALC_LOAD(cfqd->cfq_avg_queued[1], EXP_ARR[1][exp_idx], queued); + CALC_LOAD(cfqd->cfq_avg_queued[2], EXP_ARR[2][exp_idx], queued); + + CALC_LOAD(cfqd->cfq_avg_indriver[0], EXP_ARR[0][exp_idx], indriver); + CALC_LOAD(cfqd->cfq_avg_indriver[1], EXP_ARR[1][exp_idx], indriver); + CALC_LOAD(cfqd->cfq_avg_indriver[2], EXP_ARR[2][exp_idx], indriver); +} + +static void cfq_update_stats(struct cfq_data *cfqd, int indrv_delta) +{ + unsigned long now = jiffies; + + if (time_before(now, cfqd->cfq_calc_load_update)) + goto done; + + if (now - cfqd->cfq_calc_load_update >= CFQD_LOAD_FREQ) { + int idx = clamp(fls((now - cfqd->cfq_calc_load_update) + / CFQD_LOAD_FREQ)-1, 0, EXP_ARR_SIZ-1); + avg_calc_load(cfqd, idx, cfqd->cfq_queued_last, + cfqd->cfq_indriver_last); + } - cfqd->hw_tag_samples = 0; - cfqd->rq_in_driver_peak = 0; + avg_calc_load(cfqd, 0, cfqd->rq_queued, cfqd->rq_in_driver); + + cfqd->cfq_calc_load_update = now + CFQD_LOAD_FREQ; +done: + cfqd->cfq_queued_last = cfqd->rq_queued; + cfqd->cfq_indriver_last = cfqd->rq_in_driver + indrv_delta; } static void cfq_completed_request(struct request_queue *q, struct request *rq) @@ -2206,21 +3760,27 @@ static void cfq_completed_request(struct unsigned long now; now = jiffies; - cfq_log_cfqq(cfqd, cfqq, "complete"); + cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", + !!(rq->cmd_flags & REQ_NOIDLE)); cfq_update_hw_tag(cfqd); + cfq_update_stats(cfqd, -1); - WARN_ON(!cfqd->rq_in_driver[sync]); + WARN_ON(!cfqd->rq_in_driver); WARN_ON(!cfqq->dispatched); - cfqd->rq_in_driver[sync]--; + cfqd->rq_in_driver--; cfqq->dispatched--; + (RQ_CFQG(rq))->dispatched--; + cfq_blkiocg_update_completion_stats(&cfqq->cfqg->blkg, + rq_start_time_ns(rq), rq_io_start_time_ns(rq), + rq_data_dir(rq), rq_is_sync(rq)); - if (cfq_cfqq_sync(cfqq)) - cfqd->sync_flight--; + cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; if (sync) { RQ_CIC(rq)->last_end_request = now; - cfqd->last_end_sync_rq = now; + if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) + cfqd->last_delayed_sync = now; } /* @@ -2234,21 +3794,37 @@ static void cfq_completed_request(struct cfq_set_prio_slice(cfqd, cfqq); cfq_clear_cfqq_slice_new(cfqq); } + + /* + * Should we wait for next request to come in before we expire + * the queue. + */ + if (cfq_should_wait_busy(cfqd, cfqq)) { + unsigned long extend_sl = cfqd->cfq_slice_idle; + if (!cfqd->cfq_slice_idle) + extend_sl = cfqd->cfq_group_idle; + cfqq->slice_end = jiffies + extend_sl; + cfq_mark_cfqq_wait_busy(cfqq); + cfq_log_cfqq(cfqd, cfqq, "will busy wait"); + } + /* - * If there are no requests waiting in this queue, and - * there are other queues ready to issue requests, AND - * those other queues are issuing requests within our - * mean seek distance, give them a chance to run instead - * of idling. + * Idling is not enabled on: + * - expired queues + * - idle-priority queues + * - async queues + * - queues with still some requests queued + * - when there is a close cooperator */ if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) cfq_slice_expired(cfqd, 1); - else if (cfqq_empty && !cfq_close_cooperator(cfqd, cfqq, 1) && - sync && !rq_noidle(rq)) + else if (sync && cfqq_empty && + !cfq_close_cooperator(cfqd, cfqq)) { cfq_arm_slice_timer(cfqd); + } } - if (!rq_in_driver(cfqd)) + if (!cfqd->rq_in_driver) cfq_schedule_dispatch(cfqd); } @@ -2269,12 +3845,10 @@ static void cfq_prio_boost(struct cfq_qu cfqq->ioprio = IOPRIO_NORM; } else { /* - * check if we need to unboost the queue + * unboost the queue (if needed) */ - if (cfqq->ioprio_class != cfqq->org_ioprio_class) - cfqq->ioprio_class = cfqq->org_ioprio_class; - if (cfqq->ioprio != cfqq->org_ioprio) - cfqq->ioprio = cfqq->org_ioprio; + cfqq->ioprio_class = cfqq->org_ioprio_class; + cfqq->ioprio = cfqq->org_ioprio; } } @@ -2285,6 +3859,17 @@ static inline int __cfq_may_queue(struct return ELV_MQUEUE_MUST; } +#ifdef CONFIG_CFQ_GROUP_IOSCHED + /* + * force queueing first queue in the group after long sleep + */ + if (!cfq_cfqq_must_alloc_slice(cfqq) && cfqq->cfqg->nr_cfqq == 0 && + cfqg_key(&cfqq->cfqd->grp_service_tree, cfqq->cfqg) < 0) { + cfq_mark_cfqq_must_alloc_slice(cfqq); + return ELV_MQUEUE_MUST; + } +#endif + return ELV_MQUEUE_MAY; } @@ -2331,13 +3916,46 @@ static void cfq_put_request(struct reque put_io_context(RQ_CIC(rq)->ioc); - rq->elevator_private = NULL; - rq->elevator_private2 = NULL; + rq->elevator_private[0] = NULL; + rq->elevator_private[1] = NULL; + + /* Put down rq reference on cfqg */ + cfq_put_cfqg(RQ_CFQG(rq)); + rq->elevator_private[2] = NULL; cfq_put_queue(cfqq); } } +static struct cfq_queue * +cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, + struct cfq_queue *cfqq) +{ + cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); + cic_set_cfqq(cic, cfqq->new_cfqq, 1); + cfq_mark_cfqq_coop(cfqq->new_cfqq); + cfq_put_queue(cfqq); + return cic_to_cfqq(cic, 1); +} + +/* + * Returns NULL if a new cfqq should be allocated, or the old cfqq if this + * was the last process referring to said cfqq. + */ +static struct cfq_queue * +split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) +{ + if (cfqq_process_refs(cfqq) == 1) { + cfqq->pid = current->pid; + cfq_clear_cfqq_coop(cfqq); + cfq_clear_cfqq_split_coop(cfqq); + return cfqq; + } + + cic_set_cfqq(cic, NULL, 1); + cfq_put_queue(cfqq); + return NULL; +} /* * Allocate cfq data structures associated with this request. */ @@ -2360,19 +3978,41 @@ cfq_set_request(struct request_queue *q, if (!cic) goto queue_fail; +new_queue: cfqq = cic_to_cfqq(cic, is_sync); if (!cfqq || cfqq == &cfqd->oom_cfqq) { cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); cic_set_cfqq(cic, cfqq, is_sync); + } else { + /* + * If the queue was seeky for too long, break it apart. + */ + if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { + cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); + cfqq = split_cfqq(cic, cfqq); + if (!cfqq) + goto new_queue; + } + + /* + * Check to see if this queue is scheduled to merge with + * another, closely cooperating queue. The merging of + * queues happens here as it must be done in process context. + * The reference on new_cfqq was taken in merge_cfqqs. + */ + if (cfqq->new_cfqq) + cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); } cfqq->allocated[rw]++; - atomic_inc(&cfqq->ref); + cfqq->ref++; + + rq->elevator_private[0] = cic; + rq->elevator_private[1] = cfqq; + rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg); spin_unlock_irqrestore(q->queue_lock, flags); - rq->elevator_private = cic; - rq->elevator_private2 = cfqq; return 0; queue_fail: @@ -2438,6 +4078,11 @@ static void cfq_idle_slice_timer(unsigne */ if (!RB_EMPTY_ROOT(&cfqq->sort_list)) goto out_kick; + + /* + * Queue depth flag is reset only when the idle didn't succeed + */ + cfq_clear_cfqq_deep(cfqq); } expire: cfq_slice_expired(cfqd, timed_out); @@ -2472,6 +4117,7 @@ static void cfq_exit_queue(struct elevat { struct cfq_data *cfqd = e->elevator_data; struct request_queue *q = cfqd->queue; + bool wait = false; cfq_shutdown_timer_wq(cfqd); @@ -2489,25 +4135,95 @@ static void cfq_exit_queue(struct elevat } cfq_put_async_queues(cfqd); + cfq_release_cfq_groups(cfqd); + + /* + * If there are groups which we could not unlink from blkcg list, + * wait for a rcu period for them to be freed. + */ + if (cfqd->nr_blkcg_linked_grps) + wait = true; spin_unlock_irq(q->queue_lock); cfq_shutdown_timer_wq(cfqd); + /* + * Wait for cfqg->blkg->key accessors to exit their grace periods. + * Do this wait only if there are other unlinked groups out + * there. This can happen if cgroup deletion path claimed the + * responsibility of cleaning up a group before queue cleanup code + * get to the group. + * + * Do not call synchronize_rcu() unconditionally as there are drivers + * which create/delete request queue hundreds of times during scan/boot + * and synchronize_rcu() can take significant time and slow down boot. + */ + if (wait) + synchronize_rcu(); + + BUG_ON(cfqd->nr_blkcg_linked_grps); + +#ifdef CONFIG_CFQ_GROUP_IOSCHED + /* Free up per cpu stats for root group */ + blkio_free_blkg_stats(&cfqd->root_group.blkg); +#endif kfree(cfqd); } static void *cfq_init_queue(struct request_queue *q) { struct cfq_data *cfqd; - int i; + int i, j; + struct cfq_group *cfqg; + struct cfq_rb_root *st; cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); if (!cfqd) return NULL; + /* + * Don't need take queue_lock in the routine, since we are + * initializing the ioscheduler, and nobody is using cfqd + */ + + /* Init root service tree */ + cfqd->grp_service_tree = CFQ_RB_ROOT; + + /* Init root group */ + cfqg = &cfqd->root_group; + for_each_cfqg_st(cfqg, i, j, st) + *st = CFQ_RB_ROOT; + RB_CLEAR_NODE(&cfqg->rb_node); + + /* Give preference to root group over other groups */ + cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; + +#ifdef CONFIG_CFQ_GROUP_IOSCHED + /* + * Set root group reference to 2. One reference will be dropped when + * all groups on cfqd->cfqg_list are being deleted during queue exit. + * Other reference will remain there as we don't want to delete this + * group as it is statically allocated and gets destroyed when + * throtl_data goes away. + */ + cfqg->ref = 2; + + if (blkio_alloc_blkg_stats(&cfqg->blkg)) { + kfree(cfqg); + kfree(cfqd); + return NULL; + } - cfqd->service_tree = CFQ_RB_ROOT; + rcu_read_lock(); + + cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, + (void *)cfqd, 0); + rcu_read_unlock(); + cfqd->nr_blkcg_linked_grps++; + /* Add group on cfqd->cfqg_list */ + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); +#endif /* * Not strictly needed (since RB_ROOT just clears the node and we * zeroed cfqd on alloc), but better be safe in case someone decides @@ -2522,7 +4238,8 @@ static void *cfq_init_queue(struct reque * will not attempt to free it. */ cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); - atomic_inc(&cfqd->oom_cfqq.ref); + cfqd->oom_cfqq.ref++; + cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); INIT_LIST_HEAD(&cfqd->cic_list); @@ -2543,9 +4260,17 @@ static void *cfq_init_queue(struct reque cfqd->cfq_slice[1] = cfq_slice_sync; cfqd->cfq_slice_async_rq = cfq_slice_async_rq; cfqd->cfq_slice_idle = cfq_slice_idle; + cfqd->cfq_group_idle = cfq_group_idle; cfqd->cfq_latency = 1; - cfqd->hw_tag = 1; - cfqd->last_end_sync_rq = jiffies; + cfqd->cfq_group_isolation = 1; + cfqd->hw_tag = -1; + cfqd->cfq_fast_slow_expiration_rate = cfq_fast_slow_expiration_rate; + cfqd->cfq_calc_load_update = jiffies + CFQD_LOAD_FREQ; + /* + * we optimistically start assuming sync ops weren't delayed in last + * second, in order to have larger depth for async operations. + */ + cfqd->last_delayed_sync = jiffies - HZ; return cfqd; } @@ -2586,6 +4311,21 @@ cfq_var_show(unsigned int var, char *pag return sprintf(page, "%d\n", var); } +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + +static ssize_t +cfq_var_avg_show(unsigned long *var, char *page) +{ + unsigned long avg1 = var[0] + FIXED_1/200; + unsigned long avg2 = var[1] + FIXED_1/200; + unsigned long avg3 = var[2] + FIXED_1/200; + return sprintf(page, "%lu.%02lu %lu.%02lu %lu.%02lu\n", + LOAD_INT(avg1), LOAD_FRAC(avg1), + LOAD_INT(avg2), LOAD_FRAC(avg2), + LOAD_INT(avg3), LOAD_FRAC(avg3)); +} + static ssize_t cfq_var_store(unsigned int *var, const char *page, size_t count) { @@ -2604,16 +4344,36 @@ static ssize_t __FUNC(struct elevator_qu __data = jiffies_to_msecs(__data); \ return cfq_var_show(__data, (page)); \ } + +#define SHOW_FUNCTION_AVG(__FUNC, __VAR) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct cfq_data *cfqd = e->elevator_data; \ + unsigned long *__data = __VAR; \ + spin_lock_irq(cfqd->queue->queue_lock); \ + cfq_update_stats(cfqd, 0); \ + spin_unlock_irq(cfqd->queue->queue_lock); \ + return cfq_var_avg_show(__data, (page)); \ +} SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); +SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1); SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); +SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); +SHOW_FUNCTION(cfq_enable_idle_for_deep_show, cfqd->cfq_enable_idle_for_deep, 0); +SHOW_FUNCTION(cfq_disk_looks_fast_show, cfqd->cfq_disk_looks_fast, 0); +SHOW_FUNCTION(cfq_disk_looks_slow_show, cfqd->cfq_disk_looks_slow, 0); +SHOW_FUNCTION(cfq_fast_slow_expiration_rate_show, cfqd->cfq_fast_slow_expiration_rate, 0); +SHOW_FUNCTION_AVG(cfq_queued_avg_show, cfqd->cfq_avg_queued); +SHOW_FUNCTION_AVG(cfq_in_driver_avg_show, cfqd->cfq_avg_indriver); +SHOW_FUNCTION(cfq_hw_tag_show, cfqd->hw_tag, 0); #undef SHOW_FUNCTION #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ @@ -2632,6 +4392,14 @@ static ssize_t __FUNC(struct elevator_qu *(__PTR) = __data; \ return ret; \ } + +#define STORE_FUNCTION_AVG(__FUNC, __PTR) \ +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct cfq_data *cfqd = e->elevator_data; \ + __PTR[0] = __PTR[1] = __PTR[2] = 0; \ + return count; \ +} STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, UINT_MAX, 1); @@ -2641,11 +4409,20 @@ STORE_FUNCTION(cfq_back_seek_max_store, STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); +STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, UINT_MAX, 0); STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); +STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0); +STORE_FUNCTION(cfq_enable_idle_for_deep_store, &cfqd->cfq_enable_idle_for_deep, 0, UINT_MAX, 0); +STORE_FUNCTION(cfq_disk_looks_fast_store, &cfqd->cfq_disk_looks_fast, 0, UINT_MAX, 0); +STORE_FUNCTION(cfq_disk_looks_slow_store, &cfqd->cfq_disk_looks_slow, 0, UINT_MAX, 0); +STORE_FUNCTION(cfq_fast_slow_expiration_rate_store, &cfqd->cfq_fast_slow_expiration_rate, 0, UINT_MAX, 0); +STORE_FUNCTION_AVG(cfq_queued_avg_store, cfqd->cfq_avg_queued); +STORE_FUNCTION_AVG(cfq_in_driver_avg_store, cfqd->cfq_avg_indriver); +STORE_FUNCTION(cfq_hw_tag_store, &cfqd->hw_tag, 0, UINT_MAX, 0); #undef STORE_FUNCTION #define CFQ_ATTR(name) \ @@ -2661,7 +4438,16 @@ static struct elv_fs_entry cfq_attrs[] = CFQ_ATTR(slice_async), CFQ_ATTR(slice_async_rq), CFQ_ATTR(slice_idle), + CFQ_ATTR(group_idle), CFQ_ATTR(low_latency), + CFQ_ATTR(group_isolation), + CFQ_ATTR(enable_idle_for_deep), + CFQ_ATTR(disk_looks_fast), + CFQ_ATTR(disk_looks_slow), + CFQ_ATTR(fast_slow_expiration_rate), + CFQ_ATTR(queued_avg), + CFQ_ATTR(in_driver_avg), + CFQ_ATTR(hw_tag), __ATTR_NULL }; @@ -2671,6 +4457,7 @@ static struct elevator_type iosched_cfq .elevator_merged_fn = cfq_merged_request, .elevator_merge_req_fn = cfq_merged_requests, .elevator_allow_merge_fn = cfq_allow_merge, + .elevator_bio_merged_fn = cfq_bio_merged, .elevator_dispatch_fn = cfq_dispatch_requests, .elevator_add_req_fn = cfq_insert_request, .elevator_activate_req_fn = cfq_activate_request, @@ -2691,6 +4478,18 @@ static struct elevator_type iosched_cfq .elevator_owner = THIS_MODULE, }; +#ifdef CONFIG_CFQ_GROUP_IOSCHED +static struct blkio_policy_type blkio_policy_cfq = { + .ops = { + .blkio_unlink_group_fn = cfq_unlink_blkio_group, + .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, + }, + .plid = BLKIO_POLICY_PROP, +}; +#else +static struct blkio_policy_type blkio_policy_cfq; +#endif + static int __init cfq_init(void) { /* @@ -2701,10 +4500,17 @@ static int __init cfq_init(void) if (!cfq_slice_idle) cfq_slice_idle = 1; +#ifdef CONFIG_CFQ_GROUP_IOSCHED + if (!cfq_group_idle) + cfq_group_idle = 1; +#else + cfq_group_idle = 0; +#endif if (cfq_slab_setup()) return -ENOMEM; elv_register(&iosched_cfq); + blkio_policy_register(&blkio_policy_cfq); return 0; } @@ -2712,6 +4518,7 @@ static int __init cfq_init(void) static void __exit cfq_exit(void) { DECLARE_COMPLETION_ONSTACK(all_gone); + blkio_policy_unregister(&blkio_policy_cfq); elv_unregister(&iosched_cfq); ioc_gone = &all_gone; /* ioc_gone's update must be visible before reading ioc_count */ diff -uprN linux-2.6.32/block/compat_ioctl.c linux-2.6.32.ovz/block/compat_ioctl.c --- linux-2.6.32/block/compat_ioctl.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/compat_ioctl.c 2015-03-10 13:21:48.000000000 -0700 @@ -747,6 +747,8 @@ long compat_blkdev_ioctl(struct file *fi return compat_put_uint(arg, bdev_io_opt(bdev)); case BLKALIGNOFF: return compat_put_int(arg, bdev_alignment_offset(bdev)); + case BLKDISCARDZEROES: + return compat_put_uint(arg, bdev_discard_zeroes_data(bdev)); case BLKFLSBUF: case BLKROSET: case BLKDISCARD: @@ -790,13 +792,13 @@ long compat_blkdev_ioctl(struct file *fi bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKGETSIZE: - size = bdev->bd_inode->i_size; + size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) return -EFBIG; return compat_put_ulong(arg, size >> 9); case BLKGETSIZE64_32: - return compat_put_u64(arg, bdev->bd_inode->i_size); + return compat_put_u64(arg, i_size_read(bdev->bd_inode)); case BLKTRACESETUP32: lock_kernel(); diff -uprN linux-2.6.32/block/deadline-iosched.c linux-2.6.32.ovz/block/deadline-iosched.c --- linux-2.6.32/block/deadline-iosched.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/deadline-iosched.c 2015-06-23 04:16:16.000000000 -0700 @@ -13,6 +13,7 @@ #include #include #include +#include /* * See Documentation/block/deadline-iosched.txt @@ -77,10 +78,8 @@ static void deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) { struct rb_root *root = deadline_rb_root(dd, rq); - struct request *__alias; - while (unlikely(__alias = elv_rb_add(root, rq))) - deadline_move_request(dd, __alias); + elv_rb_add(root, rq); } static inline void @@ -110,6 +109,8 @@ deadline_add_request(struct request_queu */ rq_set_fifo_time(rq, jiffies + dd->fifo_expire[data_dir]); list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); + ub_writeback_io(1, blk_rq_sectors(rq)); + virtinfo_notifier_call_irq(VITYPE_IO, VIRTINFO_IO_OP_ACCOUNT, NULL); } /* @@ -188,6 +189,12 @@ deadline_merged_requests(struct request_ deadline_remove_request(q, next); } +static void deadline_bio_merged(struct request_queue *q, struct request *req, + struct bio *bio) +{ + ub_writeback_io(0, bio_sectors(bio)); +} + /* * move request from sort list to dispatch queue. */ @@ -443,6 +450,7 @@ static struct elevator_type iosched_dead .elevator_merge_fn = deadline_merge, .elevator_merged_fn = deadline_merged_request, .elevator_merge_req_fn = deadline_merged_requests, + .elevator_bio_merged_fn = deadline_bio_merged, .elevator_dispatch_fn = deadline_dispatch_requests, .elevator_add_req_fn = deadline_add_request, .elevator_queue_empty_fn = deadline_queue_empty, diff -uprN linux-2.6.32/block/elevator.c linux-2.6.32.ovz/block/elevator.c --- linux-2.6.32/block/elevator.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/elevator.c 2015-06-23 04:16:16.000000000 -0700 @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -71,34 +70,9 @@ static int elv_iosched_allow_merge(struc /* * can we safely merge with this request? */ -int elv_rq_merge_ok(struct request *rq, struct bio *bio) +bool elv_rq_merge_ok(struct request *rq, struct bio *bio) { - if (!rq_mergeable(rq)) - return 0; - - /* - * Don't merge file system requests and discard requests - */ - if (bio_rw_flagged(bio, BIO_RW_DISCARD) != - bio_rw_flagged(rq->bio, BIO_RW_DISCARD)) - return 0; - - /* - * different data direction or already started, don't merge - */ - if (bio_data_dir(bio) != rq_data_dir(rq)) - return 0; - - /* - * must be same device and not a special request - */ - if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) - return 0; - - /* - * only merge integrity protected bio into ditto rq - */ - if (bio_integrity(bio) != blk_integrity_rq(rq)) + if (!blk_rq_merge_ok(rq, bio)) return 0; if (!elv_iosched_allow_merge(rq, bio)) @@ -108,23 +82,6 @@ int elv_rq_merge_ok(struct request *rq, } EXPORT_SYMBOL(elv_rq_merge_ok); -static inline int elv_try_merge(struct request *__rq, struct bio *bio) -{ - int ret = ELEVATOR_NO_MERGE; - - /* - * we can merge and sequence is ok, check if it's possible - */ - if (elv_rq_merge_ok(__rq, bio)) { - if (blk_rq_pos(__rq) + blk_rq_sectors(__rq) == bio->bi_sector) - ret = ELEVATOR_BACK_MERGE; - else if (blk_rq_pos(__rq) - bio_sectors(bio) == bio->bi_sector) - ret = ELEVATOR_FRONT_MERGE; - } - - return ret; -} - static struct elevator_type *elevator_find(const char *name) { struct elevator_type *e; @@ -248,9 +205,11 @@ int elevator_init(struct request_queue * { struct elevator_type *e = NULL; struct elevator_queue *eq; - int ret = 0; void *data; + if (unlikely(q->elevator)) + return 0; + INIT_LIST_HEAD(&q->queue_head); q->last_merge = NULL; q->end_sector = 0; @@ -290,7 +249,7 @@ int elevator_init(struct request_queue * } elevator_attach(q, eq, data); - return ret; + return 0; } EXPORT_SYMBOL(elevator_init); @@ -357,7 +316,7 @@ static struct request *elv_rqhash_find(s * RB-tree support functions for inserting/lookup/removal of requests * in a sorted RB tree. */ -struct request *elv_rb_add(struct rb_root *root, struct request *rq) +void elv_rb_add(struct rb_root *root, struct request *rq) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -369,15 +328,12 @@ struct request *elv_rb_add(struct rb_roo if (blk_rq_pos(rq) < blk_rq_pos(__rq)) p = &(*p)->rb_left; - else if (blk_rq_pos(rq) > blk_rq_pos(__rq)) + else if (blk_rq_pos(rq) >= blk_rq_pos(__rq)) p = &(*p)->rb_right; - else - return __rq; } rb_link_node(&rq->rb_node, parent, p); rb_insert_color(&rq->rb_node, root); - return NULL; } EXPORT_SYMBOL(elv_rb_add); @@ -432,7 +388,8 @@ void elv_dispatch_sort(struct request_qu list_for_each_prev(entry, &q->queue_head) { struct request *pos = list_entry_rq(entry); - if (blk_discard_rq(rq) != blk_discard_rq(pos)) + if ((rq->cmd_flags & REQ_DISCARD) != + (pos->cmd_flags & REQ_DISCARD)) break; if (rq_data_dir(rq) != rq_data_dir(pos)) break; @@ -482,8 +439,8 @@ int elv_merge(struct request_queue *q, s /* * First try one-hit cache. */ - if (q->last_merge) { - ret = elv_try_merge(q->last_merge, bio); + if (q->last_merge && elv_rq_merge_ok(q->last_merge, bio)) { + ret = blk_try_merge(q->last_merge, bio); if (ret != ELEVATOR_NO_MERGE) { *req = q->last_merge; return ret; @@ -536,6 +493,15 @@ void elv_merge_requests(struct request_q q->last_merge = rq; } +void elv_bio_merged(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (e->ops->elevator_bio_merged_fn) + e->ops->elevator_bio_merged_fn(q, rq, bio); +} + void elv_requeue_request(struct request_queue *q, struct request *rq) { /* @@ -544,7 +510,7 @@ void elv_requeue_request(struct request_ */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]--; - if (blk_sorted_rq(rq)) + if (rq->cmd_flags & REQ_SORTED) elv_deactivate_rq(q, rq); } @@ -556,49 +522,39 @@ void elv_requeue_request(struct request_ void elv_drain_elevator(struct request_queue *q) { static int printed; + + lockdep_assert_held(q->queue_lock); + while (q->elevator->ops->elevator_dispatch_fn(q, 1)) ; - if (q->nr_sorted == 0) - return; - if (printed++ < 10) { + if (q->nr_sorted && printed++ < 10) { printk(KERN_ERR "%s: forced dispatching is broken " "(nr_sorted=%u), please report this\n", q->elevator->elevator_type->elevator_name, q->nr_sorted); } } -/* - * Call with queue lock held, interrupts disabled - */ void elv_quiesce_start(struct request_queue *q) { if (!q->elevator) return; + spin_lock_irq(q->queue_lock); queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); + spin_unlock_irq(q->queue_lock); - /* - * make sure we don't have any requests in flight - */ - elv_drain_elevator(q); - while (q->rq.elvpriv) { - __blk_run_queue(q); - spin_unlock_irq(q->queue_lock); - msleep(10); - spin_lock_irq(q->queue_lock); - elv_drain_elevator(q); - } + blk_drain_queue(q, false); } void elv_quiesce_end(struct request_queue *q) { + spin_lock_irq(q->queue_lock); queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); + spin_unlock_irq(q->queue_lock); } void elv_insert(struct request_queue *q, struct request *rq, int where) { - struct list_head *pos; - unsigned ordseq; int unplug_it = 1; trace_block_rq_insert(q, rq); @@ -606,9 +562,16 @@ void elv_insert(struct request_queue *q, rq->q = q; switch (where) { + case ELEVATOR_INSERT_REQUEUE: + /* + * Most requeues happen because of a busy condition, + * don't force unplug of the queue for that case. + * Clear unplug_it and fall through. + */ + unplug_it = 0; + case ELEVATOR_INSERT_FRONT: rq->cmd_flags |= REQ_SOFTBARRIER; - list_add(&rq->queuelist, &q->queue_head); break; @@ -630,7 +593,7 @@ void elv_insert(struct request_queue *q, break; case ELEVATOR_INSERT_SORT: - BUG_ON(!blk_fs_request(rq) && !blk_discard_rq(rq)); + BUG_ON(rq->cmd_type != REQ_TYPE_FS); rq->cmd_flags |= REQ_SORTED; q->nr_sorted++; if (rq_mergeable(rq)) { @@ -647,34 +610,9 @@ void elv_insert(struct request_queue *q, q->elevator->ops->elevator_add_req_fn(q, rq); break; - case ELEVATOR_INSERT_REQUEUE: - /* - * If ordered flush isn't in progress, we do front - * insertion; otherwise, requests should be requeued - * in ordseq order. - */ + case ELEVATOR_INSERT_FLUSH: rq->cmd_flags |= REQ_SOFTBARRIER; - - /* - * Most requeues happen because of a busy condition, - * don't force unplug of the queue for that case. - */ - unplug_it = 0; - - if (q->ordseq == 0) { - list_add(&rq->queuelist, &q->queue_head); - break; - } - - ordseq = blk_ordered_req_seq(rq); - - list_for_each(pos, &q->queue_head) { - struct request *pos_rq = list_entry_rq(pos); - if (ordseq <= blk_ordered_req_seq(pos_rq)) - break; - } - - list_add_tail(&rq->queuelist, pos); + blk_insert_flush(rq); break; default: @@ -695,27 +633,9 @@ void elv_insert(struct request_queue *q, void __elv_add_request(struct request_queue *q, struct request *rq, int where, int plug) { - if (q->ordcolor) - rq->cmd_flags |= REQ_ORDERED_COLOR; - if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { - /* - * toggle ordered color - */ - if (blk_barrier_rq(rq)) - q->ordcolor ^= 1; - - /* - * barriers implicitly indicate back insertion - */ - if (where == ELEVATOR_INSERT_SORT) - where = ELEVATOR_INSERT_BACK; - - /* - * this request is scheduling boundary, update - * end_sector - */ - if (blk_fs_request(rq) || blk_discard_rq(rq)) { + /* barriers are scheduling boundary, update end_sector */ + if (rq->cmd_type == REQ_TYPE_FS) { q->end_sector = rq_end_sector(rq); q->boundary_rq = rq; } @@ -780,7 +700,7 @@ int elv_set_request(struct request_queue if (e->ops->elevator_set_req_fn) return e->ops->elevator_set_req_fn(q, rq, gfp_mask); - rq->elevator_private = NULL; + rq->elevator_private[0] = NULL; return 0; } @@ -806,6 +726,8 @@ void elv_abort_queue(struct request_queu { struct request *rq; + blk_abort_flushes(q); + while (!list_empty(&q->queue_head)) { rq = list_entry_rq(q->queue_head.next); rq->cmd_flags |= REQ_QUIET; @@ -829,27 +751,10 @@ void elv_completed_request(struct reques */ if (blk_account_rq(rq)) { q->in_flight[rq_is_sync(rq)]--; - if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn) + if ((rq->cmd_flags & REQ_SORTED) && + e->ops->elevator_completed_req_fn) e->ops->elevator_completed_req_fn(q, rq); } - - /* - * Check if the queue is waiting for fs requests to be - * drained for flush sequence. - */ - if (unlikely(q->ordseq)) { - struct request *next = NULL; - - if (!list_empty(&q->queue_head)) - next = list_entry_rq(q->queue_head.next); - - if (!queue_in_flight(q) && - blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN && - (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) { - blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0); - __blk_run_queue(q); - } - } } #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) @@ -889,7 +794,7 @@ elv_attr_store(struct kobject *kobj, str return error; } -static struct sysfs_ops elv_sysfs_ops = { +static const struct sysfs_ops elv_sysfs_ops = { .show = elv_attr_show, .store = elv_attr_store, }; @@ -915,14 +820,17 @@ int elv_register_queue(struct request_qu } } kobject_uevent(&e->kobj, KOBJ_ADD); + e->registered = 1; } return error; } +EXPORT_SYMBOL(elv_register_queue); static void __elv_unregister_queue(struct elevator_queue *e) { kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); + e->registered = 0; } void elv_unregister_queue(struct request_queue *q) @@ -930,6 +838,7 @@ void elv_unregister_queue(struct request if (q) __elv_unregister_queue(q->elevator); } +EXPORT_SYMBOL(elv_unregister_queue); void elv_register(struct elevator_type *e) { @@ -959,12 +868,12 @@ void elv_unregister(struct elevator_type */ if (e->ops.trim) { read_lock(&tasklist_lock); - do_each_thread(g, p) { + do_each_thread_all(g, p) { task_lock(p); if (p->io_context) e->ops.trim(p->io_context); task_unlock(p); - } while_each_thread(g, p); + } while_each_thread_all(g, p); read_unlock(&tasklist_lock); } @@ -984,24 +893,24 @@ static int elevator_switch(struct reques { struct elevator_queue *old_elevator, *e; void *data; + int err; /* * Allocate new elevator */ e = elevator_alloc(q, new_e); if (!e) - return 0; + return -ENOMEM; data = elevator_init_queue(q, e); if (!data) { kobject_put(&e->kobj); - return 0; + return -ENOMEM; } /* * Turn on BYPASS and drain all requests w/ elevator private data */ - spin_lock_irq(q->queue_lock); elv_quiesce_start(q); /* @@ -1012,26 +921,27 @@ static int elevator_switch(struct reques /* * attach and start new elevator */ + spin_lock_irq(q->queue_lock); elevator_attach(q, e, data); - spin_unlock_irq(q->queue_lock); - __elv_unregister_queue(old_elevator); + if (old_elevator->registered) { + __elv_unregister_queue(old_elevator); - if (elv_register_queue(q)) - goto fail_register; + err = elv_register_queue(q); + if (err) + goto fail_register; + } /* * finally exit old elevator and turn off BYPASS. */ elevator_exit(old_elevator); - spin_lock_irq(q->queue_lock); elv_quiesce_end(q); - spin_unlock_irq(q->queue_lock); blk_add_trace_msg(q, "elv switch: %s", e->elevator_type->elevator_name); - return 1; + return 0; fail_register: /* @@ -1041,22 +951,21 @@ fail_register: elevator_exit(e); q->elevator = old_elevator; elv_register_queue(q); + elv_quiesce_end(q); - spin_lock_irq(q->queue_lock); - queue_flag_clear(QUEUE_FLAG_ELVSWITCH, q); - spin_unlock_irq(q->queue_lock); - - return 0; + return err; } -ssize_t elv_iosched_store(struct request_queue *q, const char *name, - size_t count) +/* + * Switch this queue to the given IO scheduler. + */ +int elevator_change(struct request_queue *q, const char *name) { char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; if (!q->elevator) - return count; + return -ENXIO; strlcpy(elevator_name, name, sizeof(elevator_name)); e = elevator_get(strstrip(elevator_name)); @@ -1067,13 +976,27 @@ ssize_t elv_iosched_store(struct request if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) { elevator_put(e); - return count; + return 0; } - if (!elevator_switch(q, e)) - printk(KERN_ERR "elevator: switch to %s failed\n", - elevator_name); - return count; + return elevator_switch(q, e); +} +EXPORT_SYMBOL(elevator_change); + +ssize_t elv_iosched_store(struct request_queue *q, const char *name, + size_t count) +{ + int ret; + + if (!q->elevator) + return count; + + ret = elevator_change(q, name); + if (!ret) + return count; + + printk(KERN_ERR "elevator: switch to %s failed\n", name); + return ret; } ssize_t elv_iosched_show(struct request_queue *q, char *name) @@ -1083,7 +1006,7 @@ ssize_t elv_iosched_show(struct request_ struct elevator_type *__e; int len = 0; - if (!q->elevator) + if (!q->elevator || !blk_queue_stackable(q)) return sprintf(name, "none\n"); elv = e->elevator_type; diff -uprN linux-2.6.32/block/genhd.c linux-2.6.32.ovz/block/genhd.c --- linux-2.6.32/block/genhd.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/genhd.c 2015-06-23 04:16:16.000000000 -0700 @@ -18,13 +18,12 @@ #include #include #include +#include #include "blk.h" static DEFINE_MUTEX(block_class_lock); -#ifndef CONFIG_SYSFS_DEPRECATED struct kobject *block_depr; -#endif /* for extended dynamic devt allocation, currently only one major is used */ #define MAX_EXT_DEVT (1 << MINORBITS) @@ -35,7 +34,7 @@ struct kobject *block_depr; static DEFINE_MUTEX(ext_devt_mutex); static DEFINE_IDR(ext_devt_idr); -static struct device_type disk_type; +struct device_type disk_type; /** * disk_get_part - get partition @@ -151,7 +150,7 @@ struct hd_struct *disk_part_iter_next(st part = rcu_dereference(ptbl->part[piter->idx]); if (!part) continue; - if (!part->nr_sects && + if (!part_nr_sects_read(part) && !(piter->flags & DISK_PITER_INCL_EMPTY) && !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && piter->idx == 0)) @@ -188,7 +187,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); static inline int sector_in_part(struct hd_struct *part, sector_t sector) { return part->start_sect <= sector && - sector < part->start_sect + part->nr_sects; + sector < part->start_sect + part_nr_sects_read(part); } /** @@ -230,6 +229,37 @@ struct hd_struct *disk_map_sector_rcu(st } EXPORT_SYMBOL_GPL(disk_map_sector_rcu); +int is_same_part(struct gendisk *disk, sector_t sector1, sector_t sector2, + struct hd_struct **part1, struct hd_struct **part2) +{ + struct disk_part_tbl *ptbl; + struct hd_struct *part; + int i = 1; + + *part1 = *part2 = NULL; + + ptbl = rcu_dereference(disk->part_tbl); + part = rcu_dereference(ptbl->last_lookup); + + do { + if (part) { + if (sector_in_part(part, sector1)) + *part1 = part; + if (sector_in_part(part, sector2)) + *part2 = part; + + if (*part1 && *part2) + /* we found both partitions */ + return *part1 == *part2; + } + + part = rcu_dereference(ptbl->part[i]); + } while (i++ < ptbl->len); + + /* we did not found at least one partition */ + return *part1 == *part2; +} + /* * Can be deleted altogether. Later. * @@ -253,8 +283,12 @@ void blkdev_show(struct seq_file *seqf, if (offset < BLKDEV_MAJOR_HASH_SIZE) { mutex_lock(&block_class_lock); - for (dp = major_names[offset]; dp; dp = dp->next) + for (dp = major_names[offset]; dp; dp = dp->next) { + if (!devcgroup_device_visible(S_IFBLK, dp->major, + 0, INT_MAX)) + continue; seq_printf(seqf, "%3d %s\n", dp->major, dp->name); + } mutex_unlock(&block_class_lock); } } @@ -417,14 +451,18 @@ int blk_alloc_devt(struct hd_struct *par do { if (!idr_pre_get(&ext_devt_idr, GFP_KERNEL)) return -ENOMEM; + mutex_lock(&ext_devt_mutex); rc = idr_get_new(&ext_devt_idr, part, &idx); + mutex_unlock(&ext_devt_mutex); } while (rc == -EAGAIN); if (rc) return rc; if (idx > MAX_EXT_DEVT) { + mutex_lock(&ext_devt_mutex); idr_remove(&ext_devt_idr, idx); + mutex_unlock(&ext_devt_mutex); return -EBUSY; } @@ -541,13 +579,21 @@ void add_disk(struct gendisk *disk) disk->major = MAJOR(devt); disk->first_minor = MINOR(devt); + /* Register BDI before referencing it from bdev */ + bdi = &disk->queue->backing_dev_info; + bdi_register_dev(bdi, disk_devt(disk)); + blk_register_region(disk_devt(disk), disk->minors, NULL, exact_match, exact_lock, disk); register_disk(disk); blk_register_queue(disk); - bdi = &disk->queue->backing_dev_info; - bdi_register_dev(bdi, disk_devt(disk)); + /* + * Take an extra ref on queue which will be put on disk_release() + * so that it sticks around as long as @disk is there. + */ + WARN_ON_ONCE(!blk_get_request_queue(disk->queue)); + retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj, "bdi"); WARN_ON(retval); @@ -661,7 +707,7 @@ void __init printk_all_partitions(void) printk("%s%s %10llu %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), - (unsigned long long)part->nr_sects >> 1, + (unsigned long long)part_nr_sects_read(part) >> 1, disk_name(disk, part->partno, name_buf)); if (is_part0) { if (disk->driverfs_dev != NULL && @@ -753,7 +799,7 @@ static int show_partition(struct seq_fil while ((part = disk_part_iter_next(&piter))) seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(part_devt(part)), MINOR(part_devt(part)), - (unsigned long long)part->nr_sects >> 1, + (unsigned long long)part_nr_sects_read(part) >> 1, disk_name(sgp, part->partno, buf)); disk_part_iter_exit(&piter); @@ -793,7 +839,7 @@ static int __init genhd_device_init(void { int error; - block_class.dev_kobj = sysfs_dev_block_kobj; + block_class.dev_kobj = ve_sysfs_dev_block_kobj; error = class_register(&block_class); if (unlikely(error)) return error; @@ -802,10 +848,10 @@ static int __init genhd_device_init(void register_blkdev(BLOCK_EXT_MAJOR, "blkext"); -#ifndef CONFIG_SYSFS_DEPRECATED - /* create top-level block dir */ - block_depr = kobject_create_and_add("block", NULL); -#endif + if (!sysfs_deprecated) + /* create top-level block dir */ + block_depr = kobject_create_and_add("block", NULL); + return 0; } @@ -861,12 +907,23 @@ static ssize_t disk_alignment_offset_sho return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); } +static ssize_t disk_discard_alignment_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); +} + static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); +static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show, + NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); @@ -887,6 +944,7 @@ static struct attribute *disk_attrs[] = &dev_attr_ro.attr, &dev_attr_size.attr, &dev_attr_alignment_offset.attr, + &dev_attr_discard_alignment.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, &dev_attr_inflight.attr, @@ -992,11 +1050,14 @@ static void disk_release(struct device * kfree(disk->random); disk_replace_part_tbl(disk, NULL); free_part_stats(&disk->part0); + if (disk->queue) + blk_put_queue(disk->queue); kfree(disk); } struct class block_class = { .name = "block", }; +EXPORT_SYMBOL(block_class); static char *block_devnode(struct device *dev, mode_t *mode) { @@ -1007,12 +1068,13 @@ static char *block_devnode(struct device return NULL; } -static struct device_type disk_type = { +struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, .devnode = block_devnode, }; +EXPORT_SYMBOL(disk_type); #ifdef CONFIG_PROC_FS /* @@ -1267,7 +1329,7 @@ int invalidate_partition(struct gendisk struct block_device *bdev = bdget_disk(disk, partno); if (bdev) { fsync_bdev(bdev); - res = __invalidate_device(bdev); + res = __invalidate_device(bdev, true); bdput(bdev); } return res; diff -uprN linux-2.6.32/block/ioctl.c linux-2.6.32.ovz/block/ioctl.c --- linux-2.6.32/block/ioctl.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/ioctl.c 2015-06-23 04:16:16.000000000 -0700 @@ -12,12 +12,13 @@ static int blkpg_ioctl(struct block_devi { struct block_device *bdevp; struct gendisk *disk; - struct hd_struct *part; + struct hd_struct *part, *lpart; struct blkpg_ioctl_arg a; struct blkpg_partition p; struct disk_part_iter piter; long long start, length; int partno; + int err; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -91,6 +92,71 @@ static int blkpg_ioctl(struct block_devi bdput(bdevp); return 0; + case BLKPG_RESIZE_PARTITION: + start = p.start >> 9; + length = p.length >> 9; + /* check for fit in a hd_struct */ + if (sizeof(sector_t) == sizeof(long) && + sizeof(long long) > sizeof(long)) { + long pstart = start, plength = length; + if (pstart != start || plength != length + || pstart < 0 || plength < 0) + return -EINVAL; + } + + part = disk_get_part(disk, partno); + if (!part) + return -ENXIO; + bdevp = bdget(part_devt(part)); + if (!bdevp) { + disk_put_part(part); + return -ENOMEM; + } + + err = 0; + mutex_lock(&bdevp->bd_mutex); + mutex_lock_nested(&bdev->bd_mutex, 1); + + if (start != part->start_sect) { + err = -EINVAL; + goto resize_done; + } + /* overlap? */ + disk_part_iter_init(&piter, disk, + DISK_PITER_INCL_EMPTY); + while ((lpart = disk_part_iter_next(&piter))) { + if (lpart->partno != partno && + !(start + length <= lpart->start_sect || + start >= lpart->start_sect + lpart->nr_sects)) { + disk_part_iter_exit(&piter); + err = -EBUSY; + goto resize_done; + } + } + disk_part_iter_exit(&piter); + part_nr_sects_write(part, length); + i_size_write(bdevp->bd_inode, p.length); + resize_done: + mutex_unlock(&bdevp->bd_mutex); + mutex_unlock(&bdev->bd_mutex); + bdput(bdevp); + disk_put_part(part); + return err; + case BLKPG_GET_PARTITION: + mutex_lock(&bdev->bd_mutex); + part = disk_get_part(disk, partno); + if (!part) + { + mutex_unlock(&bdev->bd_mutex); + return -ENXIO; + } + p.start = part->start_sect << 9; + p.length = part->nr_sects << 9; + disk_put_part(part); + mutex_unlock(&bdev->bd_mutex); + if (copy_to_user(a.data, &p, sizeof(struct blkpg_partition))) + return -EFAULT; + return 0; default: return -EINVAL; } @@ -122,10 +188,9 @@ static int blk_ioctl_discard(struct bloc start >>= 9; len >>= 9; - if (start + len > (bdev->bd_inode->i_size >> 9)) + if (start + len > (i_size_read(bdev->bd_inode) >> 9)) return -EINVAL; - return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, - DISCARD_FL_WAIT); + return blkdev_issue_discard(bdev, start, len, GFP_KERNEL, 0); } static int put_ushort(unsigned long arg, unsigned short val) @@ -280,6 +345,8 @@ int blkdev_ioctl(struct block_device *bd return put_uint(arg, bdev_io_opt(bdev)); case BLKALIGNOFF: return put_int(arg, bdev_alignment_offset(bdev)); + case BLKDISCARDZEROES: + return put_uint(arg, bdev_discard_zeroes_data(bdev)); case BLKSECTGET: return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); case BLKRASET: @@ -316,12 +383,12 @@ int blkdev_ioctl(struct block_device *bd unlock_kernel(); break; case BLKGETSIZE: - size = bdev->bd_inode->i_size; + size = i_size_read(bdev->bd_inode); if ((size >> 9) > ~0UL) return -EFBIG; return put_ulong(arg, size >> 9); case BLKGETSIZE64: - return put_u64(arg, bdev->bd_inode->i_size); + return put_u64(arg, i_size_read(bdev->bd_inode)); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACESETUP: diff -uprN linux-2.6.32/block/Kconfig linux-2.6.32.ovz/block/Kconfig --- linux-2.6.32/block/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/Kconfig 2015-03-10 13:22:36.000000000 -0700 @@ -65,6 +65,16 @@ config BLK_DEV_BSG If unsure, say Y. +config BLK_DEV_BSGLIB + bool "Block layer SG support v4 helper lib" + default n + select BLK_DEV_BSG + help + Subsystems will normally enable this if needed. Users will not + normally need to manually enable this. + + If unsure, say N. + config BLK_DEV_INTEGRITY bool "Block layer data integrity support" ---help--- @@ -77,6 +87,18 @@ config BLK_DEV_INTEGRITY T10/SCSI Data Integrity Field or the T13/ATA External Path Protection. If in doubt, say N. +config BLK_DEV_THROTTLING + bool "Block layer bio throttling support" + depends on BLK_CGROUP=y && EXPERIMENTAL + default n + ---help--- + Block layer bio throttling support. It can be used to limit + the IO rate to a device. IO rate policies are per cgroup and + one needs to mount and use blkio cgroup controller for creating + cgroups and specifying per device IO rate policies. + + See Documentation/cgroups/blkio-controller.txt for more information. + endif # BLOCK config BLOCK_COMPAT diff -uprN linux-2.6.32/block/Kconfig.iosched linux-2.6.32.ovz/block/Kconfig.iosched --- linux-2.6.32/block/Kconfig.iosched 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/Kconfig.iosched 2015-03-10 13:21:21.000000000 -0700 @@ -33,12 +33,24 @@ config IOSCHED_DEADLINE config IOSCHED_CFQ tristate "CFQ I/O scheduler" + # If BLK_CGROUP is a module, CFQ has to be built as module. + depends on (BLK_CGROUP=m && m) || !BLK_CGROUP || BLK_CGROUP=y default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally among all processes in the system. It should provide a fair - working environment, suitable for desktop systems. + and low latency working environment, suitable for both desktop + and server systems. + This is the default I/O scheduler. + Note: If BLK_CGROUP=m, then CFQ can be built only as module. + +config CFQ_GROUP_IOSCHED + bool "CFQ Group Scheduling support" + depends on IOSCHED_CFQ && BLK_CGROUP + default n + ---help--- + Enable group IO scheduling in CFQ. choice prompt "Default I/O scheduler" diff -uprN linux-2.6.32/block/Makefile linux-2.6.32.ovz/block/Makefile --- linux-2.6.32/block/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/Makefile 2015-03-10 13:22:36.000000000 -0700 @@ -3,11 +3,14 @@ # obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ - blk-barrier.o blk-settings.o blk-ioc.o blk-map.o \ + blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-iopoll.o ioctl.o genhd.o scsi_ioctl.o + blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o obj-$(CONFIG_BLK_DEV_BSG) += bsg.o +obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o +obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o +obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o obj-$(CONFIG_IOSCHED_AS) += as-iosched.o obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o diff -uprN linux-2.6.32/block/scsi_ioctl.c linux-2.6.32.ovz/block/scsi_ioctl.c --- linux-2.6.32/block/scsi_ioctl.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/block/scsi_ioctl.c 2015-03-10 13:23:54.000000000 -0700 @@ -33,8 +33,8 @@ #include struct blk_cmd_filter { - unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; - unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; + u32 read_ok[BLK_SCSI_MAX_CMDS]; + u32 write_ok[BLK_SCSI_MAX_CMDS]; } blk_default_cmd_filter; /* Command group 3 is reserved and should never be used. */ @@ -112,105 +112,359 @@ static int sg_emulated_host(struct reque static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) { - /* Basic read-only commands */ - __set_bit(TEST_UNIT_READY, filter->read_ok); - __set_bit(REQUEST_SENSE, filter->read_ok); - __set_bit(READ_6, filter->read_ok); - __set_bit(READ_10, filter->read_ok); - __set_bit(READ_12, filter->read_ok); - __set_bit(READ_16, filter->read_ok); - __set_bit(READ_BUFFER, filter->read_ok); - __set_bit(READ_DEFECT_DATA, filter->read_ok); - __set_bit(READ_CAPACITY, filter->read_ok); - __set_bit(READ_LONG, filter->read_ok); - __set_bit(INQUIRY, filter->read_ok); - __set_bit(MODE_SENSE, filter->read_ok); - __set_bit(MODE_SENSE_10, filter->read_ok); - __set_bit(LOG_SENSE, filter->read_ok); - __set_bit(START_STOP, filter->read_ok); - __set_bit(GPCMD_VERIFY_10, filter->read_ok); - __set_bit(VERIFY_16, filter->read_ok); - __set_bit(REPORT_LUNS, filter->read_ok); - __set_bit(SERVICE_ACTION_IN, filter->read_ok); - __set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok); - __set_bit(MAINTENANCE_IN, filter->read_ok); - __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok); - - /* Audio CD commands */ - __set_bit(GPCMD_PLAY_CD, filter->read_ok); - __set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok); - __set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok); - __set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok); - __set_bit(GPCMD_PAUSE_RESUME, filter->read_ok); - - /* CD/DVD data reading */ - __set_bit(GPCMD_READ_CD, filter->read_ok); - __set_bit(GPCMD_READ_CD_MSF, filter->read_ok); - __set_bit(GPCMD_READ_DISC_INFO, filter->read_ok); - __set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok); - __set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok); - __set_bit(GPCMD_READ_HEADER, filter->read_ok); - __set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok); - __set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok); - __set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok); - __set_bit(GPCMD_REPORT_KEY, filter->read_ok); - __set_bit(GPCMD_SCAN, filter->read_ok); - __set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok); - __set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok); - __set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok); - __set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok); - __set_bit(GPCMD_SEEK, filter->read_ok); - __set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok); - - /* Basic writing commands */ - __set_bit(WRITE_6, filter->write_ok); - __set_bit(WRITE_10, filter->write_ok); - __set_bit(WRITE_VERIFY, filter->write_ok); - __set_bit(WRITE_12, filter->write_ok); - __set_bit(WRITE_VERIFY_12, filter->write_ok); - __set_bit(WRITE_16, filter->write_ok); - __set_bit(WRITE_LONG, filter->write_ok); - __set_bit(WRITE_LONG_2, filter->write_ok); - __set_bit(ERASE, filter->write_ok); - __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok); - __set_bit(MODE_SELECT, filter->write_ok); - __set_bit(LOG_SELECT, filter->write_ok); - __set_bit(GPCMD_BLANK, filter->write_ok); - __set_bit(GPCMD_CLOSE_TRACK, filter->write_ok); - __set_bit(GPCMD_FLUSH_CACHE, filter->write_ok); - __set_bit(GPCMD_FORMAT_UNIT, filter->write_ok); - __set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok); - __set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok); - __set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok); - __set_bit(GPCMD_SEND_EVENT, filter->write_ok); - __set_bit(GPCMD_SEND_KEY, filter->write_ok); - __set_bit(GPCMD_SEND_OPC, filter->write_ok); - __set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok); - __set_bit(GPCMD_SET_SPEED, filter->write_ok); - __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok); - __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok); - __set_bit(GPCMD_SET_STREAMING, filter->write_ok); - __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); +#define sgio_bitmap_set(cmd, mask, rw) \ + filter->rw##_ok[(cmd)] |= (mask); + +#define D (1u << TYPE_DISK) /* Direct Access Block Device (SBC-3) */ +#define T (1u << TYPE_TAPE) /* Sequential Access Device (SSC-3) */ +#define L (1u << TYPE_PRINTER) /* Printer Device (SSC) */ +#define P (1u << TYPE_PROCESSOR) /* Processor Device (SPC-2) */ +#define W (1u << TYPE_WORM) /* Write Once Block Device (SBC) */ +#define R (1u << TYPE_ROM) /* C/DVD Device (MMC-6) */ +#define S (1u << TYPE_SCANNER) /* Scanner device (obsolete) */ +#define O (1u << TYPE_MOD) /* Optical Memory Block Device (SBC) */ +#define M (1u << TYPE_MEDIUM_CHANGER) /* Media Changer Device (SMC-3) */ +#define C (1u << TYPE_COMM) /* Communication devices (obsolete) */ +#define A (1u << TYPE_RAID) /* Storage Array Device (SCC-2) */ +#define E (1u << TYPE_ENCLOSURE) /* SCSI Enclosure Services device (SES-2) */ +#define B (1u << TYPE_RBC) /* Simplified Direct-Access (Reduced Block) device (RBC) */ +#define K (1u << 0x0f) /* Optical Card Reader/Writer device (OCRW) */ +#define V (1u << 0x10) /* Automation/Device Interface device (ADC-2) */ +#define F (1u << TYPE_OSD) /* Object-based Storage Device (OSD-2) */ + + /* control, universal except possibly RBC, read */ + + sgio_bitmap_set(0x00, -1 , read); // TEST UNIT READY + sgio_bitmap_set(0x03, -1 , read); // REQUEST SENSE + sgio_bitmap_set(0x12, -1 , read); // INQUIRY + sgio_bitmap_set(0x1A, -1 , read); // MODE SENSE(6) + sgio_bitmap_set(0x1C, ~B , read); // RECEIVE DIAGNOSTIC RESULTS + sgio_bitmap_set(0x4D, -1 , read); // LOG SENSE + sgio_bitmap_set(0x5A, -1 , read); // MODE SENSE(10) + sgio_bitmap_set(0x9E, -1 , read); // SERVICE ACTION IN(16) + sgio_bitmap_set(0xA0, -1 , read); // REPORT LUNS + + /* control, universal, write */ + + sgio_bitmap_set(0x4C, -1 , write); // LOG SELECT + sgio_bitmap_set(0x15, -1 , write); // MODE SELECT(6) + sgio_bitmap_set(0x55, -1 , write); // MODE SELECT(10) + + /* control, write */ + + sgio_bitmap_set(0x1B, D| W|R|O| A| B|K| F , write); // START STOP UNIT + sgio_bitmap_set(0x1E, D|T| W|R|O|M| K| F , write); // PREVENT ALLOW MEDIUM REMOVAL + + /* input */ + + sgio_bitmap_set(0x08, D|T| W| O , read); // READ(6) + sgio_bitmap_set(0x25, D| W|R|O| B|K , read); // READ CAPACITY(10) + sgio_bitmap_set(0x28, D| W|R|O| B|K , read); // READ(10) + sgio_bitmap_set(0x29, D| W|R|O , read); // READ GENERATION + sgio_bitmap_set(0x2D, O , read); // READ UPDATED BLOCK + sgio_bitmap_set(0x37, D| O , read); // READ DEFECT DATA(10) + sgio_bitmap_set(0x3E, D| W| O , read); // READ LONG(10) + sgio_bitmap_set(0x88, D|T| W| O| B , read); // READ(16) + sgio_bitmap_set(0x8F, D|T| W| O| B , read); // VERIFY(16) + sgio_bitmap_set(0x90, D| W| O| B , read); // PRE-FETCH(16) + sgio_bitmap_set(0xA8, D| W|R|O , read); // READ(12) + sgio_bitmap_set(0xAF, D| W| O , read); // VERIFY(12) + sgio_bitmap_set(0xB7, D| O , read); // READ DEFECT DATA(12) + + /* write */ + + sgio_bitmap_set(0x04, D| R|O , write); // FORMAT UNIT + sgio_bitmap_set(0x07, D| W| O , write); // REASSIGN BLOCKS + sgio_bitmap_set(0x0A, D|T| W| O , write); // WRITE(6) + sgio_bitmap_set(0x2A, D| W|R|O| B|K , write); // WRITE(10) + sgio_bitmap_set(0x2C, D| R|O , write); // ERASE(10) + sgio_bitmap_set(0x2E, D| W|R|O| B|K , write); // WRITE AND VERIFY(10) + sgio_bitmap_set(0x2F, D| W|R|O , write); // VERIFY(10) + sgio_bitmap_set(0x34, D| W| O| K , write); // PRE-FETCH(10) + sgio_bitmap_set(0x35, D| W|R|O| B|K , write); // SYNCHRONIZE CACHE(10) + sgio_bitmap_set(0x38, W| O| K , write); // MEDIUM SCAN + sgio_bitmap_set(0x3D, O , write); // UPDATE BLOCK + sgio_bitmap_set(0x3F, D| W| O , write); // WRITE LONG(10) + sgio_bitmap_set(0x41, D , write); // WRITE SAME(10) + sgio_bitmap_set(0x42, D , write); // UNMAP + sgio_bitmap_set(0x48, D| B , write); // SANITIZE + sgio_bitmap_set(0x51, D , write); // XPWRITE(10) + sgio_bitmap_set(0x53, D , write); // XDWRITEREAD(10) + sgio_bitmap_set(0x89, D , write); // COMPARE AND WRITE + sgio_bitmap_set(0x8B, D , write); // ORWRITE + sgio_bitmap_set(0x85, D| B , write); // ATA PASS-THROUGH(16) + sgio_bitmap_set(0x8A, D|T| W| O| B , write); // WRITE(16) + sgio_bitmap_set(0x8E, D| W| O| B , write); // WRITE AND VERIFY(16) + sgio_bitmap_set(0x91, D| W| O| B , write); // SYNCHRONIZE CACHE(16) + sgio_bitmap_set(0x93, D , write); // WRITE SAME(16) + sgio_bitmap_set(0xA1, D| B , write); // ATA PASS-THROUGH(12) + sgio_bitmap_set(0xAA, D| W|R|O , write); // WRITE(12) + sgio_bitmap_set(0xAC, O , write); // ERASE(12) + sgio_bitmap_set(0xAE, D| W| O , write); // WRITE AND VERIFY(12) + + /* processor device */ + + sgio_bitmap_set(0x08, P , read); // RECEIVE + sgio_bitmap_set(0x0A, P , write); // SEND(6) + + /* printer */ + + sgio_bitmap_set(0x04, L , write); // FORMAT + sgio_bitmap_set(0x0A, L , write); // PRINT + sgio_bitmap_set(0x0B, L , write); // SLEW AND PRINT + sgio_bitmap_set(0x10, L , write); // SYNCHRONIZE BUFFER + sgio_bitmap_set(0x1B, L , write); // STOP PRINT + + /* media changer */ + + sgio_bitmap_set(0x07, M , write); // INITIALIZE ELEMENT STATUS + sgio_bitmap_set(0x1B, M , write); // OPEN/CLOSE IMPORT/EXPORT ELEMENT + sgio_bitmap_set(0x2B, M , write); // POSITION TO ELEMENT + sgio_bitmap_set(0x37, M , write); // INITIALIZE ELEMENT STATUS WITH RANGE + sgio_bitmap_set(0xA6, M , write); // EXCHANGE MEDIUM + sgio_bitmap_set(0xB5, M , write); // REQUEST VOLUME ELEMENT ADDRESS + sgio_bitmap_set(0xB6, M , write); // SEND VOLUME TAG + + /* (mostly) MMC */ + + sgio_bitmap_set(0x23, R , read); // READ FORMAT CAPACITIES + sgio_bitmap_set(0x42, R , read); // READ SUB-CHANNEL + sgio_bitmap_set(0x43, R , read); // READ TOC/PMA/ATIP + sgio_bitmap_set(0x44, R , read); // READ HEADER + sgio_bitmap_set(0x45, R , read); // PLAY AUDIO(10) + sgio_bitmap_set(0x46, R , read); // GET CONFIGURATION + sgio_bitmap_set(0x47, R , read); // PLAY AUDIO MSF + sgio_bitmap_set(0x4A, R , read); // GET EVENT STATUS NOTIFICATION + sgio_bitmap_set(0x4B, R , read); // PAUSE/RESUME + sgio_bitmap_set(0x4E, R , read); // STOP PLAY/SCAN + sgio_bitmap_set(0x51, R , read); // READ DISC INFORMATION + sgio_bitmap_set(0x52, R , read); // READ TRACK INFORMATION + sgio_bitmap_set(0x5C, R , read); // READ BUFFER CAPACITY + sgio_bitmap_set(0xA4, R , read); // REPORT KEY + sgio_bitmap_set(0xA5, R , read); // PLAY AUDIO(12) + sgio_bitmap_set(0xAB, R| V , read); // SERVICE ACTION IN(12) + sgio_bitmap_set(0xAC, R , read); // GET PERFORMANCE + sgio_bitmap_set(0xAD, R , read); // READ DVD STRUCTURE + sgio_bitmap_set(0xB9, R , read); // READ CD MSF + sgio_bitmap_set(0xBA, R , read); // SCAN + sgio_bitmap_set(0xBD, R , read); // MECHANISM STATUS + sgio_bitmap_set(0xBE, R , read); // READ CD + + sgio_bitmap_set(0xB6, R , write); // SET STREAMING + sgio_bitmap_set(0x53, R , write); // RESERVE TRACK + sgio_bitmap_set(0x54, R , write); // SEND OPC INFORMATION + sgio_bitmap_set(0x58, R , write); // REPAIR TRACK + sgio_bitmap_set(0x5B, R , write); // CLOSE TRACK/SESSION + sgio_bitmap_set(0x5D, R , write); // SEND CUE SHEET + sgio_bitmap_set(0xA1, R , write); // BLANK + sgio_bitmap_set(0xA2, R , write); // SEND EVENT + sgio_bitmap_set(0xA3, R , write); // SEND KEY + sgio_bitmap_set(0xA6, R , write); // LOAD/UNLOAD C/DVD + sgio_bitmap_set(0xA7, R , write); // SET READ AHEAD + sgio_bitmap_set(0xBB, R , write); // SET CD SPEED + sgio_bitmap_set(0xBF, R , write); // SEND DVD STRUCTURE + + /* (mostly) tape */ + + sgio_bitmap_set(0x01, T , read); // REWIND + sgio_bitmap_set(0x05, T , read); // READ BLOCK LIMITS + sgio_bitmap_set(0x0F, T , read); // READ REVERSE(6) + sgio_bitmap_set(0x13, T , read); // VERIFY(6) + sgio_bitmap_set(0x2B, T , read); // LOCATE(10) + sgio_bitmap_set(0x34, T , read); // READ POSITION + sgio_bitmap_set(0x44, T| V , read); // REPORT DENSITY SUPPORT + sgio_bitmap_set(0x81, T , read); // READ REVERSE(16) + sgio_bitmap_set(0x92, T , read); // LOCATE(16) + + sgio_bitmap_set(0x04, T , write); // FORMAT MEDIUM + sgio_bitmap_set(0x0B, T , write); // SET CAPACITY + sgio_bitmap_set(0x10, T , write); // WRITE FILEMARKS(6) + sgio_bitmap_set(0x11, T , write); // SPACE(6) + sgio_bitmap_set(0x14, T|L , write); // RECOVER BUFFERED DATA + sgio_bitmap_set(0x19, T , write); // ERASE(6) + sgio_bitmap_set(0x1B, T| V , write); // LOAD UNLOAD + sgio_bitmap_set(0x80, T , write); // WRITE FILEMARKS(16) + sgio_bitmap_set(0x82, T , write); // ALLOW OVERWRITE + sgio_bitmap_set(0x91, T , write); // SPACE(16) + sgio_bitmap_set(0x93, T , write); // ERASE(16) + + /* various obsolete */ + + sgio_bitmap_set(0x0B, D| W|R|O , read); // SEEK(6) + sgio_bitmap_set(0x2B, D| W|R|O| K , read); // SEEK(10) + sgio_bitmap_set(0x30, D| W|R|O , read); // SEARCH DATA HIGH(10) + sgio_bitmap_set(0x31, D| W|R|O , read); // SEARCH DATA EQUAL(10) + sgio_bitmap_set(0x32, D| W|R|O , read); // SEARCH DATA LOW(10) + sgio_bitmap_set(0x39, D|T|L|P|W|R|O| K , read); // COMPARE + sgio_bitmap_set(0x52, D , read); // XDREAD(10) + sgio_bitmap_set(0xB0, W|R|O , read); // SEARCH DATA HIGH(12) + sgio_bitmap_set(0xB1, W|R|O , read); // SEARCH DATA EQUAL(12) + sgio_bitmap_set(0xB2, W|R|O , read); // SEARCH DATA LOW(12) + sgio_bitmap_set(0xB4, D|T| W|R|O , read); // READ ELEMENT STATUS ATTACHED + sgio_bitmap_set(0xB8, T| W|R|O|M , read); // READ ELEMENT STATUS + + sgio_bitmap_set(0x01, D| W|R|O|M , write); // REZERO UNIT + sgio_bitmap_set(0x18, D|T|L|P|W|R|O| K , write); // COPY + sgio_bitmap_set(0x3A, D|T|L|P|W|R|O| K , write); // COPY AND VERIFY + sgio_bitmap_set(0x50, D , write); // XDWRITE(10) + sgio_bitmap_set(0x80, D , write); // XDWRITE EXTENDED(16) + + /* communication devices (obsolete) */ + + sgio_bitmap_set(0x08, C , write); // GET MESSAGE(6) + sgio_bitmap_set(0x0A, C , write); // SEND MESSAGE(6) + sgio_bitmap_set(0x28, C , write); // GET MESSAGE(10) + sgio_bitmap_set(0x2A, C , write); // SEND MESSAGE(10) + sgio_bitmap_set(0xA8, C , write); // GET MESSAGE(12) + sgio_bitmap_set(0xAA, C , write); // SEND MESSAGE(12) + + /* scanners (obsolete) */ + + sgio_bitmap_set(0x1B, S, write); // SCAN + sgio_bitmap_set(0x24, S, write); // SET WINDOW + sgio_bitmap_set(0x25, S, write); // GET WINDOW + sgio_bitmap_set(0x2A, S, write); // SEND(10) + sgio_bitmap_set(0x31, S, write); // OBJECT POSITION + sgio_bitmap_set(0x34, S, write); // GET DATA BUFFER STATUS + +#if 0 + /* + * Starting from here are commands that are always privileged. + * I'm listing them anyway, as a reference to the version of + * the command list that I used. + */ + + /* control, privileged, universal except possibly RBC */ + + sgio_bitmap_set(0x1D, ~B , write); // SEND DIAGNOSTIC + sgio_bitmap_set(0x3B, -1 , write); // WRITE BUFFER + sgio_bitmap_set(0x3C, ~B , write); // READ BUFFER + + /* control, privileged */ + + sgio_bitmap_set(0x5E, D|T|L|P|W| O|M|A|E| F , write); // PERSISTENT RESERVE IN + sgio_bitmap_set(0x5F, D|T|L|P|W| O|M|A|E| F , write); // PERSISTENT RESERVE OUT + sgio_bitmap_set(0x83, D|T|L|P|W| O| K|V , write); // Third-party Copy OUT + sgio_bitmap_set(0x84, D|T|L|P|W| O| K|V , write); // Third-party Copy IN + sgio_bitmap_set(0x86, D|T| P|W| O|M|A|E|B|K|V , write); // ACCESS CONTROL IN + sgio_bitmap_set(0x87, D|T| P|W| O|M|A|E|B|K|V , write); // ACCESS CONTROL OUT + sgio_bitmap_set(0x8C, D|T| W| O|M| B| V , write); // READ ATTRIBUTE + sgio_bitmap_set(0x8D, D|T| W| O|M| B| V , write); // WRITE ATTRIBUTE + sgio_bitmap_set(0xA2, D|T| R| V , write); // SECURITY PROTOCOL IN + sgio_bitmap_set(0xA3, D|T|L| W| O|M|A|E|B|K|V , write); // MAINTENANCE IN + sgio_bitmap_set(0xA4, D|T|L| W| O|M|A|E|B|K|V , write); // MAINTENANCE OUT + sgio_bitmap_set(0xA9, V , write); // SERVICE ACTION OUT(12) + sgio_bitmap_set(0xB5, D|T| R| V , write); // SECURITY PROTOCOL OUT + sgio_bitmap_set(0xBA, D| W| O|M|A|E , write); // REDUNDANCY GROUP (IN) + sgio_bitmap_set(0xBB, D| W| O|M|A|E , write); // REDUNDANCY GROUP (OUT) + sgio_bitmap_set(0xBC, D| W| O|M|A|E , write); // SPARE (IN) + sgio_bitmap_set(0xBD, D| W| O|M|A|E , write); // SPARE (OUT) + sgio_bitmap_set(0xBE, D| W| O|M|A|E , write); // VOLUME SET (IN) + sgio_bitmap_set(0xBF, D| W| O|M|A|E , write); // VOLUME SET (OUT) + + /* control, privileged, obsolete */ + + sgio_bitmap_set(0x16, D|T|L|P|W| O|M|A|E| K , write); // RESERVE(6) + sgio_bitmap_set(0x16, M , write); // RESERVE ELEMENT(6) + sgio_bitmap_set(0x17, D|T|L|P|W| O|M|A|E| K , write); // RELEASE(6) + sgio_bitmap_set(0x17, M , write); // RELEASE ELEMENT(6) + sgio_bitmap_set(0x33, D| W|R|O , write); // SET LIMITS(10) + sgio_bitmap_set(0x36, D| W| O| K , write); // LOCK UNLOCK CACHE(10) + sgio_bitmap_set(0x40, D|T|L|P|W|R|O|M , write); // CHANGE DEFINITION + sgio_bitmap_set(0x56, D|T|L|P|W| O|M|A|E , write); // RESERVE(10) + sgio_bitmap_set(0x56, M , write); // RESERVE ELEMENT(10) + sgio_bitmap_set(0x57, D|T|L|P|W| O|M|A|E , write); // RELEASE(10) + sgio_bitmap_set(0x57, M , write); // RELEASE ELEMENT(10) + sgio_bitmap_set(0x81, D , write); // REBUILD(16) + sgio_bitmap_set(0x82, D , write); // REGENERATE(16) + sgio_bitmap_set(0x92, D| W| O , write); // LOCK UNLOCK CACHE(16) + sgio_bitmap_set(0xA5, T| W| O|M , write); // MOVE MEDIUM + sgio_bitmap_set(0xA7, D|T| W| O , write); // MOVE MEDIUM ATTACHED + sgio_bitmap_set(0xB3, D| W|R|O , write); // SET LIMITS(12) + + /* others: multiplexed */ + + sgio_bitmap_set(0x7E, D|T| R| M|A|E|B| V , write); // extended CDB + sgio_bitmap_set(0x7F, D| F , write); // variable length CDB + sgio_bitmap_set(0x9F, V , write); // SERVICE ACTION OUT(16) + + /* others: vendor specific */ + + sgio_bitmap_set(0x01, L , write); + sgio_bitmap_set(0x02, D|T|L|P|W|R| M , write); + sgio_bitmap_set(0x05, D| L|P|W|R| M , write); + sgio_bitmap_set(0x06, D|T|L|P|W|R| M , write); + sgio_bitmap_set(0x07, T|L , write); + sgio_bitmap_set(0x08, L| M , write); + sgio_bitmap_set(0x09, D|T|L|P|W|R| M , write); + sgio_bitmap_set(0x0A, M , write); + sgio_bitmap_set(0x0B, M , write); + sgio_bitmap_set(0x0C, D|T|L|P|W|R| M , write); + sgio_bitmap_set(0x0D, D|T|L|P|W|R| M , write); + sgio_bitmap_set(0x0E, D|T|L|P|W|R| M , write); + sgio_bitmap_set(0x0F, D| L|P|W|R| M , write); + sgio_bitmap_set(0x10, D| P|W|R , write); + sgio_bitmap_set(0x11, D| L|P|W|R , write); + sgio_bitmap_set(0x13, D| L|P|W|R , write); + sgio_bitmap_set(0x14, D| P|W|R , write); + sgio_bitmap_set(0x19, D| L|P|W|R , write); + sgio_bitmap_set(0x20, D| W|R|O| K , write); + sgio_bitmap_set(0x21, D| W|R|O| K , write); + sgio_bitmap_set(0x22, D| W|R|O| K , write); + sgio_bitmap_set(0x23, D| W| O| K , write); + sgio_bitmap_set(0x24, D| W|R , write); + sgio_bitmap_set(0x26, D| W|R , write); + sgio_bitmap_set(0x27, D| W|R , write); + sgio_bitmap_set(0x2D, D , write); + + /* others: reserved */ + + sgio_bitmap_set(0x1F, 0 , write); + sgio_bitmap_set(0x49, 0 , write); + sgio_bitmap_set(0x4F, 0 , write); + sgio_bitmap_set(0x59, 0 , write); + sgio_bitmap_set(0x98, 0 , write); + sgio_bitmap_set(0x99, 0 , write); + sgio_bitmap_set(0x9A, 0 , write); + sgio_bitmap_set(0x9B, 0 , write); + sgio_bitmap_set(0x9C, 0 , write); + sgio_bitmap_set(0x9D, 0 , write); // SERVICE ACTION BIDIRECTIONAL +#endif + +#undef D +#undef T +#undef L +#undef P +#undef W +#undef R +#undef S +#undef O +#undef M +#undef C +#undef A +#undef E +#undef B +#undef K +#undef V +#undef F +#undef sgio_bitmap_set } -int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) +int blk_verify_command(struct request_queue *q, + unsigned char *cmd, fmode_t has_write_perm) { struct blk_cmd_filter *filter = &blk_default_cmd_filter; /* root can do any command. */ - if (capable(CAP_SYS_RAWIO)) + if (capable(CAP_SYS_RAWIO) || blk_queue_unpriv_sgio(q)) return 0; - /* if there's no filter set, assume we're filtering everything out */ - if (!filter) - return -EPERM; - /* Anybody who can open the device can do a read-safe command */ - if (test_bit(cmd[0], filter->read_ok)) + if (filter->read_ok[cmd[0]] & (1 << q->sgio_type)) return 0; /* Write-safe commands require a writable open */ - if (test_bit(cmd[0], filter->write_ok) && has_write_perm) + if (has_write_perm && filter->write_ok[cmd[0]] & (1 << q->sgio_type)) return 0; return -EPERM; @@ -222,7 +476,7 @@ static int blk_fill_sghdr_rq(struct requ { if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; - if (blk_verify_command(rq->cmd, mode & FMODE_WRITE)) + if (blk_verify_command(q, rq->cmd, mode & FMODE_WRITE)) return -EPERM; /* @@ -309,7 +563,7 @@ static int sg_io(struct request_queue *q rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL); if (!rq) - return -ENOMEM; + return -ENODEV; if (blk_fill_sghdr_rq(q, rq, hdr, mode)) { blk_put_request(rq); @@ -441,6 +695,10 @@ int sg_scsi_ioctl(struct request_queue * } rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT); + if (!rq) { + kfree(buffer); + return -ENODEV; + } cmdlen = COMMAND_SIZE(opcode); @@ -455,7 +713,7 @@ int sg_scsi_ioctl(struct request_queue * if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) goto error; - err = blk_verify_command(rq->cmd, mode & FMODE_WRITE); + err = blk_verify_command(q, rq->cmd, mode & FMODE_WRITE); if (err) goto error; @@ -527,6 +785,8 @@ static int __blk_send_generic(struct req int err; rq = blk_get_request(q, WRITE, __GFP_WAIT); + if (!rq) + return -ENODEV; rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->timeout = BLK_DEFAULT_SG_TIMEOUT; rq->cmd[0] = cmd; @@ -549,7 +809,7 @@ int scsi_cmd_ioctl(struct request_queue { int err; - if (!q || blk_get_queue(q)) + if (!q) return -ENXIO; switch (cmd) { @@ -670,11 +930,51 @@ int scsi_cmd_ioctl(struct request_queue err = -ENOTTY; } - blk_put_queue(q); return err; } EXPORT_SYMBOL(scsi_cmd_ioctl); +int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) +{ + if (bd && bd == bd->bd_contains) + return 0; + + /* Actually none of this is particularly useful on a partition + * device, but let's play it safe. + */ + switch (cmd) { + case SCSI_IOCTL_GET_IDLUN: + case SCSI_IOCTL_GET_BUS_NUMBER: + case SCSI_IOCTL_GET_PCI: + case SCSI_IOCTL_PROBE_HOST: + case SG_GET_VERSION_NUM: + case SG_SET_TIMEOUT: + case SG_GET_TIMEOUT: + case SG_GET_RESERVED_SIZE: + case SG_SET_RESERVED_SIZE: + case SG_EMULATED_HOST: + return 0; + default: + break; + } + /* In particular, rule out all resets and host-specific ioctls. */ + return -ENOTTY; +} +EXPORT_SYMBOL(scsi_verify_blk_ioctl); + +int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode, + unsigned int cmd, void __user *arg) +{ + int ret; + + ret = scsi_verify_blk_ioctl(bd, cmd); + if (ret < 0) + return ret; + + return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg); +} +EXPORT_SYMBOL(scsi_cmd_blk_ioctl); + int __init blk_scsi_ioctl_init(void) { blk_set_cmd_filter_defaults(&blk_default_cmd_filter); diff -uprN linux-2.6.32/check-kabi linux-2.6.32.ovz/check-kabi --- linux-2.6.32/check-kabi 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/check-kabi 2015-06-23 04:16:15.000000000 -0700 @@ -0,0 +1,146 @@ +#!/usr/bin/python +# +# check-kabi - Red Hat kABI reference checking tool +# +# We use this script to check against reference Module.kabi files. +# +# Author: Jon Masters +# Copyright (C) 2007-2009 Red Hat, Inc. +# +# This software may be freely redistributed under the terms of the GNU +# General Public License (GPL). + +# Changelog: +# +# 2009/08/15 - Updated for use in RHEL6. +# 2007/06/13 - Initial rewrite in python by Jon Masters. + +__author__ = "Jon Masters " +__version__ = "2.0" +__date__ = "2009/08/15" +__copyright__ = "Copyright (C) 2007-2009 Red Hat, Inc" +__license__ = "GPL" + +import getopt +import os +import re +import string +import sys + +true = 1 +false = 0 + +def load_symvers(symvers,filename): + """Load a Module.symvers file.""" + + symvers_file = open(filename,"r") + + while true: + in_line = symvers_file.readline() + if in_line == "": + break + if in_line == "\n": + continue + checksum,symbol,directory,type = string.split(in_line) + + symvers[symbol] = in_line[0:-1] + +def load_kabi(kabi,filename): + """Load a Module.kabi file.""" + + kabi_file = open(filename,"r") + + while true: + in_line = kabi_file.readline() + if in_line == "": + break + if in_line == "\n": + continue + checksum,symbol,directory,type = string.split(in_line) + + kabi[symbol] = in_line[0:-1] + +def check_kabi(symvers,kabi): + """Check Module.kabi and Module.symvers files.""" + + fail=0 + warn=0 + changed_symbols=[] + moved_symbols=[] + + for symbol in kabi: + abi_hash,abi_sym,abi_dir,abi_type = string.split(kabi[symbol]) + if symvers.has_key(symbol): + sym_hash,sym_sym,sym_dir,sym_type = string.split(symvers[symbol]) + if abi_hash != sym_hash: + fail=1 + changed_symbols.append(symbol) + + if abi_dir != sym_dir: + warn=1 + moved_symbols.append(symbol) + else: + fail=1 + changed_symbols.append(symbol) + + if fail: + print "*** ERROR - ABI BREAKAGE WAS DETECTED ***" + print "" + print "The following symbols have been changed (this will cause an ABI breakage):" + print "" + for symbol in changed_symbols: + print symbol + print "" + + if warn: + print "*** WARNING - ABI SYMBOLS MOVED ***" + print "" + print "The following symbols moved (typically caused by moving a symbol from being" + print "provided by the kernel vmlinux out to a loadable module):" + print "" + for symbol in moved_symbols: + print symbol + print "" + + """Halt the build, if we got errors and/or warnings. In either case, + double-checkig is required to avoid introducing / concealing + KABI inconsistencies.""" + if fail or warn: + sys.exit(1) + sys.exit(0) + +def usage(): + print """ +check-kabi: check Module.kabi and Module.symvers files. + + check-kabi [ -k Module.kabi ] [ -s Module.symvers ] + +""" + +if __name__ == "__main__": + + symvers_file = "" + kabi_file = "" + + opts, args = getopt.getopt(sys.argv[1:], 'hk:s:') + + for o, v in opts: + if o == "-s": + symvers_file = v + if o == "-h": + usage() + sys.exit(0) + if o == "-k": + kabi_file = v + + if (symvers_file == "") or (kabi_file == ""): + usage() + sys.exit(1) + + symvers={} + kabi={} + + load_symvers(symvers,symvers_file) + load_kabi(kabi,kabi_file) + check_kabi(symvers,kabi) + diff -uprN linux-2.6.32/COPYING.Parallels linux-2.6.32.ovz/COPYING.Parallels --- linux-2.6.32/COPYING.Parallels 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/COPYING.Parallels 2015-06-23 04:16:16.000000000 -0700 @@ -0,0 +1,350 @@ + +Nothing in this license should be construed as a grant by Parallels of any rights +beyond the rights specified in the GNU General Public License, and nothing in +this license should be construed as a waiver by Parallels of its patent, copyright +and/or trademark rights, beyond the waiver required by the GNU General Public +License. This license is expressly inapplicable to any product that is not +within the scope of the GNU General Public License + +---------------------------------------- + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff -uprN linux-2.6.32/crypto/algapi.c linux-2.6.32.ovz/crypto/algapi.c --- linux-2.6.32/crypto/algapi.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/algapi.c 2015-03-10 13:24:13.000000000 -0700 @@ -25,22 +25,6 @@ static void crypto_remove_final(struct l static LIST_HEAD(crypto_template_list); -void crypto_larval_error(const char *name, u32 type, u32 mask) -{ - struct crypto_alg *alg; - - alg = crypto_alg_lookup(name, type, mask); - - if (alg) { - if (crypto_is_larval(alg)) { - struct crypto_larval *larval = (void *)alg; - complete_all(&larval->completion); - } - crypto_mod_put(alg); - } -} -EXPORT_SYMBOL_GPL(crypto_larval_error); - static inline int crypto_set_driver_name(struct crypto_alg *alg) { static const char suffix[] = "-generic"; @@ -58,8 +42,20 @@ static inline int crypto_set_driver_name return 0; } +static inline void crypto_check_module_sig(struct module *mod) +{ +#ifdef CONFIG_CRYPTO_FIPS + if (fips_enabled && mod && !mod->gpgsig_ok) + panic("Module %s signature verification failed in FIPS mode\n", + mod->name); +#endif + return; +} + static int crypto_check_alg(struct crypto_alg *alg) { + crypto_check_module_sig(alg->cra_module); + if (alg->cra_alignmask & (alg->cra_alignmask + 1)) return -EINVAL; @@ -296,7 +292,6 @@ found: continue; larval->adult = alg; - complete_all(&larval->completion); continue; } @@ -405,6 +400,41 @@ int crypto_unregister_alg(struct crypto_ } EXPORT_SYMBOL_GPL(crypto_unregister_alg); +int crypto_register_algs(struct crypto_alg *algs, int count) +{ + int i, ret; + + for (i = 0; i < count; i++) { + ret = crypto_register_alg(&algs[i]); + if (ret) + goto err; + } + + return 0; + +err: + for (--i; i >= 0; --i) + crypto_unregister_alg(&algs[i]); + + return ret; +} +EXPORT_SYMBOL_GPL(crypto_register_algs); + +int crypto_unregister_algs(struct crypto_alg *algs, int count) +{ + int i, ret; + + for (i = 0; i < count; i++) { + ret = crypto_unregister_alg(&algs[i]); + if (ret) + pr_err("Failed to unregister %s %s: %d\n", + algs[i].cra_driver_name, algs[i].cra_name, ret); + } + + return 0; +} +EXPORT_SYMBOL_GPL(crypto_unregister_algs); + int crypto_register_template(struct crypto_template *tmpl) { struct crypto_template *q; @@ -412,6 +442,8 @@ int crypto_register_template(struct cryp down_write(&crypto_alg_sem); + crypto_check_module_sig(tmpl->module); + list_for_each_entry(q, &crypto_template_list, list) { if (q == tmpl) goto out; @@ -419,6 +451,7 @@ int crypto_register_template(struct cryp list_add(&tmpl->list, &crypto_template_list); crypto_notify(CRYPTO_MSG_TMPL_REGISTER, tmpl); + err = 0; out: up_write(&crypto_alg_sem); diff -uprN linux-2.6.32/crypto/algboss.c linux-2.6.32.ovz/crypto/algboss.c --- linux-2.6.32/crypto/algboss.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/algboss.c 2015-03-10 13:23:57.000000000 -0700 @@ -11,6 +11,7 @@ */ #include +#include #include #include #include @@ -43,9 +44,10 @@ struct cryptomgr_param { } nu32; } attrs[CRYPTO_MAX_ATTRS]; - char larval[CRYPTO_MAX_ALG_NAME]; char template[CRYPTO_MAX_ALG_NAME]; + struct crypto_larval *larval; + u32 otype; u32 omask; }; @@ -65,7 +67,7 @@ static int cryptomgr_probe(void *data) tmpl = crypto_lookup_template(param->template); if (!tmpl) - goto err; + goto out; do { if (tmpl->create) { @@ -82,16 +84,11 @@ static int cryptomgr_probe(void *data) crypto_tmpl_put(tmpl); - if (err) - goto err; - out: + complete_all(¶m->larval->completion); + crypto_alg_put(¶m->larval->alg); kfree(param); module_put_and_exit(0); - -err: - crypto_larval_error(param->larval, param->otype, param->omask); - goto out; } static int cryptomgr_schedule_probe(struct crypto_larval *larval) @@ -189,14 +186,19 @@ static int cryptomgr_schedule_probe(stru param->otype = larval->alg.cra_flags; param->omask = larval->mask; - memcpy(param->larval, larval->alg.cra_name, CRYPTO_MAX_ALG_NAME); + crypto_alg_get(&larval->alg); + param->larval = larval; thread = kthread_run(cryptomgr_probe, param, "cryptomgr_probe"); if (IS_ERR(thread)) - goto err_free_param; + goto err_put_larval; + + wait_for_completion_interruptible(&larval->completion); return NOTIFY_STOP; +err_put_larval: + crypto_alg_put(&larval->alg); err_free_param: kfree(param); err_put_module: diff -uprN linux-2.6.32/crypto/ansi_cprng.c linux-2.6.32.ovz/crypto/ansi_cprng.c --- linux-2.6.32/crypto/ansi_cprng.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/ansi_cprng.c 2015-03-10 13:24:04.000000000 -0700 @@ -85,7 +85,7 @@ static void xor_vectors(unsigned char *i * Returns DEFAULT_BLK_SZ bytes of random data per call * returns 0 if generation succeded, <0 if something went wrong */ -static int _get_more_prng_bytes(struct prng_context *ctx) +static int _get_more_prng_bytes(struct prng_context *ctx, int cont_test) { int i; unsigned char tmp[DEFAULT_BLK_SZ]; @@ -132,7 +132,7 @@ static int _get_more_prng_bytes(struct p */ if (!memcmp(ctx->rand_data, ctx->last_rand_data, DEFAULT_BLK_SZ)) { - if (fips_enabled) { + if (cont_test) { panic("cprng %p Failed repetition check!\n", ctx); } @@ -185,16 +185,14 @@ static int _get_more_prng_bytes(struct p } /* Our exported functions */ -static int get_prng_bytes(char *buf, size_t nbytes, struct prng_context *ctx) +static int get_prng_bytes(char *buf, size_t nbytes, struct prng_context *ctx, + int do_cont_test) { unsigned char *ptr = buf; unsigned int byte_count = (unsigned int)nbytes; int err; - if (nbytes < 0) - return -EINVAL; - spin_lock_bh(&ctx->prng_lock); err = -EINVAL; @@ -220,7 +218,7 @@ static int get_prng_bytes(char *buf, siz remainder: if (ctx->rand_data_valid == DEFAULT_BLK_SZ) { - if (_get_more_prng_bytes(ctx) < 0) { + if (_get_more_prng_bytes(ctx, do_cont_test) < 0) { memset(buf, 0, nbytes); err = -EINVAL; goto done; @@ -232,11 +230,11 @@ remainder: */ if (byte_count < DEFAULT_BLK_SZ) { empty_rbuf: - for (; ctx->rand_data_valid < DEFAULT_BLK_SZ; - ctx->rand_data_valid++) { + while (ctx->rand_data_valid < DEFAULT_BLK_SZ) { *ptr = ctx->rand_data[ctx->rand_data_valid]; ptr++; byte_count--; + ctx->rand_data_valid++; if (byte_count == 0) goto done; } @@ -247,7 +245,7 @@ empty_rbuf: */ for (; byte_count >= DEFAULT_BLK_SZ; byte_count -= DEFAULT_BLK_SZ) { if (ctx->rand_data_valid == DEFAULT_BLK_SZ) { - if (_get_more_prng_bytes(ctx) < 0) { + if (_get_more_prng_bytes(ctx, do_cont_test) < 0) { memset(buf, 0, nbytes); err = -EINVAL; goto done; @@ -356,7 +354,7 @@ static int cprng_get_random(struct crypt { struct prng_context *prng = crypto_rng_ctx(tfm); - return get_prng_bytes(rdata, dlen, prng); + return get_prng_bytes(rdata, dlen, prng, 0); } /* @@ -404,19 +402,87 @@ static struct crypto_alg rng_alg = { } }; +#ifdef CONFIG_CRYPTO_FIPS +static int fips_cprng_get_random(struct crypto_rng *tfm, u8 *rdata, + unsigned int dlen) +{ + struct prng_context *prng = crypto_rng_ctx(tfm); + + return get_prng_bytes(rdata, dlen, prng, 1); +} + +static int fips_cprng_reset(struct crypto_rng *tfm, u8 *seed, unsigned int slen) +{ + u8 rdata[DEFAULT_BLK_SZ]; + u8 *key = seed + DEFAULT_BLK_SZ; + int rc; + + struct prng_context *prng = crypto_rng_ctx(tfm); + + if (slen < DEFAULT_PRNG_KSZ + DEFAULT_BLK_SZ) + return -EINVAL; + + /* fips strictly requires seed != key */ + if (!memcmp(seed, key, DEFAULT_PRNG_KSZ)) + return -EINVAL; + + rc = cprng_reset(tfm, seed, slen); + + if (!rc) + goto out; + + /* this primes our continuity test */ + rc = get_prng_bytes(rdata, DEFAULT_BLK_SZ, prng, 0); + prng->rand_data_valid = DEFAULT_BLK_SZ; + +out: + return rc; +} + +static struct crypto_alg fips_rng_alg = { + .cra_name = "fips(ansi_cprng)", + .cra_driver_name = "fips_ansi_cprng", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_RNG, + .cra_ctxsize = sizeof(struct prng_context), + .cra_type = &crypto_rng_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(rng_alg.cra_list), + .cra_init = cprng_init, + .cra_exit = cprng_exit, + .cra_u = { + .rng = { + .rng_make_random = fips_cprng_get_random, + .rng_reset = fips_cprng_reset, + .seedsize = DEFAULT_PRNG_KSZ + 2*DEFAULT_BLK_SZ, + } + } +}; +#endif /* Module initalization */ static int __init prng_mod_init(void) { - if (fips_enabled) - rng_alg.cra_priority += 200; + int rc = 0; + + rc = crypto_register_alg(&rng_alg); +#ifdef CONFIG_CRYPTO_FIPS + if (rc) + goto out; - return crypto_register_alg(&rng_alg); + rc = crypto_register_alg(&fips_rng_alg); + +out: +#endif + return rc; } static void __exit prng_mod_fini(void) { crypto_unregister_alg(&rng_alg); +#ifdef CONFIG_CRYPTO_FIPS + crypto_unregister_alg(&fips_rng_alg); +#endif return; } diff -uprN linux-2.6.32/crypto/api.c linux-2.6.32.ovz/crypto/api.c --- linux-2.6.32/crypto/api.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/api.c 2015-03-10 13:24:04.000000000 -0700 @@ -34,11 +34,7 @@ EXPORT_SYMBOL_GPL(crypto_alg_sem); BLOCKING_NOTIFIER_HEAD(crypto_chain); EXPORT_SYMBOL_GPL(crypto_chain); -static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg) -{ - atomic_inc(&alg->cra_refcnt); - return alg; -} +static struct crypto_alg *crypto_larval_wait(struct crypto_alg *alg); struct crypto_alg *crypto_mod_get(struct crypto_alg *alg) { @@ -150,8 +146,11 @@ static struct crypto_alg *crypto_larval_ } up_write(&crypto_alg_sem); - if (alg != &larval->alg) + if (alg != &larval->alg) { kfree(larval); + if (crypto_is_larval(alg)) + alg = crypto_larval_wait(alg); + } return alg; } diff -uprN linux-2.6.32/crypto/async_tx/async_memcpy.c linux-2.6.32.ovz/crypto/async_tx/async_memcpy.c --- linux-2.6.32/crypto/async_tx/async_memcpy.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/async_tx/async_memcpy.c 2015-03-10 13:24:00.000000000 -0700 @@ -66,6 +66,12 @@ async_memcpy(struct page *dest, struct p tx = device->device_prep_dma_memcpy(chan, dma_dest, dma_src, len, dma_prep_flags); + if (!tx) { + dma_unmap_page(device->dev, dma_dest, len, + DMA_FROM_DEVICE); + dma_unmap_page(device->dev, dma_src, len, + DMA_TO_DEVICE); + } } if (tx) { @@ -83,8 +89,8 @@ async_memcpy(struct page *dest, struct p memcpy(dest_buf, src_buf, len); - kunmap_atomic(dest_buf, KM_USER0); kunmap_atomic(src_buf, KM_USER1); + kunmap_atomic(dest_buf, KM_USER0); async_tx_sync_epilog(submit); } diff -uprN linux-2.6.32/crypto/async_tx/async_memset.c linux-2.6.32.ovz/crypto/async_tx/async_memset.c --- linux-2.6.32/crypto/async_tx/async_memset.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/async_tx/async_memset.c 2015-03-10 13:24:00.000000000 -0700 @@ -25,6 +25,7 @@ */ #include #include +#include #include #include #include diff -uprN linux-2.6.32/crypto/async_tx/async_raid6_recov.c linux-2.6.32.ovz/crypto/async_tx/async_raid6_recov.c --- linux-2.6.32/crypto/async_tx/async_raid6_recov.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/async_tx/async_raid6_recov.c 2015-03-10 13:21:38.000000000 -0700 @@ -324,6 +324,7 @@ struct dma_async_tx_descriptor * async_raid6_2data_recov(int disks, size_t bytes, int faila, int failb, struct page **blocks, struct async_submit_ctl *submit) { + void *scribble = submit->scribble; int non_zero_srcs, i; BUG_ON(faila == failb); @@ -332,11 +333,13 @@ async_raid6_2data_recov(int disks, size_ pr_debug("%s: disks: %d len: %zu\n", __func__, disks, bytes); - /* we need to preserve the contents of 'blocks' for the async - * case, so punt to synchronous if a scribble buffer is not available + /* if a dma resource is not available or a scribble buffer is not + * available punt to the synchronous path. In the 'dma not + * available' case be sure to use the scribble buffer to + * preserve the content of 'blocks' as the caller intended. */ - if (!submit->scribble) { - void **ptrs = (void **) blocks; + if (!async_dma_find_channel(DMA_PQ) || !scribble) { + void **ptrs = scribble ? scribble : (void **) blocks; async_tx_quiesce(&submit->depend_tx); for (i = 0; i < disks; i++) @@ -406,11 +409,13 @@ async_raid6_datap_recov(int disks, size_ pr_debug("%s: disks: %d len: %zu\n", __func__, disks, bytes); - /* we need to preserve the contents of 'blocks' for the async - * case, so punt to synchronous if a scribble buffer is not available + /* if a dma resource is not available or a scribble buffer is not + * available punt to the synchronous path. In the 'dma not + * available' case be sure to use the scribble buffer to + * preserve the content of 'blocks' as the caller intended. */ - if (!scribble) { - void **ptrs = (void **) blocks; + if (!async_dma_find_channel(DMA_PQ) || !scribble) { + void **ptrs = scribble ? scribble : (void **) blocks; async_tx_quiesce(&submit->depend_tx); for (i = 0; i < disks; i++) diff -uprN linux-2.6.32/crypto/async_tx/async_tx.c linux-2.6.32.ovz/crypto/async_tx/async_tx.c --- linux-2.6.32/crypto/async_tx/async_tx.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/async_tx/async_tx.c 2015-03-10 13:24:00.000000000 -0700 @@ -134,8 +134,8 @@ async_tx_channel_switch(struct dma_async } device->device_issue_pending(chan); } else { - if (dma_wait_for_async_tx(depend_tx) == DMA_ERROR) - panic("%s: DMA_ERROR waiting for depend_tx\n", + if (dma_wait_for_async_tx(depend_tx) != DMA_SUCCESS) + panic("%s: DMA error waiting for depend_tx\n", __func__); tx->tx_submit(tx); } @@ -287,8 +287,9 @@ void async_tx_quiesce(struct dma_async_t * we are referring to the correct operation */ BUG_ON(async_tx_test_ack(*tx)); - if (dma_wait_for_async_tx(*tx) == DMA_ERROR) - panic("DMA_ERROR waiting for transaction\n"); + if (dma_wait_for_async_tx(*tx) != DMA_SUCCESS) + panic("%s: DMA error waiting for transaction\n", + __func__); async_tx_ack(*tx); *tx = NULL; } diff -uprN linux-2.6.32/crypto/async_tx/Kconfig linux-2.6.32.ovz/crypto/async_tx/Kconfig --- linux-2.6.32/crypto/async_tx/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/async_tx/Kconfig 2015-03-10 13:22:09.000000000 -0700 @@ -22,6 +22,7 @@ config ASYNC_RAID6_RECOV tristate select ASYNC_CORE select ASYNC_PQ + select ASYNC_XOR config ASYNC_TX_DISABLE_PQ_VAL_DMA bool diff -uprN linux-2.6.32/crypto/authenc.c linux-2.6.32.ovz/crypto/authenc.c --- linux-2.6.32/crypto/authenc.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/authenc.c 2015-03-10 13:23:55.000000000 -0700 @@ -46,6 +46,12 @@ struct authenc_request_ctx { char tail[]; }; +static void authenc_request_complete(struct aead_request *req, int err) +{ + if (err != -EINPROGRESS) + aead_request_complete(req, err); +} + static int crypto_authenc_setkey(struct crypto_aead *authenc, const u8 *key, unsigned int keylen) { @@ -101,20 +107,6 @@ badkey: goto out; } -static void authenc_chain(struct scatterlist *head, struct scatterlist *sg, - int chain) -{ - if (chain) { - head->length += sg->length; - sg = scatterwalk_sg_next(sg); - } - - if (sg) - scatterwalk_sg_chain(head, 2, sg); - else - sg_mark_end(head); -} - static void authenc_geniv_ahash_update_done(struct crypto_async_request *areq, int err) { @@ -142,7 +134,7 @@ static void authenc_geniv_ahash_update_d crypto_aead_authsize(authenc), 1); out: - aead_request_complete(req, err); + authenc_request_complete(req, err); } static void authenc_geniv_ahash_done(struct crypto_async_request *areq, int err) @@ -208,7 +200,7 @@ static void authenc_verify_ahash_update_ err = crypto_ablkcipher_decrypt(abreq); out: - aead_request_complete(req, err); + authenc_request_complete(req, err); } static void authenc_verify_ahash_done(struct crypto_async_request *areq, @@ -245,7 +237,7 @@ static void authenc_verify_ahash_done(st err = crypto_ablkcipher_decrypt(abreq); out: - aead_request_complete(req, err); + authenc_request_complete(req, err); } static u8 *crypto_authenc_ahash_fb(struct aead_request *req, unsigned int flags) @@ -335,7 +327,7 @@ static int crypto_authenc_genicv(struct if (ivsize) { sg_init_table(cipher, 2); sg_set_buf(cipher, iv, ivsize); - authenc_chain(cipher, dst, vdst == iv + ivsize); + scatterwalk_crypto_chain(cipher, dst, vdst == iv + ivsize, 2); dst = cipher; cryptlen += ivsize; } @@ -344,7 +336,7 @@ static int crypto_authenc_genicv(struct authenc_ahash_fn = crypto_authenc_ahash; sg_init_table(asg, 2); sg_set_page(asg, sg_page(assoc), assoc->length, assoc->offset); - authenc_chain(asg, dst, 0); + scatterwalk_crypto_chain(asg, dst, 0, 2); dst = asg; cryptlen += req->assoclen; } @@ -379,7 +371,7 @@ static void crypto_authenc_encrypt_done( err = crypto_authenc_genicv(areq, iv, 0); } - aead_request_complete(areq, err); + authenc_request_complete(areq, err); } static int crypto_authenc_encrypt(struct aead_request *req) @@ -418,7 +410,7 @@ static void crypto_authenc_givencrypt_do err = crypto_authenc_genicv(areq, greq->giv, 0); } - aead_request_complete(areq, err); + authenc_request_complete(areq, err); } static int crypto_authenc_givencrypt(struct aead_givcrypt_request *req) @@ -487,7 +479,7 @@ static int crypto_authenc_iverify(struct if (ivsize) { sg_init_table(cipher, 2); sg_set_buf(cipher, iv, ivsize); - authenc_chain(cipher, src, vsrc == iv + ivsize); + scatterwalk_crypto_chain(cipher, src, vsrc == iv + ivsize, 2); src = cipher; cryptlen += ivsize; } @@ -496,7 +488,7 @@ static int crypto_authenc_iverify(struct authenc_ahash_fn = crypto_authenc_ahash; sg_init_table(asg, 2); sg_set_page(asg, sg_page(assoc), assoc->length, assoc->offset); - authenc_chain(asg, src, 0); + scatterwalk_crypto_chain(asg, src, 0, 2); src = asg; cryptlen += req->assoclen; } diff -uprN linux-2.6.32/crypto/blkcipher.c linux-2.6.32.ovz/crypto/blkcipher.c --- linux-2.6.32/crypto/blkcipher.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/blkcipher.c 2015-03-10 13:22:09.000000000 -0700 @@ -89,9 +89,9 @@ static inline unsigned int blkcipher_don memcpy(walk->dst.virt.addr, walk->page, n); blkcipher_unmap_dst(walk); } else if (!(walk->flags & BLKCIPHER_WALK_PHYS)) { - blkcipher_unmap_src(walk); if (walk->flags & BLKCIPHER_WALK_DIFF) blkcipher_unmap_dst(walk); + blkcipher_unmap_src(walk); } scatterwalk_advance(&walk->in, n); diff -uprN linux-2.6.32/crypto/crc32c.c linux-2.6.32.ovz/crypto/crc32c.c --- linux-2.6.32/crypto/crc32c.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/crc32c.c 2015-06-23 04:16:16.000000000 -0700 @@ -141,6 +141,12 @@ static u32 crc32c(u32 crc, const u8 *dat return crc; } +u32 crc32c_generic(u32 crc, const void *address, unsigned int length) +{ + return crc32c(crc, address, length); +} +EXPORT_SYMBOL(crc32c_generic); + /* * Steps through buffer one byte at at time, calculates reflected * crc using table. diff -uprN linux-2.6.32/crypto/crct10dif.c linux-2.6.32.ovz/crypto/crct10dif.c --- linux-2.6.32/crypto/crct10dif.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/crct10dif.c 2015-03-10 13:24:13.000000000 -0700 @@ -0,0 +1,178 @@ +/* + * Cryptographic API. + * + * T10 Data Integrity Field CRC16 Crypto Transform + * + * Copyright (c) 2007 Oracle Corporation. All rights reserved. + * Written by Martin K. Petersen + * Copyright (C) 2013 Intel Corporation + * Author: Tim Chen + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +struct chksum_desc_ctx { + __u16 crc; +}; + +/* Table generated using the following polynomium: + * x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1 + * gt: 0x8bb7 + */ +static const __u16 t10_dif_crc_table[256] = { + 0x0000, 0x8BB7, 0x9CD9, 0x176E, 0xB205, 0x39B2, 0x2EDC, 0xA56B, + 0xEFBD, 0x640A, 0x7364, 0xF8D3, 0x5DB8, 0xD60F, 0xC161, 0x4AD6, + 0x54CD, 0xDF7A, 0xC814, 0x43A3, 0xE6C8, 0x6D7F, 0x7A11, 0xF1A6, + 0xBB70, 0x30C7, 0x27A9, 0xAC1E, 0x0975, 0x82C2, 0x95AC, 0x1E1B, + 0xA99A, 0x222D, 0x3543, 0xBEF4, 0x1B9F, 0x9028, 0x8746, 0x0CF1, + 0x4627, 0xCD90, 0xDAFE, 0x5149, 0xF422, 0x7F95, 0x68FB, 0xE34C, + 0xFD57, 0x76E0, 0x618E, 0xEA39, 0x4F52, 0xC4E5, 0xD38B, 0x583C, + 0x12EA, 0x995D, 0x8E33, 0x0584, 0xA0EF, 0x2B58, 0x3C36, 0xB781, + 0xD883, 0x5334, 0x445A, 0xCFED, 0x6A86, 0xE131, 0xF65F, 0x7DE8, + 0x373E, 0xBC89, 0xABE7, 0x2050, 0x853B, 0x0E8C, 0x19E2, 0x9255, + 0x8C4E, 0x07F9, 0x1097, 0x9B20, 0x3E4B, 0xB5FC, 0xA292, 0x2925, + 0x63F3, 0xE844, 0xFF2A, 0x749D, 0xD1F6, 0x5A41, 0x4D2F, 0xC698, + 0x7119, 0xFAAE, 0xEDC0, 0x6677, 0xC31C, 0x48AB, 0x5FC5, 0xD472, + 0x9EA4, 0x1513, 0x027D, 0x89CA, 0x2CA1, 0xA716, 0xB078, 0x3BCF, + 0x25D4, 0xAE63, 0xB90D, 0x32BA, 0x97D1, 0x1C66, 0x0B08, 0x80BF, + 0xCA69, 0x41DE, 0x56B0, 0xDD07, 0x786C, 0xF3DB, 0xE4B5, 0x6F02, + 0x3AB1, 0xB106, 0xA668, 0x2DDF, 0x88B4, 0x0303, 0x146D, 0x9FDA, + 0xD50C, 0x5EBB, 0x49D5, 0xC262, 0x6709, 0xECBE, 0xFBD0, 0x7067, + 0x6E7C, 0xE5CB, 0xF2A5, 0x7912, 0xDC79, 0x57CE, 0x40A0, 0xCB17, + 0x81C1, 0x0A76, 0x1D18, 0x96AF, 0x33C4, 0xB873, 0xAF1D, 0x24AA, + 0x932B, 0x189C, 0x0FF2, 0x8445, 0x212E, 0xAA99, 0xBDF7, 0x3640, + 0x7C96, 0xF721, 0xE04F, 0x6BF8, 0xCE93, 0x4524, 0x524A, 0xD9FD, + 0xC7E6, 0x4C51, 0x5B3F, 0xD088, 0x75E3, 0xFE54, 0xE93A, 0x628D, + 0x285B, 0xA3EC, 0xB482, 0x3F35, 0x9A5E, 0x11E9, 0x0687, 0x8D30, + 0xE232, 0x6985, 0x7EEB, 0xF55C, 0x5037, 0xDB80, 0xCCEE, 0x4759, + 0x0D8F, 0x8638, 0x9156, 0x1AE1, 0xBF8A, 0x343D, 0x2353, 0xA8E4, + 0xB6FF, 0x3D48, 0x2A26, 0xA191, 0x04FA, 0x8F4D, 0x9823, 0x1394, + 0x5942, 0xD2F5, 0xC59B, 0x4E2C, 0xEB47, 0x60F0, 0x779E, 0xFC29, + 0x4BA8, 0xC01F, 0xD771, 0x5CC6, 0xF9AD, 0x721A, 0x6574, 0xEEC3, + 0xA415, 0x2FA2, 0x38CC, 0xB37B, 0x1610, 0x9DA7, 0x8AC9, 0x017E, + 0x1F65, 0x94D2, 0x83BC, 0x080B, 0xAD60, 0x26D7, 0x31B9, 0xBA0E, + 0xF0D8, 0x7B6F, 0x6C01, 0xE7B6, 0x42DD, 0xC96A, 0xDE04, 0x55B3 +}; + +__u16 crc_t10dif_generic(__u16 crc, const unsigned char *buffer, size_t len) +{ + unsigned int i; + + for (i = 0 ; i < len ; i++) + crc = (crc << 8) ^ t10_dif_crc_table[((crc >> 8) ^ buffer[i]) & 0xff]; + + return crc; +} +EXPORT_SYMBOL(crc_t10dif_generic); + +/* + * Steps through buffer one byte at at time, calculates reflected + * crc using table. + */ + +static int chksum_init(struct shash_desc *desc) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = 0; + + return 0; +} + +static int chksum_update(struct shash_desc *desc, const u8 *data, + unsigned int length) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + ctx->crc = crc_t10dif_generic(ctx->crc, data, length); + return 0; +} + +static int chksum_final(struct shash_desc *desc, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + *(__u16 *)out = ctx->crc; + return 0; +} + +static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, + u8 *out) +{ + *(__u16 *)out = crc_t10dif_generic(*crcp, data, len); + return 0; +} + +static int chksum_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, len, out); +} + +static int chksum_digest(struct shash_desc *desc, const u8 *data, + unsigned int length, u8 *out) +{ + struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); + + return __chksum_finup(&ctx->crc, data, length, out); +} + +static struct shash_alg alg = { + .digestsize = CRC_T10DIF_DIGEST_SIZE, + .init = chksum_init, + .update = chksum_update, + .final = chksum_final, + .finup = chksum_finup, + .digest = chksum_digest, + .descsize = sizeof(struct chksum_desc_ctx), + .base = { + .cra_name = "crct10dif", + .cra_driver_name = "crct10dif-generic", + .cra_priority = 100, + .cra_blocksize = CRC_T10DIF_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +static int __init crct10dif_mod_init(void) +{ + int ret; + + ret = crypto_register_shash(&alg); + return ret; +} + +static void __exit crct10dif_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(crct10dif_mod_init); +module_exit(crct10dif_mod_fini); + +MODULE_AUTHOR("Tim Chen "); +MODULE_DESCRIPTION("T10 DIF CRC calculation."); +MODULE_LICENSE("GPL"); diff -uprN linux-2.6.32/crypto/cryptd.c linux-2.6.32.ovz/crypto/cryptd.c --- linux-2.6.32/crypto/cryptd.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/cryptd.c 2015-03-10 13:23:55.000000000 -0700 @@ -3,6 +3,13 @@ * * Copyright (c) 2006 Herbert Xu * + * Added AEAD support to cryptd. + * Authors: Tadeusz Struk (tadeusz.struk@intel.com) + * Adrian Hoban + * Gabriele Paoloni + * Aidan O'Mahony (aidan.o.mahony@intel.com) + * Copyright (c) 2010, Intel Corporation. + * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) @@ -12,6 +19,7 @@ #include #include +#include #include #include #include @@ -44,6 +52,11 @@ struct hashd_instance_ctx { struct cryptd_queue *queue; }; +struct aead_instance_ctx { + struct crypto_aead_spawn aead_spawn; + struct cryptd_queue *queue; +}; + struct cryptd_blkcipher_ctx { struct crypto_blkcipher *child; }; @@ -61,6 +74,14 @@ struct cryptd_hash_request_ctx { struct shash_desc desc; }; +struct cryptd_aead_ctx { + struct crypto_aead *child; +}; + +struct cryptd_aead_request_ctx { + crypto_completion_t complete; +}; + static void cryptd_queue_worker(struct work_struct *work); static int cryptd_init_queue(struct cryptd_queue *queue, @@ -116,13 +137,18 @@ static void cryptd_queue_worker(struct w struct crypto_async_request *req, *backlog; cpu_queue = container_of(work, struct cryptd_cpu_queue, work); - /* Only handle one request at a time to avoid hogging crypto - * workqueue. preempt_disable/enable is used to prevent - * being preempted by cryptd_enqueue_request() */ + /* + * Only handle one request at a time to avoid hogging crypto workqueue. + * preempt_disable/enable is used to prevent being preempted by + * cryptd_enqueue_request(). local_bh_disable/enable is used to prevent + * cryptd_enqueue_request() being accessed from software interrupts. + */ + local_bh_disable(); preempt_disable(); backlog = crypto_get_backlog(&cpu_queue->queue); req = crypto_dequeue_request(&cpu_queue->queue); preempt_enable(); + local_bh_enable(); if (!req) return; @@ -601,6 +627,144 @@ out_put_alg: return err; } +static void cryptd_aead_crypt(struct aead_request *req, + struct crypto_aead *child, + int err, + int (*crypt)(struct aead_request *req)) +{ + struct cryptd_aead_request_ctx *rctx; + rctx = aead_request_ctx(req); + + if (unlikely(err == -EINPROGRESS)) + goto out; + aead_request_set_tfm(req, child); + err = crypt( req ); + req->base.complete = rctx->complete; +out: + local_bh_disable(); + rctx->complete(&req->base, err); + local_bh_enable(); +} + +static void cryptd_aead_encrypt(struct crypto_async_request *areq, int err) +{ + struct cryptd_aead_ctx *ctx = crypto_tfm_ctx(areq->tfm); + struct crypto_aead *child = ctx->child; + struct aead_request *req; + + req = container_of(areq, struct aead_request, base); + cryptd_aead_crypt(req, child, err, crypto_aead_crt(child)->encrypt); +} + +static void cryptd_aead_decrypt(struct crypto_async_request *areq, int err) +{ + struct cryptd_aead_ctx *ctx = crypto_tfm_ctx(areq->tfm); + struct crypto_aead *child = ctx->child; + struct aead_request *req; + + req = container_of(areq, struct aead_request, base); + cryptd_aead_crypt(req, child, err, crypto_aead_crt(child)->decrypt); +} + +static int cryptd_aead_enqueue(struct aead_request *req, + crypto_completion_t complete) +{ + struct cryptd_aead_request_ctx *rctx = aead_request_ctx(req); + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct cryptd_queue *queue = cryptd_get_queue(crypto_aead_tfm(tfm)); + + rctx->complete = req->base.complete; + req->base.complete = complete; + return cryptd_enqueue_request(queue, &req->base); +} + +static int cryptd_aead_encrypt_enqueue(struct aead_request *req) +{ + return cryptd_aead_enqueue(req, cryptd_aead_encrypt ); +} + +static int cryptd_aead_decrypt_enqueue(struct aead_request *req) +{ + return cryptd_aead_enqueue(req, cryptd_aead_decrypt ); +} + +static int cryptd_aead_init_tfm(struct crypto_tfm *tfm) +{ + struct crypto_instance *inst = crypto_tfm_alg_instance(tfm); + struct aead_instance_ctx *ictx = crypto_instance_ctx(inst); + struct crypto_aead_spawn *spawn = &ictx->aead_spawn; + struct cryptd_aead_ctx *ctx = crypto_tfm_ctx(tfm); + struct crypto_aead *cipher; + + cipher = crypto_spawn_aead(spawn); + if (IS_ERR(cipher)) + return PTR_ERR(cipher); + + crypto_aead_set_flags(cipher, CRYPTO_TFM_REQ_MAY_SLEEP); + ctx->child = cipher; + tfm->crt_aead.reqsize = sizeof(struct cryptd_aead_request_ctx); + return 0; +} + +static void cryptd_aead_exit_tfm(struct crypto_tfm *tfm) +{ + struct cryptd_aead_ctx *ctx = crypto_tfm_ctx(tfm); + crypto_free_aead(ctx->child); +} + +static int cryptd_create_aead(struct crypto_template *tmpl, + struct rtattr **tb, + struct cryptd_queue *queue) +{ + struct aead_instance_ctx *ctx; + struct crypto_instance *inst; + struct crypto_alg *alg; + int err; + + alg = crypto_get_attr_alg(tb, CRYPTO_ALG_TYPE_AEAD, + CRYPTO_ALG_TYPE_MASK); + if (IS_ERR(alg)) + return PTR_ERR(alg); + + inst = cryptd_alloc_instance(alg, 0, sizeof(*ctx)); + err = PTR_ERR(inst); + if (IS_ERR(inst)) + goto out_put_alg; + + ctx = crypto_instance_ctx(inst); + ctx->queue = queue; + + err = crypto_init_spawn(&ctx->aead_spawn.base, alg, inst, + CRYPTO_ALG_TYPE_MASK | CRYPTO_ALG_ASYNC); + if (err) + goto out_free_inst; + + inst->alg.cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC; + inst->alg.cra_type = alg->cra_type; + inst->alg.cra_ctxsize = sizeof(struct cryptd_aead_ctx); + inst->alg.cra_init = cryptd_aead_init_tfm; + inst->alg.cra_exit = cryptd_aead_exit_tfm; + inst->alg.cra_aead.setkey = alg->cra_aead.setkey; + inst->alg.cra_aead.setauthsize = alg->cra_aead.setauthsize; + inst->alg.cra_aead.geniv = alg->cra_aead.geniv; + inst->alg.cra_aead.ivsize = alg->cra_aead.ivsize; + inst->alg.cra_aead.maxauthsize = alg->cra_aead.maxauthsize; + inst->alg.cra_aead.encrypt = cryptd_aead_encrypt_enqueue; + inst->alg.cra_aead.decrypt = cryptd_aead_decrypt_enqueue; + inst->alg.cra_aead.givencrypt = alg->cra_aead.givencrypt; + inst->alg.cra_aead.givdecrypt = alg->cra_aead.givdecrypt; + + err = crypto_register_instance(tmpl, inst); + if (err) { + crypto_drop_spawn(&ctx->aead_spawn.base); +out_free_inst: + kfree(inst); + } +out_put_alg: + crypto_mod_put(alg); + return err; +} + static struct cryptd_queue queue; static int cryptd_create(struct crypto_template *tmpl, struct rtattr **tb) @@ -616,6 +780,8 @@ static int cryptd_create(struct crypto_t return cryptd_create_blkcipher(tmpl, tb, &queue); case CRYPTO_ALG_TYPE_DIGEST: return cryptd_create_hash(tmpl, tb, &queue); + case CRYPTO_ALG_TYPE_AEAD: + return cryptd_create_aead(tmpl, tb, &queue); } return -EINVAL; @@ -625,16 +791,21 @@ static void cryptd_free(struct crypto_in { struct cryptd_instance_ctx *ctx = crypto_instance_ctx(inst); struct hashd_instance_ctx *hctx = crypto_instance_ctx(inst); + struct aead_instance_ctx *aead_ctx = crypto_instance_ctx(inst); switch (inst->alg.cra_flags & CRYPTO_ALG_TYPE_MASK) { case CRYPTO_ALG_TYPE_AHASH: crypto_drop_shash(&hctx->spawn); kfree(ahash_instance(inst)); return; + case CRYPTO_ALG_TYPE_AEAD: + crypto_drop_spawn(&aead_ctx->aead_spawn.base); + kfree(inst); + return; + default: + crypto_drop_spawn(&ctx->spawn); + kfree(inst); } - - crypto_drop_spawn(&ctx->spawn); - kfree(inst); } static struct crypto_template cryptd_tmpl = { @@ -711,12 +882,53 @@ struct crypto_shash *cryptd_ahash_child( } EXPORT_SYMBOL_GPL(cryptd_ahash_child); +struct shash_desc *cryptd_shash_desc(struct ahash_request *req) +{ + struct cryptd_hash_request_ctx *rctx = ahash_request_ctx(req); + return &rctx->desc; +} +EXPORT_SYMBOL_GPL(cryptd_shash_desc); + void cryptd_free_ahash(struct cryptd_ahash *tfm) { crypto_free_ahash(&tfm->base); } EXPORT_SYMBOL_GPL(cryptd_free_ahash); +struct cryptd_aead *cryptd_alloc_aead(const char *alg_name, + u32 type, u32 mask) +{ + char cryptd_alg_name[CRYPTO_MAX_ALG_NAME]; + struct crypto_aead *tfm; + + if (snprintf(cryptd_alg_name, CRYPTO_MAX_ALG_NAME, + "cryptd(%s)", alg_name) >= CRYPTO_MAX_ALG_NAME) + return ERR_PTR(-EINVAL); + tfm = crypto_alloc_aead(cryptd_alg_name, type, mask); + if (IS_ERR(tfm)) + return ERR_CAST(tfm); + if (tfm->base.__crt_alg->cra_module != THIS_MODULE) { + crypto_free_aead(tfm); + return ERR_PTR(-EINVAL); + } + return __cryptd_aead_cast(tfm); +} +EXPORT_SYMBOL_GPL(cryptd_alloc_aead); + +struct crypto_aead *cryptd_aead_child(struct cryptd_aead *tfm) +{ + struct cryptd_aead_ctx *ctx; + ctx = crypto_aead_ctx(&tfm->base); + return ctx->child; +} +EXPORT_SYMBOL_GPL(cryptd_aead_child); + +void cryptd_free_aead(struct cryptd_aead *tfm) +{ + crypto_free_aead(&tfm->base); +} +EXPORT_SYMBOL_GPL(cryptd_free_aead); + static int __init cryptd_init(void) { int err; @@ -738,7 +950,7 @@ static void __exit cryptd_exit(void) crypto_unregister_template(&cryptd_tmpl); } -module_init(cryptd_init); +subsys_initcall(cryptd_init); module_exit(cryptd_exit); MODULE_LICENSE("GPL"); diff -uprN linux-2.6.32/crypto/ctr.c linux-2.6.32.ovz/crypto/ctr.c --- linux-2.6.32/crypto/ctr.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/ctr.c 2015-03-10 13:23:55.000000000 -0700 @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -25,10 +26,15 @@ struct crypto_ctr_ctx { }; struct crypto_rfc3686_ctx { - struct crypto_blkcipher *child; + struct crypto_ablkcipher *child; u8 nonce[CTR_RFC3686_NONCE_SIZE]; }; +struct crypto_rfc3686_req_ctx { + u8 iv[CTR_RFC3686_BLOCK_SIZE]; + struct ablkcipher_request subreq CRYPTO_MINALIGN_ATTR; +}; + static int crypto_ctr_setkey(struct crypto_tfm *parent, const u8 *key, unsigned int keylen) { @@ -243,11 +249,11 @@ static struct crypto_template crypto_ctr .module = THIS_MODULE, }; -static int crypto_rfc3686_setkey(struct crypto_tfm *parent, const u8 *key, - unsigned int keylen) +static int crypto_rfc3686_setkey(struct crypto_ablkcipher *parent, + const u8 *key, unsigned int keylen) { - struct crypto_rfc3686_ctx *ctx = crypto_tfm_ctx(parent); - struct crypto_blkcipher *child = ctx->child; + struct crypto_rfc3686_ctx *ctx = crypto_ablkcipher_ctx(parent); + struct crypto_ablkcipher *child = ctx->child; int err; /* the nonce is stored in bytes at end of key */ @@ -259,59 +265,64 @@ static int crypto_rfc3686_setkey(struct keylen -= CTR_RFC3686_NONCE_SIZE; - crypto_blkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); - crypto_blkcipher_set_flags(child, crypto_tfm_get_flags(parent) & - CRYPTO_TFM_REQ_MASK); - err = crypto_blkcipher_setkey(child, key, keylen); - crypto_tfm_set_flags(parent, crypto_blkcipher_get_flags(child) & - CRYPTO_TFM_RES_MASK); + crypto_ablkcipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_ablkcipher_set_flags(child, crypto_ablkcipher_get_flags(parent) & + CRYPTO_TFM_REQ_MASK); + err = crypto_ablkcipher_setkey(child, key, keylen); + crypto_ablkcipher_set_flags(parent, crypto_ablkcipher_get_flags(child) & + CRYPTO_TFM_RES_MASK); return err; } -static int crypto_rfc3686_crypt(struct blkcipher_desc *desc, - struct scatterlist *dst, - struct scatterlist *src, unsigned int nbytes) +static int crypto_rfc3686_crypt(struct ablkcipher_request *req) { - struct crypto_blkcipher *tfm = desc->tfm; - struct crypto_rfc3686_ctx *ctx = crypto_blkcipher_ctx(tfm); - struct crypto_blkcipher *child = ctx->child; - unsigned long alignmask = crypto_blkcipher_alignmask(tfm); - u8 ivblk[CTR_RFC3686_BLOCK_SIZE + alignmask]; - u8 *iv = PTR_ALIGN(ivblk + 0, alignmask + 1); - u8 *info = desc->info; - int err; + struct crypto_ablkcipher *tfm = crypto_ablkcipher_reqtfm(req); + struct crypto_rfc3686_ctx *ctx = crypto_ablkcipher_ctx(tfm); + struct crypto_ablkcipher *child = ctx->child; + unsigned long align = crypto_ablkcipher_alignmask(tfm); + struct crypto_rfc3686_req_ctx *rctx = + (void *)PTR_ALIGN((u8 *)ablkcipher_request_ctx(req), align + 1); + struct ablkcipher_request *subreq = &rctx->subreq; + u8 *iv = rctx->iv; /* set up counter block */ memcpy(iv, ctx->nonce, CTR_RFC3686_NONCE_SIZE); - memcpy(iv + CTR_RFC3686_NONCE_SIZE, info, CTR_RFC3686_IV_SIZE); + memcpy(iv + CTR_RFC3686_NONCE_SIZE, req->info, CTR_RFC3686_IV_SIZE); /* initialize counter portion of counter block */ *(__be32 *)(iv + CTR_RFC3686_NONCE_SIZE + CTR_RFC3686_IV_SIZE) = cpu_to_be32(1); - desc->tfm = child; - desc->info = iv; - err = crypto_blkcipher_encrypt_iv(desc, dst, src, nbytes); - desc->tfm = tfm; - desc->info = info; + ablkcipher_request_set_tfm(subreq, child); + ablkcipher_request_set_callback(subreq, req->base.flags, + req->base.complete, req->base.data); + ablkcipher_request_set_crypt(subreq, req->src, req->dst, req->nbytes, + iv); - return err; + return crypto_ablkcipher_encrypt(subreq); } static int crypto_rfc3686_init_tfm(struct crypto_tfm *tfm) { struct crypto_instance *inst = (void *)tfm->__crt_alg; - struct crypto_spawn *spawn = crypto_instance_ctx(inst); + struct crypto_skcipher_spawn *spawn = crypto_instance_ctx(inst); struct crypto_rfc3686_ctx *ctx = crypto_tfm_ctx(tfm); - struct crypto_blkcipher *cipher; + struct crypto_ablkcipher *cipher; + unsigned long align; - cipher = crypto_spawn_blkcipher(spawn); + cipher = crypto_spawn_skcipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; + align = crypto_tfm_alg_alignmask(tfm); + align &= ~(crypto_tfm_ctx_alignment() - 1); + tfm->crt_ablkcipher.reqsize = align + + sizeof(struct crypto_rfc3686_req_ctx) + + crypto_ablkcipher_reqsize(cipher); + return 0; } @@ -319,74 +330,110 @@ static void crypto_rfc3686_exit_tfm(stru { struct crypto_rfc3686_ctx *ctx = crypto_tfm_ctx(tfm); - crypto_free_blkcipher(ctx->child); + crypto_free_ablkcipher(ctx->child); } static struct crypto_instance *crypto_rfc3686_alloc(struct rtattr **tb) { + struct crypto_attr_type *algt; struct crypto_instance *inst; struct crypto_alg *alg; + struct crypto_skcipher_spawn *spawn; + const char *cipher_name; int err; - err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_BLKCIPHER); - if (err) + algt = crypto_get_attr_type(tb); + err = PTR_ERR(algt); + if (IS_ERR(algt)) return ERR_PTR(err); - alg = crypto_attr_alg(tb[1], CRYPTO_ALG_TYPE_BLKCIPHER, - CRYPTO_ALG_TYPE_MASK); - err = PTR_ERR(alg); - if (IS_ERR(alg)) + if ((algt->type ^ CRYPTO_ALG_TYPE_BLKCIPHER) & algt->mask) + return ERR_PTR(-EINVAL); + + cipher_name = crypto_attr_alg_name(tb[1]); + err = PTR_ERR(cipher_name); + if (IS_ERR(cipher_name)) return ERR_PTR(err); + inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); + if (!inst) + return ERR_PTR(-ENOMEM); + + spawn = crypto_instance_ctx(inst); + + crypto_set_skcipher_spawn(spawn, inst); + err = crypto_grab_skcipher(spawn, cipher_name, 0, + crypto_requires_sync(algt->type, + algt->mask)); + if (err) + goto err_free_inst; + + alg = crypto_skcipher_spawn_alg(spawn); + /* We only support 16-byte blocks. */ err = -EINVAL; - if (alg->cra_blkcipher.ivsize != CTR_RFC3686_BLOCK_SIZE) - goto out_put_alg; + if (alg->cra_ablkcipher.ivsize != CTR_RFC3686_BLOCK_SIZE) + goto err_drop_spawn; /* Not a stream cipher? */ if (alg->cra_blocksize != 1) - goto out_put_alg; + goto err_drop_spawn; - inst = crypto_alloc_instance("rfc3686", alg); - if (IS_ERR(inst)) - goto out; + err = -ENAMETOOLONG; + if (snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME, "rfc3686(%s)", + alg->cra_name) >= CRYPTO_MAX_ALG_NAME) + goto err_drop_spawn; + if (snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, + "rfc3686(%s)", alg->cra_driver_name) >= + CRYPTO_MAX_ALG_NAME) + goto err_drop_spawn; - inst->alg.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER; inst->alg.cra_priority = alg->cra_priority; inst->alg.cra_blocksize = 1; inst->alg.cra_alignmask = alg->cra_alignmask; - inst->alg.cra_type = &crypto_blkcipher_type; - inst->alg.cra_blkcipher.ivsize = CTR_RFC3686_IV_SIZE; - inst->alg.cra_blkcipher.min_keysize = alg->cra_blkcipher.min_keysize - + CTR_RFC3686_NONCE_SIZE; - inst->alg.cra_blkcipher.max_keysize = alg->cra_blkcipher.max_keysize - + CTR_RFC3686_NONCE_SIZE; - - inst->alg.cra_blkcipher.geniv = "seqiv"; + inst->alg.cra_flags = CRYPTO_ALG_TYPE_ABLKCIPHER | + (alg->cra_flags & CRYPTO_ALG_ASYNC); + inst->alg.cra_type = &crypto_ablkcipher_type; + + inst->alg.cra_ablkcipher.ivsize = CTR_RFC3686_IV_SIZE; + inst->alg.cra_ablkcipher.min_keysize = + alg->cra_ablkcipher.min_keysize + CTR_RFC3686_NONCE_SIZE; + inst->alg.cra_ablkcipher.max_keysize = + alg->cra_ablkcipher.max_keysize + CTR_RFC3686_NONCE_SIZE; + + inst->alg.cra_ablkcipher.geniv = "seqiv"; + + inst->alg.cra_ablkcipher.setkey = crypto_rfc3686_setkey; + inst->alg.cra_ablkcipher.encrypt = crypto_rfc3686_crypt; + inst->alg.cra_ablkcipher.decrypt = crypto_rfc3686_crypt; inst->alg.cra_ctxsize = sizeof(struct crypto_rfc3686_ctx); inst->alg.cra_init = crypto_rfc3686_init_tfm; inst->alg.cra_exit = crypto_rfc3686_exit_tfm; - inst->alg.cra_blkcipher.setkey = crypto_rfc3686_setkey; - inst->alg.cra_blkcipher.encrypt = crypto_rfc3686_crypt; - inst->alg.cra_blkcipher.decrypt = crypto_rfc3686_crypt; - -out: - crypto_mod_put(alg); return inst; -out_put_alg: - inst = ERR_PTR(err); - goto out; +err_drop_spawn: + crypto_drop_skcipher(spawn); +err_free_inst: + kfree(inst); + return ERR_PTR(err); +} + +static void crypto_rfc3686_free(struct crypto_instance *inst) +{ + struct crypto_skcipher_spawn *spawn = crypto_instance_ctx(inst); + + crypto_drop_skcipher(spawn); + kfree(inst); } static struct crypto_template crypto_rfc3686_tmpl = { .name = "rfc3686", .alloc = crypto_rfc3686_alloc, - .free = crypto_ctr_free, + .free = crypto_rfc3686_free, .module = THIS_MODULE, }; diff -uprN linux-2.6.32/crypto/drbg.c linux-2.6.32.ovz/crypto/drbg.c --- linux-2.6.32/crypto/drbg.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/drbg.c 2015-03-10 13:24:14.000000000 -0700 @@ -0,0 +1,2044 @@ +/* + * DRBG: Deterministic Random Bits Generator + * Based on NIST Recommended DRBG from NIST SP800-90A with the following + * properties: + * * CTR DRBG with DF with AES-128, AES-192, AES-256 cores + * * Hash DRBG with DF with SHA-1, SHA-256, SHA-384, SHA-512 cores + * * HMAC DRBG with DF with SHA-1, SHA-256, SHA-384, SHA-512 cores + * * with and without prediction resistance + * + * Copyright Stephan Mueller , 2014 + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * ALTERNATIVELY, this product may be distributed under the terms of + * the GNU General Public License, in which case the provisions of the GPL are + * required INSTEAD OF the above restrictions. (This clause is + * necessary due to a potential bad interaction between the GPL and + * the restrictions contained in a BSD-style copyright.) + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF + * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. + * + * DRBG Usage + * ========== + * The SP 800-90A DRBG allows the user to specify a personalization string + * for initialization as well as an additional information string for each + * random number request. The following code fragments show how a caller + * uses the kernel crypto API to use the full functionality of the DRBG. + * + * Usage without any additional data + * --------------------------------- + * struct crypto_rng *drng; + * int err; + * char data[DATALEN]; + * + * drng = crypto_alloc_rng(drng_name, 0, 0); + * err = crypto_rng_get_bytes(drng, &data, DATALEN); + * crypto_free_rng(drng); + * + * + * Usage with personalization string during initialization + * ------------------------------------------------------- + * struct crypto_rng *drng; + * int err; + * char data[DATALEN]; + * struct drbg_string pers; + * char personalization[11] = "some-string"; + * + * drbg_string_fill(&pers, personalization, strlen(personalization)); + * drng = crypto_alloc_rng(drng_name, 0, 0); + * // The reset completely re-initializes the DRBG with the provided + * // personalization string + * err = crypto_rng_reset(drng, &personalization, strlen(personalization)); + * err = crypto_rng_get_bytes(drng, &data, DATALEN); + * crypto_free_rng(drng); + * + * + * Usage with additional information string during random number request + * --------------------------------------------------------------------- + * struct crypto_rng *drng; + * int err; + * char data[DATALEN]; + * char addtl_string[11] = "some-string"; + * string drbg_string addtl; + * + * drbg_string_fill(&addtl, addtl_string, strlen(addtl_string)); + * drng = crypto_alloc_rng(drng_name, 0, 0); + * // The following call is a wrapper to crypto_rng_get_bytes() and returns + * // the same error codes. + * err = crypto_drbg_get_bytes_addtl(drng, &data, DATALEN, &addtl); + * crypto_free_rng(drng); + * + * + * Usage with personalization and additional information strings + * ------------------------------------------------------------- + * Just mix both scenarios above. + */ + +#include + +/*************************************************************** + * Backend cipher definitions available to DRBG + ***************************************************************/ + +/* + * The order of the DRBG definitions here matter: every DRBG is registered + * as stdrng. Each DRBG receives an increasing cra_priority values the later + * they are defined in this array (see drbg_fill_array). + * + * HMAC DRBGs are favored over Hash DRBGs over CTR DRBGs, and + * the SHA256 / AES 256 over other ciphers. Thus, the favored + * DRBGs are the latest entries in this array. + */ +static const struct drbg_core drbg_cores[] = { +#ifdef CONFIG_CRYPTO_DRBG_CTR + { + .flags = DRBG_CTR | DRBG_STRENGTH128, + .statelen = 32, /* 256 bits as defined in 10.2.1 */ + .max_addtllen = 35, + .max_bits = 19, + .max_req = 48, + .blocklen_bytes = 16, + .cra_name = "ctr_aes128", + .backend_cra_name = "ecb(aes)", + }, { + .flags = DRBG_CTR | DRBG_STRENGTH192, + .statelen = 40, /* 320 bits as defined in 10.2.1 */ + .max_addtllen = 35, + .max_bits = 19, + .max_req = 48, + .blocklen_bytes = 16, + .cra_name = "ctr_aes192", + .backend_cra_name = "ecb(aes)", + }, { + .flags = DRBG_CTR | DRBG_STRENGTH256, + .statelen = 48, /* 384 bits as defined in 10.2.1 */ + .max_addtllen = 35, + .max_bits = 19, + .max_req = 48, + .blocklen_bytes = 16, + .cra_name = "ctr_aes256", + .backend_cra_name = "ecb(aes)", + }, +#endif /* CONFIG_CRYPTO_DRBG_CTR */ +#ifdef CONFIG_CRYPTO_DRBG_HASH + { + .flags = DRBG_HASH | DRBG_STRENGTH128, + .statelen = 55, /* 440 bits */ + .max_addtllen = 35, + .max_bits = 19, + .max_req = 48, + .blocklen_bytes = 20, + .cra_name = "sha1", + .backend_cra_name = "sha1", + }, { + .flags = DRBG_HASH | DRBG_STRENGTH256, + .statelen = 111, /* 888 bits */ + .max_addtllen = 35, + .max_bits = 19, + .max_req = 48, + .blocklen_bytes = 48, + .cra_name = "sha384", + .backend_cra_name = "sha384", + }, { + .flags = DRBG_HASH | DRBG_STRENGTH256, + .statelen = 111, /* 888 bits */ + .max_addtllen = 35, + .max_bits = 19, + .max_req = 48, + .blocklen_bytes = 64, + .cra_name = "sha512", + .backend_cra_name = "sha512", + }, { + .flags = DRBG_HASH | DRBG_STRENGTH256, + .statelen = 55, /* 440 bits */ + .max_addtllen = 35, + .max_bits = 19, + .max_req = 48, + .blocklen_bytes = 32, + .cra_name = "sha256", + .backend_cra_name = "sha256", + }, +#endif /* CONFIG_CRYPTO_DRBG_HASH */ +#ifdef CONFIG_CRYPTO_DRBG_HMAC + { + .flags = DRBG_HMAC | DRBG_STRENGTH128, + .statelen = 20, /* block length of cipher */ + .max_addtllen = 35, + .max_bits = 19, + .max_req = 48, + .blocklen_bytes = 20, + .cra_name = "hmac_sha1", + .backend_cra_name = "hmac(sha1)", + }, { + .flags = DRBG_HMAC | DRBG_STRENGTH256, + .statelen = 48, /* block length of cipher */ + .max_addtllen = 35, + .max_bits = 19, + .max_req = 48, + .blocklen_bytes = 48, + .cra_name = "hmac_sha384", + .backend_cra_name = "hmac(sha384)", + }, { + .flags = DRBG_HMAC | DRBG_STRENGTH256, + .statelen = 64, /* block length of cipher */ + .max_addtllen = 35, + .max_bits = 19, + .max_req = 48, + .blocklen_bytes = 64, + .cra_name = "hmac_sha512", + .backend_cra_name = "hmac(sha512)", + }, { + .flags = DRBG_HMAC | DRBG_STRENGTH256, + .statelen = 32, /* block length of cipher */ + .max_addtllen = 35, + .max_bits = 19, + .max_req = 48, + .blocklen_bytes = 32, + .cra_name = "hmac_sha256", + .backend_cra_name = "hmac(sha256)", + }, +#endif /* CONFIG_CRYPTO_DRBG_HMAC */ +}; + +/****************************************************************** + * Generic helper functions + ******************************************************************/ + +/* + * Return strength of DRBG according to SP800-90A section 8.4 + * + * @flags DRBG flags reference + * + * Return: normalized strength in *bytes* value or 32 as default + * to counter programming errors + */ +static inline unsigned short drbg_sec_strength(drbg_flag_t flags) +{ + switch (flags & DRBG_STRENGTH_MASK) { + case DRBG_STRENGTH128: + return 16; + case DRBG_STRENGTH192: + return 24; + case DRBG_STRENGTH256: + return 32; + default: + return 32; + } +} + +/* + * FIPS 140-2 continuous self test + * The test is performed on the result of one round of the output + * function. Thus, the function implicitly knows the size of the + * buffer. + * + * The FIPS test can be called in an endless loop until it returns + * true. Although the code looks like a potential for a deadlock, it + * is not the case, because returning a false cannot mathematically + * occur (except once when a reseed took place and the updated state + * would is now set up such that the generation of new value returns + * an identical one -- this is most unlikely and would happen only once). + * Thus, if this function repeatedly returns false and thus would cause + * a deadlock, the integrity of the entire kernel is lost. + * + * @drbg DRBG handle + * @buf output buffer of random data to be checked + * + * return: + * true on success + * false on error + */ +static bool drbg_fips_continuous_test(struct drbg_state *drbg, + const unsigned char *buf) +{ +#ifdef CONFIG_CRYPTO_FIPS + int ret = 0; + /* skip test if we test the overall system */ + if (drbg->test_data) + return true; + /* only perform test in FIPS mode */ + if (0 == fips_enabled) + return true; + if (!drbg->fips_primed) { + /* Priming of FIPS test */ + memcpy(drbg->prev, buf, drbg_blocklen(drbg)); + drbg->fips_primed = true; + /* return false due to priming, i.e. another round is needed */ + return false; + } + ret = memcmp(drbg->prev, buf, drbg_blocklen(drbg)); + memcpy(drbg->prev, buf, drbg_blocklen(drbg)); + /* the test shall pass when the two compared values are not equal */ + return ret != 0; +#else + return true; +#endif /* CONFIG_CRYPTO_FIPS */ +} + +/* + * Convert an integer into a byte representation of this integer. + * The byte representation is big-endian + * + * @buf buffer holding the converted integer + * @val value to be converted + * @buflen length of buffer + */ +#if (defined(CONFIG_CRYPTO_DRBG_HASH) || defined(CONFIG_CRYPTO_DRBG_CTR)) +static inline void drbg_int2byte(unsigned char *buf, uint64_t val, + size_t buflen) +{ + unsigned char *byte; + uint64_t i; + + byte = buf + (buflen - 1); + for (i = 0; i < buflen; i++) + *(byte--) = val >> (i * 8) & 0xff; +} + +/* + * Increment buffer + * + * @dst buffer to increment + * @add value to add + */ +static inline void drbg_add_buf(unsigned char *dst, size_t dstlen, + const unsigned char *add, size_t addlen) +{ + /* implied: dstlen > addlen */ + unsigned char *dstptr; + const unsigned char *addptr; + unsigned int remainder = 0; + size_t len = addlen; + + dstptr = dst + (dstlen-1); + addptr = add + (addlen-1); + while (len) { + remainder += *dstptr + *addptr; + *dstptr = remainder & 0xff; + remainder >>= 8; + len--; dstptr--; addptr--; + } + len = dstlen - addlen; + while (len && remainder > 0) { + remainder = *dstptr + 1; + *dstptr = remainder & 0xff; + remainder >>= 8; + len--; dstptr--; + } +} +#endif /* defined(CONFIG_CRYPTO_DRBG_HASH) || defined(CONFIG_CRYPTO_DRBG_CTR) */ + +/****************************************************************** + * CTR DRBG callback functions + ******************************************************************/ + +#ifdef CONFIG_CRYPTO_DRBG_CTR +#define CRYPTO_DRBG_CTR_STRING "CTR " +static int drbg_kcapi_sym(struct drbg_state *drbg, const unsigned char *key, + unsigned char *outval, const struct drbg_string *in); +static int drbg_init_sym_kernel(struct drbg_state *drbg); +static int drbg_fini_sym_kernel(struct drbg_state *drbg); + +/* BCC function for CTR DRBG as defined in 10.4.3 */ +static int drbg_ctr_bcc(struct drbg_state *drbg, + unsigned char *out, const unsigned char *key, + struct list_head *in) +{ + int ret = 0; + struct drbg_string *curr = NULL; + struct drbg_string data; + short cnt = 0; + + drbg_string_fill(&data, out, drbg_blocklen(drbg)); + + /* 10.4.3 step 1 */ + memset(out, 0, drbg_blocklen(drbg)); + + /* 10.4.3 step 2 / 4 */ + list_for_each_entry(curr, in, list) { + const unsigned char *pos = curr->buf; + size_t len = curr->len; + /* 10.4.3 step 4.1 */ + while (len) { + /* 10.4.3 step 4.2 */ + if (drbg_blocklen(drbg) == cnt) { + cnt = 0; + ret = drbg_kcapi_sym(drbg, key, out, &data); + if (ret) + return ret; + } + out[cnt] ^= *pos; + pos++; + cnt++; + len--; + } + } + /* 10.4.3 step 4.2 for last block */ + if (cnt) + ret = drbg_kcapi_sym(drbg, key, out, &data); + + return ret; +} + +/* + * scratchpad usage: drbg_ctr_update is interlinked with drbg_ctr_df + * (and drbg_ctr_bcc, but this function does not need any temporary buffers), + * the scratchpad is used as follows: + * drbg_ctr_update: + * temp + * start: drbg->scratchpad + * length: drbg_statelen(drbg) + drbg_blocklen(drbg) + * note: the cipher writing into this variable works + * blocklen-wise. Now, when the statelen is not a multiple + * of blocklen, the generateion loop below "spills over" + * by at most blocklen. Thus, we need to give sufficient + * memory. + * df_data + * start: drbg->scratchpad + + * drbg_statelen(drbg) + drbg_blocklen(drbg) + * length: drbg_statelen(drbg) + * + * drbg_ctr_df: + * pad + * start: df_data + drbg_statelen(drbg) + * length: drbg_blocklen(drbg) + * iv + * start: pad + drbg_blocklen(drbg) + * length: drbg_blocklen(drbg) + * temp + * start: iv + drbg_blocklen(drbg) + * length: drbg_satelen(drbg) + drbg_blocklen(drbg) + * note: temp is the buffer that the BCC function operates + * on. BCC operates blockwise. drbg_statelen(drbg) + * is sufficient when the DRBG state length is a multiple + * of the block size. For AES192 (and maybe other ciphers) + * this is not correct and the length for temp is + * insufficient (yes, that also means for such ciphers, + * the final output of all BCC rounds are truncated). + * Therefore, add drbg_blocklen(drbg) to cover all + * possibilities. + */ + +/* Derivation Function for CTR DRBG as defined in 10.4.2 */ +static int drbg_ctr_df(struct drbg_state *drbg, + unsigned char *df_data, size_t bytes_to_return, + struct list_head *seedlist) +{ + int ret = -EFAULT; + unsigned char L_N[8]; + /* S3 is input */ + struct drbg_string S1, S2, S4, cipherin; + LIST_HEAD(bcc_list); + unsigned char *pad = df_data + drbg_statelen(drbg); + unsigned char *iv = pad + drbg_blocklen(drbg); + unsigned char *temp = iv + drbg_blocklen(drbg); + size_t padlen = 0; + unsigned int templen = 0; + /* 10.4.2 step 7 */ + unsigned int i = 0; + /* 10.4.2 step 8 */ + const unsigned char *K = (unsigned char *) + "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17" + "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"; + unsigned char *X; + size_t generated_len = 0; + size_t inputlen = 0; + struct drbg_string *seed = NULL; + + memset(pad, 0, drbg_blocklen(drbg)); + memset(iv, 0, drbg_blocklen(drbg)); + memset(temp, 0, drbg_statelen(drbg)); + + /* 10.4.2 step 1 is implicit as we work byte-wise */ + + /* 10.4.2 step 2 */ + if ((512/8) < bytes_to_return) + return -EINVAL; + + /* 10.4.2 step 2 -- calculate the entire length of all input data */ + list_for_each_entry(seed, seedlist, list) + inputlen += seed->len; + drbg_int2byte(&L_N[0], inputlen, 4); + + /* 10.4.2 step 3 */ + drbg_int2byte(&L_N[4], bytes_to_return, 4); + + /* 10.4.2 step 5: length is L_N, input_string, one byte, padding */ + padlen = (inputlen + sizeof(L_N) + 1) % (drbg_blocklen(drbg)); + /* wrap the padlen appropriately */ + if (padlen) + padlen = drbg_blocklen(drbg) - padlen; + /* + * pad / padlen contains the 0x80 byte and the following zero bytes. + * As the calculated padlen value only covers the number of zero + * bytes, this value has to be incremented by one for the 0x80 byte. + */ + padlen++; + pad[0] = 0x80; + + /* 10.4.2 step 4 -- first fill the linked list and then order it */ + drbg_string_fill(&S1, iv, drbg_blocklen(drbg)); + list_add_tail(&S1.list, &bcc_list); + drbg_string_fill(&S2, L_N, sizeof(L_N)); + list_add_tail(&S2.list, &bcc_list); + list_splice_tail(seedlist, &bcc_list); + drbg_string_fill(&S4, pad, padlen); + list_add_tail(&S4.list, &bcc_list); + + /* 10.4.2 step 9 */ + while (templen < (drbg_keylen(drbg) + (drbg_blocklen(drbg)))) { + /* + * 10.4.2 step 9.1 - the padding is implicit as the buffer + * holds zeros after allocation -- even the increment of i + * is irrelevant as the increment remains within length of i + */ + drbg_int2byte(iv, i, 4); + /* 10.4.2 step 9.2 -- BCC and concatenation with temp */ + ret = drbg_ctr_bcc(drbg, temp + templen, K, &bcc_list); + if (ret) + goto out; + /* 10.4.2 step 9.3 */ + i++; + templen += drbg_blocklen(drbg); + } + + /* 10.4.2 step 11 */ + X = temp + (drbg_keylen(drbg)); + drbg_string_fill(&cipherin, X, drbg_blocklen(drbg)); + + /* 10.4.2 step 12: overwriting of outval is implemented in next step */ + + /* 10.4.2 step 13 */ + while (generated_len < bytes_to_return) { + short blocklen = 0; + /* + * 10.4.2 step 13.1: the truncation of the key length is + * implicit as the key is only drbg_blocklen in size based on + * the implementation of the cipher function callback + */ + ret = drbg_kcapi_sym(drbg, temp, X, &cipherin); + if (ret) + goto out; + blocklen = (drbg_blocklen(drbg) < + (bytes_to_return - generated_len)) ? + drbg_blocklen(drbg) : + (bytes_to_return - generated_len); + /* 10.4.2 step 13.2 and 14 */ + memcpy(df_data + generated_len, X, blocklen); + generated_len += blocklen; + } + + ret = 0; + +out: + memset(iv, 0, drbg_blocklen(drbg)); + memset(temp, 0, drbg_statelen(drbg)); + memset(pad, 0, drbg_blocklen(drbg)); + return ret; +} + +/* + * update function of CTR DRBG as defined in 10.2.1.2 + * + * The reseed variable has an enhanced meaning compared to the update + * functions of the other DRBGs as follows: + * 0 => initial seed from initialization + * 1 => reseed via drbg_seed + * 2 => first invocation from drbg_ctr_update when addtl is present. In + * this case, the df_data scratchpad is not deleted so that it is + * available for another calls to prevent calling the DF function + * again. + * 3 => second invocation from drbg_ctr_update. When the update function + * was called with addtl, the df_data memory already contains the + * DFed addtl information and we do not need to call DF again. + */ +static int drbg_ctr_update(struct drbg_state *drbg, struct list_head *seed, + int reseed) +{ + int ret = -EFAULT; + /* 10.2.1.2 step 1 */ + unsigned char *temp = drbg->scratchpad; + unsigned char *df_data = drbg->scratchpad + drbg_statelen(drbg) + + drbg_blocklen(drbg); + unsigned char *temp_p, *df_data_p; /* pointer to iterate over buffers */ + unsigned int len = 0; + struct drbg_string cipherin; + unsigned char prefix = DRBG_PREFIX1; + + memset(temp, 0, drbg_statelen(drbg) + drbg_blocklen(drbg)); + if (3 > reseed) + memset(df_data, 0, drbg_statelen(drbg)); + + /* 10.2.1.3.2 step 2 and 10.2.1.4.2 step 2 */ + if (seed) { + ret = drbg_ctr_df(drbg, df_data, drbg_statelen(drbg), seed); + if (ret) + goto out; + } + + drbg_string_fill(&cipherin, drbg->V, drbg_blocklen(drbg)); + /* + * 10.2.1.3.2 steps 2 and 3 are already covered as the allocation + * zeroizes all memory during initialization + */ + while (len < (drbg_statelen(drbg))) { + /* 10.2.1.2 step 2.1 */ + drbg_add_buf(drbg->V, drbg_blocklen(drbg), &prefix, 1); + /* + * 10.2.1.2 step 2.2 */ + ret = drbg_kcapi_sym(drbg, drbg->C, temp + len, &cipherin); + if (ret) + goto out; + /* 10.2.1.2 step 2.3 and 3 */ + len += drbg_blocklen(drbg); + } + + /* 10.2.1.2 step 4 */ + temp_p = temp; + df_data_p = df_data; + for (len = 0; len < drbg_statelen(drbg); len++) { + *temp_p ^= *df_data_p; + df_data_p++; temp_p++; + } + + /* 10.2.1.2 step 5 */ + memcpy(drbg->C, temp, drbg_keylen(drbg)); + /* 10.2.1.2 step 6 */ + memcpy(drbg->V, temp + drbg_keylen(drbg), drbg_blocklen(drbg)); + ret = 0; + +out: + memset(temp, 0, drbg_statelen(drbg) + drbg_blocklen(drbg)); + if (2 != reseed) + memset(df_data, 0, drbg_statelen(drbg)); + return ret; +} + +/* + * scratchpad use: drbg_ctr_update is called independently from + * drbg_ctr_extract_bytes. Therefore, the scratchpad is reused + */ +/* Generate function of CTR DRBG as defined in 10.2.1.5.2 */ +static int drbg_ctr_generate(struct drbg_state *drbg, + unsigned char *buf, unsigned int buflen, + struct list_head *addtl) +{ + int len = 0; + int ret = 0; + struct drbg_string data; + unsigned char prefix = DRBG_PREFIX1; + + memset(drbg->scratchpad, 0, drbg_blocklen(drbg)); + + /* 10.2.1.5.2 step 2 */ + if (addtl && !list_empty(addtl)) { + ret = drbg_ctr_update(drbg, addtl, 2); + if (ret) + return 0; + } + + /* 10.2.1.5.2 step 4.1 */ + drbg_add_buf(drbg->V, drbg_blocklen(drbg), &prefix, 1); + drbg_string_fill(&data, drbg->V, drbg_blocklen(drbg)); + while (len < buflen) { + int outlen = 0; + /* 10.2.1.5.2 step 4.2 */ + ret = drbg_kcapi_sym(drbg, drbg->C, drbg->scratchpad, &data); + if (ret) { + len = ret; + goto out; + } + outlen = (drbg_blocklen(drbg) < (buflen - len)) ? + drbg_blocklen(drbg) : (buflen - len); + if (!drbg_fips_continuous_test(drbg, drbg->scratchpad)) { + /* 10.2.1.5.2 step 6 */ + drbg_add_buf(drbg->V, drbg_blocklen(drbg), &prefix, 1); + continue; + } + /* 10.2.1.5.2 step 4.3 */ + memcpy(buf + len, drbg->scratchpad, outlen); + len += outlen; + /* 10.2.1.5.2 step 6 */ + if (len < buflen) + drbg_add_buf(drbg->V, drbg_blocklen(drbg), &prefix, 1); + } + + /* 10.2.1.5.2 step 6 */ + ret = drbg_ctr_update(drbg, NULL, 3); + if (ret) + len = ret; + +out: + memset(drbg->scratchpad, 0, drbg_blocklen(drbg)); + return len; +} + +static struct drbg_state_ops drbg_ctr_ops = { + .update = drbg_ctr_update, + .generate = drbg_ctr_generate, + .crypto_init = drbg_init_sym_kernel, + .crypto_fini = drbg_fini_sym_kernel, +}; +#endif /* CONFIG_CRYPTO_DRBG_CTR */ + +/****************************************************************** + * HMAC DRBG callback functions + ******************************************************************/ + +#if defined(CONFIG_CRYPTO_DRBG_HASH) || defined(CONFIG_CRYPTO_DRBG_HMAC) +static int drbg_kcapi_hash(struct drbg_state *drbg, const unsigned char *key, + unsigned char *outval, const struct list_head *in); +static int drbg_init_hash_kernel(struct drbg_state *drbg); +static int drbg_fini_hash_kernel(struct drbg_state *drbg); +#endif /* (CONFIG_CRYPTO_DRBG_HASH || CONFIG_CRYPTO_DRBG_HMAC) */ + +#ifdef CONFIG_CRYPTO_DRBG_HMAC +#define CRYPTO_DRBG_HMAC_STRING "HMAC " +/* update function of HMAC DRBG as defined in 10.1.2.2 */ +static int drbg_hmac_update(struct drbg_state *drbg, struct list_head *seed, + int reseed) +{ + int ret = -EFAULT; + int i = 0; + struct drbg_string seed1, seed2, vdata; + LIST_HEAD(seedlist); + LIST_HEAD(vdatalist); + + if (!reseed) { + /* 10.1.2.3 step 2 */ + memset(drbg->C, 0, drbg_statelen(drbg)); + memset(drbg->V, 1, drbg_statelen(drbg)); + } + + drbg_string_fill(&seed1, drbg->V, drbg_statelen(drbg)); + list_add_tail(&seed1.list, &seedlist); + /* buffer of seed2 will be filled in for loop below with one byte */ + drbg_string_fill(&seed2, NULL, 1); + list_add_tail(&seed2.list, &seedlist); + /* input data of seed is allowed to be NULL at this point */ + if (seed) + list_splice_tail(seed, &seedlist); + + drbg_string_fill(&vdata, drbg->V, drbg_statelen(drbg)); + list_add_tail(&vdata.list, &vdatalist); + for (i = 2; 0 < i; i--) { + /* first round uses 0x0, second 0x1 */ + unsigned char prefix = DRBG_PREFIX0; + if (1 == i) + prefix = DRBG_PREFIX1; + /* 10.1.2.2 step 1 and 4 -- concatenation and HMAC for key */ + seed2.buf = &prefix; + ret = drbg_kcapi_hash(drbg, drbg->C, drbg->C, &seedlist); + if (ret) + return ret; + + /* 10.1.2.2 step 2 and 5 -- HMAC for V */ + ret = drbg_kcapi_hash(drbg, drbg->C, drbg->V, &vdatalist); + if (ret) + return ret; + + /* 10.1.2.2 step 3 */ + if (!seed) + return ret; + } + + return 0; +} + +/* generate function of HMAC DRBG as defined in 10.1.2.5 */ +static int drbg_hmac_generate(struct drbg_state *drbg, + unsigned char *buf, + unsigned int buflen, + struct list_head *addtl) +{ + int len = 0; + int ret = 0; + struct drbg_string data; + LIST_HEAD(datalist); + + /* 10.1.2.5 step 2 */ + if (addtl && !list_empty(addtl)) { + ret = drbg_hmac_update(drbg, addtl, 1); + if (ret) + return ret; + } + + drbg_string_fill(&data, drbg->V, drbg_statelen(drbg)); + list_add_tail(&data.list, &datalist); + while (len < buflen) { + unsigned int outlen = 0; + /* 10.1.2.5 step 4.1 */ + ret = drbg_kcapi_hash(drbg, drbg->C, drbg->V, &datalist); + if (ret) + return ret; + outlen = (drbg_blocklen(drbg) < (buflen - len)) ? + drbg_blocklen(drbg) : (buflen - len); + if (!drbg_fips_continuous_test(drbg, drbg->V)) + continue; + + /* 10.1.2.5 step 4.2 */ + memcpy(buf + len, drbg->V, outlen); + len += outlen; + } + + /* 10.1.2.5 step 6 */ + if (addtl && !list_empty(addtl)) + ret = drbg_hmac_update(drbg, addtl, 1); + else + ret = drbg_hmac_update(drbg, NULL, 1); + if (ret) + return ret; + + return len; +} + +static struct drbg_state_ops drbg_hmac_ops = { + .update = drbg_hmac_update, + .generate = drbg_hmac_generate, + .crypto_init = drbg_init_hash_kernel, + .crypto_fini = drbg_fini_hash_kernel, + +}; +#endif /* CONFIG_CRYPTO_DRBG_HMAC */ + +/****************************************************************** + * Hash DRBG callback functions + ******************************************************************/ + +#ifdef CONFIG_CRYPTO_DRBG_HASH +#define CRYPTO_DRBG_HASH_STRING "HASH " +/* + * scratchpad usage: as drbg_hash_update and drbg_hash_df are used + * interlinked, the scratchpad is used as follows: + * drbg_hash_update + * start: drbg->scratchpad + * length: drbg_statelen(drbg) + * drbg_hash_df: + * start: drbg->scratchpad + drbg_statelen(drbg) + * length: drbg_blocklen(drbg) + * + * drbg_hash_process_addtl uses the scratchpad, but fully completes + * before either of the functions mentioned before are invoked. Therefore, + * drbg_hash_process_addtl does not need to be specifically considered. + */ + +/* Derivation Function for Hash DRBG as defined in 10.4.1 */ +static int drbg_hash_df(struct drbg_state *drbg, + unsigned char *outval, size_t outlen, + struct list_head *entropylist) +{ + int ret = 0; + size_t len = 0; + unsigned char input[5]; + unsigned char *tmp = drbg->scratchpad + drbg_statelen(drbg); + struct drbg_string data; + + memset(tmp, 0, drbg_blocklen(drbg)); + + /* 10.4.1 step 3 */ + input[0] = 1; + drbg_int2byte(&input[1], (outlen * 8), 4); + + /* 10.4.1 step 4.1 -- concatenation of data for input into hash */ + drbg_string_fill(&data, input, 5); + list_add(&data.list, entropylist); + + /* 10.4.1 step 4 */ + while (len < outlen) { + short blocklen = 0; + /* 10.4.1 step 4.1 */ + ret = drbg_kcapi_hash(drbg, NULL, tmp, entropylist); + if (ret) + goto out; + /* 10.4.1 step 4.2 */ + input[0]++; + blocklen = (drbg_blocklen(drbg) < (outlen - len)) ? + drbg_blocklen(drbg) : (outlen - len); + memcpy(outval + len, tmp, blocklen); + len += blocklen; + } + +out: + memset(tmp, 0, drbg_blocklen(drbg)); + return ret; +} + +/* update function for Hash DRBG as defined in 10.1.1.2 / 10.1.1.3 */ +static int drbg_hash_update(struct drbg_state *drbg, struct list_head *seed, + int reseed) +{ + int ret = 0; + struct drbg_string data1, data2; + LIST_HEAD(datalist); + LIST_HEAD(datalist2); + unsigned char *V = drbg->scratchpad; + unsigned char prefix = DRBG_PREFIX1; + + memset(drbg->scratchpad, 0, drbg_statelen(drbg)); + if (!seed) + return -EINVAL; + + if (reseed) { + /* 10.1.1.3 step 1 */ + memcpy(V, drbg->V, drbg_statelen(drbg)); + drbg_string_fill(&data1, &prefix, 1); + list_add_tail(&data1.list, &datalist); + drbg_string_fill(&data2, V, drbg_statelen(drbg)); + list_add_tail(&data2.list, &datalist); + } + list_splice_tail(seed, &datalist); + + /* 10.1.1.2 / 10.1.1.3 step 2 and 3 */ + ret = drbg_hash_df(drbg, drbg->V, drbg_statelen(drbg), &datalist); + if (ret) + goto out; + + /* 10.1.1.2 / 10.1.1.3 step 4 */ + prefix = DRBG_PREFIX0; + drbg_string_fill(&data1, &prefix, 1); + list_add_tail(&data1.list, &datalist2); + drbg_string_fill(&data2, drbg->V, drbg_statelen(drbg)); + list_add_tail(&data2.list, &datalist2); + /* 10.1.1.2 / 10.1.1.3 step 4 */ + ret = drbg_hash_df(drbg, drbg->C, drbg_statelen(drbg), &datalist2); + +out: + memset(drbg->scratchpad, 0, drbg_statelen(drbg)); + return ret; +} + +/* processing of additional information string for Hash DRBG */ +static int drbg_hash_process_addtl(struct drbg_state *drbg, + struct list_head *addtl) +{ + int ret = 0; + struct drbg_string data1, data2; + LIST_HEAD(datalist); + unsigned char prefix = DRBG_PREFIX2; + + /* this is value w as per documentation */ + memset(drbg->scratchpad, 0, drbg_blocklen(drbg)); + + /* 10.1.1.4 step 2 */ + if (!addtl || list_empty(addtl)) + return 0; + + /* 10.1.1.4 step 2a */ + drbg_string_fill(&data1, &prefix, 1); + drbg_string_fill(&data2, drbg->V, drbg_statelen(drbg)); + list_add_tail(&data1.list, &datalist); + list_add_tail(&data2.list, &datalist); + list_splice_tail(addtl, &datalist); + ret = drbg_kcapi_hash(drbg, NULL, drbg->scratchpad, &datalist); + if (ret) + goto out; + + /* 10.1.1.4 step 2b */ + drbg_add_buf(drbg->V, drbg_statelen(drbg), + drbg->scratchpad, drbg_blocklen(drbg)); + +out: + memset(drbg->scratchpad, 0, drbg_blocklen(drbg)); + return ret; +} + +/* Hashgen defined in 10.1.1.4 */ +static int drbg_hash_hashgen(struct drbg_state *drbg, + unsigned char *buf, + unsigned int buflen) +{ + int len = 0; + int ret = 0; + unsigned char *src = drbg->scratchpad; + unsigned char *dst = drbg->scratchpad + drbg_statelen(drbg); + struct drbg_string data; + LIST_HEAD(datalist); + unsigned char prefix = DRBG_PREFIX1; + + memset(src, 0, drbg_statelen(drbg)); + memset(dst, 0, drbg_blocklen(drbg)); + + /* 10.1.1.4 step hashgen 2 */ + memcpy(src, drbg->V, drbg_statelen(drbg)); + + drbg_string_fill(&data, src, drbg_statelen(drbg)); + list_add_tail(&data.list, &datalist); + while (len < buflen) { + unsigned int outlen = 0; + /* 10.1.1.4 step hashgen 4.1 */ + ret = drbg_kcapi_hash(drbg, NULL, dst, &datalist); + if (ret) { + len = ret; + goto out; + } + outlen = (drbg_blocklen(drbg) < (buflen - len)) ? + drbg_blocklen(drbg) : (buflen - len); + if (!drbg_fips_continuous_test(drbg, dst)) { + drbg_add_buf(src, drbg_statelen(drbg), &prefix, 1); + continue; + } + /* 10.1.1.4 step hashgen 4.2 */ + memcpy(buf + len, dst, outlen); + len += outlen; + /* 10.1.1.4 hashgen step 4.3 */ + if (len < buflen) + drbg_add_buf(src, drbg_statelen(drbg), &prefix, 1); + } + +out: + memset(drbg->scratchpad, 0, + (drbg_statelen(drbg) + drbg_blocklen(drbg))); + return len; +} + +/* generate function for Hash DRBG as defined in 10.1.1.4 */ +static int drbg_hash_generate(struct drbg_state *drbg, + unsigned char *buf, unsigned int buflen, + struct list_head *addtl) +{ + int len = 0; + int ret = 0; + unsigned char req[8]; + unsigned char prefix = DRBG_PREFIX3; + struct drbg_string data1, data2; + LIST_HEAD(datalist); + + /* 10.1.1.4 step 2 */ + ret = drbg_hash_process_addtl(drbg, addtl); + if (ret) + return ret; + /* 10.1.1.4 step 3 */ + len = drbg_hash_hashgen(drbg, buf, buflen); + + /* this is the value H as documented in 10.1.1.4 */ + memset(drbg->scratchpad, 0, drbg_blocklen(drbg)); + /* 10.1.1.4 step 4 */ + drbg_string_fill(&data1, &prefix, 1); + list_add_tail(&data1.list, &datalist); + drbg_string_fill(&data2, drbg->V, drbg_statelen(drbg)); + list_add_tail(&data2.list, &datalist); + ret = drbg_kcapi_hash(drbg, NULL, drbg->scratchpad, &datalist); + if (ret) { + len = ret; + goto out; + } + + /* 10.1.1.4 step 5 */ + drbg_add_buf(drbg->V, drbg_statelen(drbg), + drbg->scratchpad, drbg_blocklen(drbg)); + drbg_add_buf(drbg->V, drbg_statelen(drbg), + drbg->C, drbg_statelen(drbg)); + drbg_int2byte(req, drbg->reseed_ctr, sizeof(req)); + drbg_add_buf(drbg->V, drbg_statelen(drbg), req, 8); + +out: + memset(drbg->scratchpad, 0, drbg_blocklen(drbg)); + return len; +} + +/* + * scratchpad usage: as update and generate are used isolated, both + * can use the scratchpad + */ +static struct drbg_state_ops drbg_hash_ops = { + .update = drbg_hash_update, + .generate = drbg_hash_generate, + .crypto_init = drbg_init_hash_kernel, + .crypto_fini = drbg_fini_hash_kernel, +}; +#endif /* CONFIG_CRYPTO_DRBG_HASH */ + +/****************************************************************** + * Functions common for DRBG implementations + ******************************************************************/ + +/* + * Seeding or reseeding of the DRBG + * + * @drbg: DRBG state struct + * @pers: personalization / additional information buffer + * @reseed: 0 for initial seed process, 1 for reseeding + * + * return: + * 0 on success + * error value otherwise + */ +static int drbg_seed(struct drbg_state *drbg, struct drbg_string *pers, + bool reseed) +{ + int ret = 0; + unsigned char *entropy = NULL; + size_t entropylen = 0; + struct drbg_string data1; + LIST_HEAD(seedlist); + + /* 9.1 / 9.2 / 9.3.1 step 3 */ + if (pers && pers->len > (drbg_max_addtl(drbg))) { + pr_devel("DRBG: personalization string too long %zu\n", + pers->len); + return -EINVAL; + } + + if (drbg->test_data && drbg->test_data->testentropy) { + drbg_string_fill(&data1, drbg->test_data->testentropy->buf, + drbg->test_data->testentropy->len); + pr_devel("DRBG: using test entropy\n"); + } else { + /* + * Gather entropy equal to the security strength of the DRBG. + * With a derivation function, a nonce is required in addition + * to the entropy. A nonce must be at least 1/2 of the security + * strength of the DRBG in size. Thus, entropy * nonce is 3/2 + * of the strength. The consideration of a nonce is only + * applicable during initial seeding. + */ + entropylen = drbg_sec_strength(drbg->core->flags); + if (!entropylen) + return -EFAULT; + if (!reseed) + entropylen = ((entropylen + 1) / 2) * 3; + pr_devel("DRBG: (re)seeding with %zu bytes of entropy\n", + entropylen); + entropy = kzalloc(entropylen, GFP_KERNEL); + if (!entropy) + return -ENOMEM; + get_random_bytes(entropy, entropylen); + drbg_string_fill(&data1, entropy, entropylen); + } + list_add_tail(&data1.list, &seedlist); + + /* + * concatenation of entropy with personalization str / addtl input) + * the variable pers is directly handed in by the caller, so check its + * contents whether it is appropriate + */ + if (pers && pers->buf && 0 < pers->len) { + list_add_tail(&pers->list, &seedlist); + pr_devel("DRBG: using personalization string\n"); + } + + ret = drbg->d_ops->update(drbg, &seedlist, reseed); + if (ret) + goto out; + + drbg->seeded = true; + /* 10.1.1.2 / 10.1.1.3 step 5 */ + drbg->reseed_ctr = 1; + +out: + if (entropy) + kzfree(entropy); + return ret; +} + +/* Free all substructures in a DRBG state without the DRBG state structure */ +static inline void drbg_dealloc_state(struct drbg_state *drbg) +{ + if (!drbg) + return; + if (drbg->V) + kzfree(drbg->V); + drbg->V = NULL; + if (drbg->C) + kzfree(drbg->C); + drbg->C = NULL; + if (drbg->scratchpad) + kzfree(drbg->scratchpad); + drbg->scratchpad = NULL; + drbg->reseed_ctr = 0; +#ifdef CONFIG_CRYPTO_FIPS + if (drbg->prev) + kzfree(drbg->prev); + drbg->prev = NULL; + drbg->fips_primed = false; +#endif +} + +/* + * Allocate all sub-structures for a DRBG state. + * The DRBG state structure must already be allocated. + */ +static inline int drbg_alloc_state(struct drbg_state *drbg) +{ + int ret = -ENOMEM; + unsigned int sb_size = 0; + + if (!drbg) + return -EINVAL; + + drbg->V = kzalloc(drbg_statelen(drbg), GFP_KERNEL); + if (!drbg->V) + goto err; + drbg->C = kzalloc(drbg_statelen(drbg), GFP_KERNEL); + if (!drbg->C) + goto err; +#ifdef CONFIG_CRYPTO_FIPS + drbg->prev = kzalloc(drbg_blocklen(drbg), GFP_KERNEL); + if (!drbg->prev) + goto err; + drbg->fips_primed = false; +#endif + /* scratchpad is only generated for CTR and Hash */ + if (drbg->core->flags & DRBG_HMAC) + sb_size = 0; + else if (drbg->core->flags & DRBG_CTR) + sb_size = drbg_statelen(drbg) + drbg_blocklen(drbg) + /* temp */ + drbg_statelen(drbg) + /* df_data */ + drbg_blocklen(drbg) + /* pad */ + drbg_blocklen(drbg) + /* iv */ + drbg_statelen(drbg) + drbg_blocklen(drbg); /* temp */ + else + sb_size = drbg_statelen(drbg) + drbg_blocklen(drbg); + + if (0 < sb_size) { + drbg->scratchpad = kzalloc(sb_size, GFP_KERNEL); + if (!drbg->scratchpad) + goto err; + } + spin_lock_init(&drbg->drbg_lock); + return 0; + +err: + drbg_dealloc_state(drbg); + return ret; +} + +/* + * Strategy to avoid holding long term locks: generate a shadow copy of DRBG + * and perform all operations on this shadow copy. After finishing, restore + * the updated state of the shadow copy into original drbg state. This way, + * only the read and write operations of the original drbg state must be + * locked + */ +static inline void drbg_copy_drbg(struct drbg_state *src, + struct drbg_state *dst) +{ + if (!src || !dst) + return; + memcpy(dst->V, src->V, drbg_statelen(src)); + memcpy(dst->C, src->C, drbg_statelen(src)); + dst->reseed_ctr = src->reseed_ctr; + dst->seeded = src->seeded; + dst->pr = src->pr; +#ifdef CONFIG_CRYPTO_FIPS + dst->fips_primed = src->fips_primed; + memcpy(dst->prev, src->prev, drbg_blocklen(src)); +#endif + /* + * Not copied: + * scratchpad is initialized drbg_alloc_state; + * priv_data is initialized with call to crypto_init; + * d_ops and core are set outside, as these parameters are const; + * test_data is set outside to prevent it being copied back. + */ +} + +static int drbg_make_shadow(struct drbg_state *drbg, struct drbg_state **shadow) +{ + int ret = -ENOMEM; + struct drbg_state *tmp = NULL; + + if (!drbg || !drbg->core || !drbg->V || !drbg->C) { + pr_devel("DRBG: attempt to generate shadow copy for " + "uninitialized DRBG state rejected\n"); + return -EINVAL; + } + /* HMAC does not have a scratchpad */ + if (!(drbg->core->flags & DRBG_HMAC) && NULL == drbg->scratchpad) + return -EINVAL; + + tmp = kzalloc(sizeof(struct drbg_state), GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + /* read-only data as they are defined as const, no lock needed */ + tmp->core = drbg->core; + tmp->d_ops = drbg->d_ops; + + ret = drbg_alloc_state(tmp); + if (ret) + goto err; + + spin_lock_bh(&drbg->drbg_lock); + drbg_copy_drbg(drbg, tmp); + /* only make a link to the test buffer, as we only read that data */ + tmp->test_data = drbg->test_data; + spin_unlock_bh(&drbg->drbg_lock); + *shadow = tmp; + return 0; + +err: + if (tmp) + kzfree(tmp); + return ret; +} + +static void drbg_restore_shadow(struct drbg_state *drbg, + struct drbg_state **shadow) +{ + struct drbg_state *tmp = *shadow; + + spin_lock_bh(&drbg->drbg_lock); + drbg_copy_drbg(tmp, drbg); + spin_unlock_bh(&drbg->drbg_lock); + drbg_dealloc_state(tmp); + kzfree(tmp); + *shadow = NULL; +} + +/************************************************************************* + * DRBG interface functions + *************************************************************************/ + +/* + * DRBG generate function as required by SP800-90A - this function + * generates random numbers + * + * @drbg DRBG state handle + * @buf Buffer where to store the random numbers -- the buffer must already + * be pre-allocated by caller + * @buflen Length of output buffer - this value defines the number of random + * bytes pulled from DRBG + * @addtl Additional input that is mixed into state, may be NULL -- note + * the entropy is pulled by the DRBG internally unconditionally + * as defined in SP800-90A. The additional input is mixed into + * the state in addition to the pulled entropy. + * + * return: generated number of bytes + */ +static int drbg_generate(struct drbg_state *drbg, + unsigned char *buf, unsigned int buflen, + struct drbg_string *addtl) +{ + int len = 0; + struct drbg_state *shadow = NULL; + LIST_HEAD(addtllist); + struct drbg_string timestamp; + union { + cycles_t cycles; + unsigned char char_cycles[sizeof(cycles_t)]; + } now; + + if (0 == buflen || !buf) { + pr_devel("DRBG: no output buffer provided\n"); + return -EINVAL; + } + if (addtl && NULL == addtl->buf && 0 < addtl->len) { + pr_devel("DRBG: wrong format of additional information\n"); + return -EINVAL; + } + + len = drbg_make_shadow(drbg, &shadow); + if (len) { + pr_devel("DRBG: shadow copy cannot be generated\n"); + return len; + } + + /* 9.3.1 step 2 */ + len = -EINVAL; + if (buflen > (drbg_max_request_bytes(shadow))) { + pr_devel("DRBG: requested random numbers too large %u\n", + buflen); + goto err; + } + + /* 9.3.1 step 3 is implicit with the chosen DRBG */ + + /* 9.3.1 step 4 */ + if (addtl && addtl->len > (drbg_max_addtl(shadow))) { + pr_devel("DRBG: additional information string too long %zu\n", + addtl->len); + goto err; + } + /* 9.3.1 step 5 is implicit with the chosen DRBG */ + + /* + * 9.3.1 step 6 and 9 supplemented by 9.3.2 step c is implemented + * here. The spec is a bit convoluted here, we make it simpler. + */ + if ((drbg_max_requests(shadow)) < shadow->reseed_ctr) + shadow->seeded = false; + + /* allocate cipher handle */ + if (shadow->d_ops->crypto_init) { + len = shadow->d_ops->crypto_init(shadow); + if (len) + goto err; + } + + if (shadow->pr || !shadow->seeded) { + pr_devel("DRBG: reseeding before generation (prediction " + "resistance: %s, state %s)\n", + drbg->pr ? "true" : "false", + drbg->seeded ? "seeded" : "unseeded"); + /* 9.3.1 steps 7.1 through 7.3 */ + len = drbg_seed(shadow, addtl, true); + if (len) + goto err; + /* 9.3.1 step 7.4 */ + addtl = NULL; + } + + /* + * Mix the time stamp into the DRBG state if the DRBG is not in + * test mode. If there are two callers invoking the DRBG at the same + * time, i.e. before the first caller merges its shadow state back, + * both callers would obtain the same random number stream without + * changing the state here. + */ + if (!drbg->test_data) { + now.cycles = get_cycles(); + drbg_string_fill(×tamp, now.char_cycles, sizeof(cycles_t)); + list_add_tail(×tamp.list, &addtllist); + } + if (addtl && 0 < addtl->len) + list_add_tail(&addtl->list, &addtllist); + /* 9.3.1 step 8 and 10 */ + len = shadow->d_ops->generate(shadow, buf, buflen, &addtllist); + + /* 10.1.1.4 step 6, 10.1.2.5 step 7, 10.2.1.5.2 step 7 */ + shadow->reseed_ctr++; + if (0 >= len) + goto err; + + /* + * Section 11.3.3 requires to re-perform self tests after some + * generated random numbers. The chosen value after which self + * test is performed is arbitrary, but it should be reasonable. + * However, we do not perform the self tests because of the following + * reasons: it is mathematically impossible that the initial self tests + * were successfully and the following are not. If the initial would + * pass and the following would not, the kernel integrity is violated. + * In this case, the entire kernel operation is questionable and it + * is unlikely that the integrity violation only affects the + * correct operation of the DRBG. + * + * Albeit the following code is commented out, it is provided in + * case somebody has a need to implement the test of 11.3.3. + */ +#if 0 + if (shadow->reseed_ctr && !(shadow->reseed_ctr % 4096)) { + int err = 0; + pr_devel("DRBG: start to perform self test\n"); + if (drbg->core->flags & DRBG_HMAC) + err = alg_test("drbg_pr_hmac_sha256", + "drbg_pr_hmac_sha256", 0, 0); + else if (drbg->core->flags & DRBG_CTR) + err = alg_test("drbg_pr_ctr_aes128", + "drbg_pr_ctr_aes128", 0, 0); + else + err = alg_test("drbg_pr_sha256", + "drbg_pr_sha256", 0, 0); + if (err) { + pr_err("DRBG: periodical self test failed\n"); + /* + * uninstantiate implies that from now on, only errors + * are returned when reusing this DRBG cipher handle + */ + drbg_uninstantiate(drbg); + drbg_dealloc_state(shadow); + kzfree(shadow); + return 0; + } else { + pr_devel("DRBG: self test successful\n"); + } + } +#endif + +err: + if (shadow->d_ops->crypto_fini) + shadow->d_ops->crypto_fini(shadow); + drbg_restore_shadow(drbg, &shadow); + return len; +} + +/* + * Wrapper around drbg_generate which can pull arbitrary long strings + * from the DRBG without hitting the maximum request limitation. + * + * Parameters: see drbg_generate + * Return codes: see drbg_generate -- if one drbg_generate request fails, + * the entire drbg_generate_long request fails + */ +static int drbg_generate_long(struct drbg_state *drbg, + unsigned char *buf, unsigned int buflen, + struct drbg_string *addtl) +{ + int len = 0; + unsigned int slice = 0; + do { + int tmplen = 0; + unsigned int chunk = 0; + slice = ((buflen - len) / drbg_max_request_bytes(drbg)); + chunk = slice ? drbg_max_request_bytes(drbg) : (buflen - len); + tmplen = drbg_generate(drbg, buf + len, chunk, addtl); + if (0 >= tmplen) + return tmplen; + len += tmplen; + } while (slice > 0 && (len < buflen)); + return len; +} + +/* + * DRBG instantiation function as required by SP800-90A - this function + * sets up the DRBG handle, performs the initial seeding and all sanity + * checks required by SP800-90A + * + * @drbg memory of state -- if NULL, new memory is allocated + * @pers Personalization string that is mixed into state, may be NULL -- note + * the entropy is pulled by the DRBG internally unconditionally + * as defined in SP800-90A. The additional input is mixed into + * the state in addition to the pulled entropy. + * @coreref reference to core + * @pr prediction resistance enabled + * + * return + * 0 on success + * error value otherwise + */ +static int drbg_instantiate(struct drbg_state *drbg, struct drbg_string *pers, + int coreref, bool pr) +{ + int ret = -ENOMEM; + + pr_devel("DRBG: Initializing DRBG core %d with prediction resistance " + "%s\n", coreref, pr ? "enabled" : "disabled"); + drbg->core = &drbg_cores[coreref]; + drbg->pr = pr; + drbg->seeded = false; + switch (drbg->core->flags & DRBG_TYPE_MASK) { +#ifdef CONFIG_CRYPTO_DRBG_HMAC + case DRBG_HMAC: + drbg->d_ops = &drbg_hmac_ops; + break; +#endif /* CONFIG_CRYPTO_DRBG_HMAC */ +#ifdef CONFIG_CRYPTO_DRBG_HASH + case DRBG_HASH: + drbg->d_ops = &drbg_hash_ops; + break; +#endif /* CONFIG_CRYPTO_DRBG_HASH */ +#ifdef CONFIG_CRYPTO_DRBG_CTR + case DRBG_CTR: + drbg->d_ops = &drbg_ctr_ops; + break; +#endif /* CONFIG_CRYPTO_DRBG_CTR */ + default: + return -EOPNOTSUPP; + } + + /* 9.1 step 1 is implicit with the selected DRBG type */ + + /* + * 9.1 step 2 is implicit as caller can select prediction resistance + * and the flag is copied into drbg->flags -- + * all DRBG types support prediction resistance + */ + + /* 9.1 step 4 is implicit in drbg_sec_strength */ + + ret = drbg_alloc_state(drbg); + if (ret) + return ret; + + ret = -EFAULT; + if (drbg->d_ops->crypto_init && drbg->d_ops->crypto_init(drbg)) + goto err; + ret = drbg_seed(drbg, pers, false); + if (drbg->d_ops->crypto_fini) + drbg->d_ops->crypto_fini(drbg); + if (ret) + goto err; + + return 0; + +err: + drbg_dealloc_state(drbg); + return ret; +} + +/* + * DRBG uninstantiate function as required by SP800-90A - this function + * frees all buffers and the DRBG handle + * + * @drbg DRBG state handle + * + * return + * 0 on success + */ +static int drbg_uninstantiate(struct drbg_state *drbg) +{ + spin_lock_bh(&drbg->drbg_lock); + drbg_dealloc_state(drbg); + /* no scrubbing of test_data -- this shall survive an uninstantiate */ + spin_unlock_bh(&drbg->drbg_lock); + return 0; +} + +/* + * Helper function for setting the test data in the DRBG + * + * @drbg DRBG state handle + * @test_data test data to sets + */ +static inline void drbg_set_testdata(struct drbg_state *drbg, + struct drbg_test_data *test_data) +{ + if (!test_data || !test_data->testentropy) + return; + spin_lock_bh(&drbg->drbg_lock); + drbg->test_data = test_data; + spin_unlock_bh(&drbg->drbg_lock); +} + +/*************************************************************** + * Kernel crypto API cipher invocations requested by DRBG + ***************************************************************/ + +#if defined(CONFIG_CRYPTO_DRBG_HASH) || defined(CONFIG_CRYPTO_DRBG_HMAC) +struct sdesc { + struct shash_desc shash; + char ctx[]; +}; + +static int drbg_init_hash_kernel(struct drbg_state *drbg) +{ + struct sdesc *sdesc; + struct crypto_shash *tfm; + + tfm = crypto_alloc_shash(drbg->core->backend_cra_name, 0, 0); + if (IS_ERR(tfm)) { + pr_info("DRBG: could not allocate digest TFM handle\n"); + return PTR_ERR(tfm); + } + BUG_ON(drbg_blocklen(drbg) != crypto_shash_digestsize(tfm)); + sdesc = kzalloc(sizeof(struct shash_desc) + crypto_shash_descsize(tfm), + GFP_KERNEL); + if (!sdesc) { + crypto_free_shash(tfm); + return -ENOMEM; + } + + sdesc->shash.tfm = tfm; + sdesc->shash.flags = 0; + drbg->priv_data = sdesc; + return 0; +} + +static int drbg_fini_hash_kernel(struct drbg_state *drbg) +{ + struct sdesc *sdesc = (struct sdesc *)drbg->priv_data; + if (sdesc) { + crypto_free_shash(sdesc->shash.tfm); + kzfree(sdesc); + } + drbg->priv_data = NULL; + return 0; +} + +static int drbg_kcapi_hash(struct drbg_state *drbg, const unsigned char *key, + unsigned char *outval, const struct list_head *in) +{ + struct sdesc *sdesc = (struct sdesc *)drbg->priv_data; + struct drbg_string *input = NULL; + + if (key) + crypto_shash_setkey(sdesc->shash.tfm, key, drbg_statelen(drbg)); + crypto_shash_init(&sdesc->shash); + list_for_each_entry(input, in, list) + crypto_shash_update(&sdesc->shash, input->buf, input->len); + return crypto_shash_final(&sdesc->shash, outval); +} +#endif /* (CONFIG_CRYPTO_DRBG_HASH || CONFIG_CRYPTO_DRBG_HMAC) */ + +#ifdef CONFIG_CRYPTO_DRBG_CTR +static int drbg_init_sym_kernel(struct drbg_state *drbg) +{ + int ret = 0; + struct crypto_blkcipher *tfm; + + tfm = crypto_alloc_blkcipher(drbg->core->backend_cra_name, 0, 0); + if (IS_ERR(tfm)) { + pr_info("DRBG: could not allocate cipher TFM handle\n"); + return PTR_ERR(tfm); + } + BUG_ON(drbg_blocklen(drbg) != crypto_blkcipher_blocksize(tfm)); + drbg->priv_data = tfm; + return ret; +} + +static int drbg_fini_sym_kernel(struct drbg_state *drbg) +{ + struct crypto_blkcipher *tfm = + (struct crypto_blkcipher *)drbg->priv_data; + if (tfm) + crypto_free_blkcipher(tfm); + drbg->priv_data = NULL; + return 0; +} + +static int drbg_kcapi_sym(struct drbg_state *drbg, const unsigned char *key, + unsigned char *outval, const struct drbg_string *in) +{ + int ret = 0; + struct scatterlist sg_in, sg_out; + struct blkcipher_desc desc; + struct crypto_blkcipher *tfm = + (struct crypto_blkcipher *)drbg->priv_data; + + desc.tfm = tfm; + desc.flags = 0; + crypto_blkcipher_setkey(tfm, key, (drbg_keylen(drbg))); + /* there is only component in *in */ + sg_init_one(&sg_in, in->buf, in->len); + sg_init_one(&sg_out, outval, drbg_blocklen(drbg)); + ret = crypto_blkcipher_encrypt(&desc, &sg_out, &sg_in, in->len); + + return ret; +} +#endif /* CONFIG_CRYPTO_DRBG_CTR */ + +/*************************************************************** + * Kernel crypto API interface to register DRBG + ***************************************************************/ + +/* + * Look up the DRBG flags by given kernel crypto API cra_name + * The code uses the drbg_cores definition to do this + * + * @cra_name kernel crypto API cra_name + * @coreref reference to integer which is filled with the pointer to + * the applicable core + * @pr reference for setting prediction resistance + * + * return: flags + */ +static inline void drbg_convert_tfm_core(const char *cra_driver_name, + int *coreref, bool *pr) +{ + int i = 0; + size_t start = 0; + int len = 0; + + *pr = true; + /* disassemble the names */ + if (!memcmp(cra_driver_name, "drbg_nopr_", 10)) { + start = 10; + *pr = false; + } else if (!memcmp(cra_driver_name, "drbg_pr_", 8)) { + start = 8; + } else { + return; + } + + /* remove the first part */ + len = strlen(cra_driver_name) - start; + for (i = 0; ARRAY_SIZE(drbg_cores) > i; i++) { + if (!memcmp(cra_driver_name + start, drbg_cores[i].cra_name, + len)) { + *coreref = i; + return; + } + } +} + +static int drbg_kcapi_init(struct crypto_tfm *tfm) +{ + struct drbg_state *drbg = crypto_tfm_ctx(tfm); + bool pr = false; + int coreref = 0; + + drbg_convert_tfm_core(crypto_tfm_alg_driver_name(tfm), &coreref, &pr); + /* + * when personalization string is needed, the caller must call reset + * and provide the personalization string as seed information + */ + return drbg_instantiate(drbg, NULL, coreref, pr); +} + +static void drbg_kcapi_cleanup(struct crypto_tfm *tfm) +{ + drbg_uninstantiate(crypto_tfm_ctx(tfm)); +} + +/* + * Generate random numbers invoked by the kernel crypto API: + * The API of the kernel crypto API is extended as follows: + * + * If dlen is larger than zero, rdata is interpreted as the output buffer + * where random data is to be stored. + * + * If dlen is zero, rdata is interpreted as a pointer to a struct drbg_gen + * which holds the additional information string that is used for the + * DRBG generation process. The output buffer that is to be used to store + * data is also pointed to by struct drbg_gen. + */ +static int drbg_kcapi_random(struct crypto_rng *tfm, u8 *rdata, + unsigned int dlen) +{ + struct drbg_state *drbg = crypto_rng_ctx(tfm); + if (0 < dlen) { + return drbg_generate_long(drbg, rdata, dlen, NULL); + } else { + struct drbg_gen *data = (struct drbg_gen *)rdata; + struct drbg_string addtl; + /* catch NULL pointer */ + if (!data) + return 0; + drbg_set_testdata(drbg, data->test_data); + /* linked list variable is now local to allow modification */ + drbg_string_fill(&addtl, data->addtl->buf, data->addtl->len); + return drbg_generate_long(drbg, data->outbuf, data->outlen, + &addtl); + } +} + +/* + * Reset the DRBG invoked by the kernel crypto API + * The reset implies a full re-initialization of the DRBG. Similar to the + * generate function of drbg_kcapi_random, this function extends the + * kernel crypto API interface with struct drbg_gen + */ +static int drbg_kcapi_reset(struct crypto_rng *tfm, u8 *seed, unsigned int slen) +{ + struct drbg_state *drbg = crypto_rng_ctx(tfm); + struct crypto_tfm *tfm_base = crypto_rng_tfm(tfm); + bool pr = false; + struct drbg_string seed_string; + int coreref = 0; + + drbg_uninstantiate(drbg); + drbg_convert_tfm_core(crypto_tfm_alg_driver_name(tfm_base), &coreref, + &pr); + if (0 < slen) { + drbg_string_fill(&seed_string, seed, slen); + return drbg_instantiate(drbg, &seed_string, coreref, pr); + } else { + struct drbg_gen *data = (struct drbg_gen *)seed; + /* allow invocation of API call with NULL, 0 */ + if (!data) + return drbg_instantiate(drbg, NULL, coreref, pr); + drbg_set_testdata(drbg, data->test_data); + /* linked list variable is now local to allow modification */ + drbg_string_fill(&seed_string, data->addtl->buf, + data->addtl->len); + return drbg_instantiate(drbg, &seed_string, coreref, pr); + } +} + +/*************************************************************** + * Kernel module: code to load the module + ***************************************************************/ + +/* + * Tests as defined in 11.3.2 in addition to the cipher tests: testing + * of the error handling. + * + * Note: testing of failing seed source as defined in 11.3.2 is not applicable + * as seed source of get_random_bytes does not fail. + * + * Note 2: There is no sensible way of testing the reseed counter + * enforcement, so skip it. + */ +static inline int __init drbg_healthcheck_sanity(void) +{ +#ifdef CONFIG_CRYPTO_FIPS + int len = 0; +#define OUTBUFLEN 16 + unsigned char buf[OUTBUFLEN]; + struct drbg_state *drbg = NULL; + int ret = -EFAULT; + int rc = -EFAULT; + bool pr = false; + int coreref = 0; + struct drbg_string addtl; + size_t max_addtllen, max_request_bytes; + + /* only perform test in FIPS mode */ + if (!fips_enabled) + return 0; + +#ifdef CONFIG_CRYPTO_DRBG_CTR + drbg_convert_tfm_core("drbg_nopr_ctr_aes128", &coreref, &pr); +#elif defined CONFIG_CRYPTO_DRBG_HASH + drbg_convert_tfm_core("drbg_nopr_sha256", &coreref, &pr); +#else + drbg_convert_tfm_core("drbg_nopr_hmac_sha256", &coreref, &pr); +#endif + + drbg = kzalloc(sizeof(struct drbg_state), GFP_KERNEL); + if (!drbg) + return -ENOMEM; + + /* + * if the following tests fail, it is likely that there is a buffer + * overflow as buf is much smaller than the requested or provided + * string lengths -- in case the error handling does not succeed + * we may get an OOPS. And we want to get an OOPS as this is a + * grave bug. + */ + + /* get a valid instance of DRBG for following tests */ + ret = drbg_instantiate(drbg, NULL, coreref, pr); + if (ret) { + rc = ret; + goto outbuf; + } + max_addtllen = drbg_max_addtl(drbg); + max_request_bytes = drbg_max_request_bytes(drbg); + drbg_string_fill(&addtl, buf, max_addtllen + 1); + /* overflow addtllen with additonal info string */ + len = drbg_generate(drbg, buf, OUTBUFLEN, &addtl); + BUG_ON(0 < len); + /* overflow max_bits */ + len = drbg_generate(drbg, buf, (max_request_bytes + 1), NULL); + BUG_ON(0 < len); + drbg_uninstantiate(drbg); + + /* overflow max addtllen with personalization string */ + ret = drbg_instantiate(drbg, &addtl, coreref, pr); + BUG_ON(0 == ret); + /* test uninstantated DRBG */ + len = drbg_generate(drbg, buf, (max_request_bytes + 1), NULL); + BUG_ON(0 < len); + /* all tests passed */ + rc = 0; + + pr_devel("DRBG: Sanity tests for failure code paths successfully " + "completed\n"); + + drbg_uninstantiate(drbg); +outbuf: + kzfree(drbg); + return rc; +#else /* CONFIG_CRYPTO_FIPS */ + return 0; +#endif /* CONFIG_CRYPTO_FIPS */ +} + +static struct crypto_alg drbg_algs[22]; + +/* + * Fill the array drbg_algs used to register the different DRBGs + * with the kernel crypto API. To fill the array, the information + * from drbg_cores[] is used. + */ +static inline void __init drbg_fill_array(struct crypto_alg *alg, + const struct drbg_core *core, int pr) +{ + int pos = 0; + static int priority = 100; + + memset(alg, 0, sizeof(struct crypto_alg)); + memcpy(alg->cra_name, "stdrng", 6); + if (pr) { + memcpy(alg->cra_driver_name, "drbg_pr_", 8); + pos = 8; + } else { + memcpy(alg->cra_driver_name, "drbg_nopr_", 10); + pos = 10; + } + memcpy(alg->cra_driver_name + pos, core->cra_name, + strlen(core->cra_name)); + + alg->cra_priority = priority; + priority++; + /* + * If FIPS mode enabled, the selected DRBG shall have the + * highest cra_priority over other stdrng instances to ensure + * it is selected. + */ + if (fips_enabled) + alg->cra_priority += 200; + + alg->cra_flags = CRYPTO_ALG_TYPE_RNG; + alg->cra_ctxsize = sizeof(struct drbg_state); + alg->cra_type = &crypto_rng_type; + alg->cra_module = THIS_MODULE; + alg->cra_init = drbg_kcapi_init; + alg->cra_exit = drbg_kcapi_cleanup; + alg->cra_u.rng.rng_make_random = drbg_kcapi_random; + alg->cra_u.rng.rng_reset = drbg_kcapi_reset; + alg->cra_u.rng.seedsize = 0; +} + +static int __init drbg_init(void) +{ + unsigned int i = 0; /* pointer to drbg_algs */ + unsigned int j = 0; /* pointer to drbg_cores */ + int ret = -EFAULT; + + ret = drbg_healthcheck_sanity(); + if (ret) + return ret; + + if (ARRAY_SIZE(drbg_cores) * 2 > ARRAY_SIZE(drbg_algs)) { + pr_info("DRBG: Cannot register all DRBG types" + "(slots needed: %zu, slots available: %zu)\n", + ARRAY_SIZE(drbg_cores) * 2, ARRAY_SIZE(drbg_algs)); + return ret; + } + + /* + * each DRBG definition can be used with PR and without PR, thus + * we instantiate each DRBG in drbg_cores[] twice. + * + * As the order of placing them into the drbg_algs array matters + * (the later DRBGs receive a higher cra_priority) we register the + * prediction resistance DRBGs first as the should not be too + * interesting. + */ + for (j = 0; ARRAY_SIZE(drbg_cores) > j; j++, i++) + drbg_fill_array(&drbg_algs[i], &drbg_cores[j], 1); + for (j = 0; ARRAY_SIZE(drbg_cores) > j; j++, i++) + drbg_fill_array(&drbg_algs[i], &drbg_cores[j], 0); + return crypto_register_algs(drbg_algs, (ARRAY_SIZE(drbg_cores) * 2)); +} + +static void __exit drbg_exit(void) +{ + crypto_unregister_algs(drbg_algs, (ARRAY_SIZE(drbg_cores) * 2)); +} + +module_init(drbg_init); +module_exit(drbg_exit); +#ifndef CRYPTO_DRBG_HASH_STRING +#define CRYPTO_DRBG_HASH_STRING "" +#endif +#ifndef CRYPTO_DRBG_HMAC_STRING +#define CRYPTO_DRBG_HMAC_STRING "" +#endif +#ifndef CRYPTO_DRBG_CTR_STRING +#define CRYPTO_DRBG_CTR_STRING "" +#endif +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Stephan Mueller "); +MODULE_DESCRIPTION("NIST SP800-90A Deterministic Random Bit Generator (DRBG) " + "using following cores: " + CRYPTO_DRBG_HASH_STRING + CRYPTO_DRBG_HMAC_STRING + CRYPTO_DRBG_CTR_STRING); diff -uprN linux-2.6.32/crypto/eseqiv.c linux-2.6.32.ovz/crypto/eseqiv.c --- linux-2.6.32/crypto/eseqiv.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/eseqiv.c 2015-03-10 13:23:55.000000000 -0700 @@ -62,20 +62,6 @@ out: skcipher_givcrypt_complete(req, err); } -static void eseqiv_chain(struct scatterlist *head, struct scatterlist *sg, - int chain) -{ - if (chain) { - head->length += sg->length; - sg = scatterwalk_sg_next(sg); - } - - if (sg) - scatterwalk_sg_chain(head, 2, sg); - else - sg_mark_end(head); -} - static int eseqiv_givencrypt(struct skcipher_givcrypt_request *req) { struct crypto_ablkcipher *geniv = skcipher_givcrypt_reqtfm(req); @@ -124,13 +110,13 @@ static int eseqiv_givencrypt(struct skci sg_init_table(reqctx->src, 2); sg_set_buf(reqctx->src, giv, ivsize); - eseqiv_chain(reqctx->src, osrc, vsrc == giv + ivsize); + scatterwalk_crypto_chain(reqctx->src, osrc, vsrc == giv + ivsize, 2); dst = reqctx->src; if (osrc != odst) { sg_init_table(reqctx->dst, 2); sg_set_buf(reqctx->dst, giv, ivsize); - eseqiv_chain(reqctx->dst, odst, vdst == giv + ivsize); + scatterwalk_crypto_chain(reqctx->dst, odst, vdst == giv + ivsize, 2); dst = reqctx->dst; } diff -uprN linux-2.6.32/crypto/gcm.c linux-2.6.32.ovz/crypto/gcm.c --- linux-2.6.32/crypto/gcm.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/gcm.c 2015-03-10 13:23:55.000000000 -0700 @@ -37,6 +37,19 @@ struct crypto_rfc4106_ctx { u8 nonce[4]; }; +struct crypto_rfc4543_ctx { + struct crypto_aead *child; + u8 nonce[4]; +}; + +struct crypto_rfc4543_req_ctx { + u8 auth_tag[16]; + struct scatterlist cipher[1]; + struct scatterlist payload[2]; + struct scatterlist assoc[2]; + struct aead_request subreq; +}; + struct crypto_gcm_ghash_ctx { unsigned int cryptlen; struct scatterlist *src; @@ -1047,6 +1060,257 @@ static struct crypto_template crypto_rfc .module = THIS_MODULE, }; +static inline struct crypto_rfc4543_req_ctx *crypto_rfc4543_reqctx( + struct aead_request *req) +{ + unsigned long align = crypto_aead_alignmask(crypto_aead_reqtfm(req)); + + return (void *)PTR_ALIGN((u8 *)aead_request_ctx(req), align + 1); +} + +static int crypto_rfc4543_setkey(struct crypto_aead *parent, const u8 *key, + unsigned int keylen) +{ + struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(parent); + struct crypto_aead *child = ctx->child; + int err; + + if (keylen < 4) + return -EINVAL; + + keylen -= 4; + memcpy(ctx->nonce, key + keylen, 4); + + crypto_aead_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_aead_set_flags(child, crypto_aead_get_flags(parent) & + CRYPTO_TFM_REQ_MASK); + err = crypto_aead_setkey(child, key, keylen); + crypto_aead_set_flags(parent, crypto_aead_get_flags(child) & + CRYPTO_TFM_RES_MASK); + + return err; +} + +static int crypto_rfc4543_setauthsize(struct crypto_aead *parent, + unsigned int authsize) +{ + struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(parent); + + if (authsize != 16) + return -EINVAL; + + return crypto_aead_setauthsize(ctx->child, authsize); +} + +static struct aead_request *crypto_rfc4543_crypt(struct aead_request *req, + int enc) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct crypto_rfc4543_ctx *ctx = crypto_aead_ctx(aead); + struct crypto_rfc4543_req_ctx *rctx = crypto_rfc4543_reqctx(req); + struct aead_request *subreq = &rctx->subreq; + struct scatterlist *dst = req->dst; + struct scatterlist *cipher = rctx->cipher; + struct scatterlist *payload = rctx->payload; + struct scatterlist *assoc = rctx->assoc; + unsigned int authsize = crypto_aead_authsize(aead); + unsigned int assoclen = req->assoclen; + struct page *dstp; + u8 *vdst; + u8 *iv = PTR_ALIGN((u8 *)(rctx + 1) + crypto_aead_reqsize(ctx->child), + crypto_aead_alignmask(ctx->child) + 1); + + memcpy(iv, ctx->nonce, 4); + memcpy(iv + 4, req->iv, 8); + + /* construct cipher/plaintext */ + if (enc) + memset(rctx->auth_tag, 0, authsize); + else + scatterwalk_map_and_copy(rctx->auth_tag, dst, + req->cryptlen - authsize, + authsize, 0); + + sg_init_one(cipher, rctx->auth_tag, authsize); + + /* construct the aad */ + dstp = sg_page(dst); + vdst = PageHighMem(dstp) ? NULL : page_address(dstp) + dst->offset; + + sg_init_table(payload, 2); + sg_set_buf(payload, req->iv, 8); + scatterwalk_crypto_chain(payload, dst, vdst == req->iv + 8, 2); + assoclen += 8 + req->cryptlen - (enc ? 0 : authsize); + + sg_init_table(assoc, 2); + sg_set_page(assoc, sg_page(req->assoc), req->assoc->length, + req->assoc->offset); + scatterwalk_crypto_chain(assoc, payload, 0, 2); + + aead_request_set_tfm(subreq, ctx->child); + aead_request_set_callback(subreq, req->base.flags, req->base.complete, + req->base.data); + aead_request_set_crypt(subreq, cipher, cipher, enc ? 0 : authsize, iv); + aead_request_set_assoc(subreq, assoc, assoclen); + + return subreq; +} + +static int crypto_rfc4543_encrypt(struct aead_request *req) +{ + struct crypto_aead *aead = crypto_aead_reqtfm(req); + struct crypto_rfc4543_req_ctx *rctx = crypto_rfc4543_reqctx(req); + struct aead_request *subreq; + int err; + + subreq = crypto_rfc4543_crypt(req, 1); + err = crypto_aead_encrypt(subreq); + if (err) + return err; + + scatterwalk_map_and_copy(rctx->auth_tag, req->dst, req->cryptlen, + crypto_aead_authsize(aead), 1); + + return 0; +} + +static int crypto_rfc4543_decrypt(struct aead_request *req) +{ + req = crypto_rfc4543_crypt(req, 0); + + return crypto_aead_decrypt(req); +} + +static int crypto_rfc4543_init_tfm(struct crypto_tfm *tfm) +{ + struct crypto_instance *inst = (void *)tfm->__crt_alg; + struct crypto_aead_spawn *spawn = crypto_instance_ctx(inst); + struct crypto_rfc4543_ctx *ctx = crypto_tfm_ctx(tfm); + struct crypto_aead *aead; + unsigned long align; + + aead = crypto_spawn_aead(spawn); + if (IS_ERR(aead)) + return PTR_ERR(aead); + + ctx->child = aead; + + align = crypto_aead_alignmask(aead); + align &= ~(crypto_tfm_ctx_alignment() - 1); + tfm->crt_aead.reqsize = sizeof(struct crypto_rfc4543_req_ctx) + + ALIGN(crypto_aead_reqsize(aead), + crypto_tfm_ctx_alignment()) + + align + 16; + + return 0; +} + +static void crypto_rfc4543_exit_tfm(struct crypto_tfm *tfm) +{ + struct crypto_rfc4543_ctx *ctx = crypto_tfm_ctx(tfm); + + crypto_free_aead(ctx->child); +} + +static struct crypto_instance *crypto_rfc4543_alloc(struct rtattr **tb) +{ + struct crypto_attr_type *algt; + struct crypto_instance *inst; + struct crypto_aead_spawn *spawn; + struct crypto_alg *alg; + const char *ccm_name; + int err; + + algt = crypto_get_attr_type(tb); + err = PTR_ERR(algt); + if (IS_ERR(algt)) + return ERR_PTR(err); + + if ((algt->type ^ CRYPTO_ALG_TYPE_AEAD) & algt->mask) + return ERR_PTR(-EINVAL); + + ccm_name = crypto_attr_alg_name(tb[1]); + err = PTR_ERR(ccm_name); + if (IS_ERR(ccm_name)) + return ERR_PTR(err); + + inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); + if (!inst) + return ERR_PTR(-ENOMEM); + + spawn = crypto_instance_ctx(inst); + crypto_set_aead_spawn(spawn, inst); + err = crypto_grab_aead(spawn, ccm_name, 0, + crypto_requires_sync(algt->type, algt->mask)); + if (err) + goto out_free_inst; + + alg = crypto_aead_spawn_alg(spawn); + + err = -EINVAL; + + /* We only support 16-byte blocks. */ + if (alg->cra_aead.ivsize != 16) + goto out_drop_alg; + + /* Not a stream cipher? */ + if (alg->cra_blocksize != 1) + goto out_drop_alg; + + err = -ENAMETOOLONG; + if (snprintf(inst->alg.cra_name, CRYPTO_MAX_ALG_NAME, + "rfc4543(%s)", alg->cra_name) >= CRYPTO_MAX_ALG_NAME || + snprintf(inst->alg.cra_driver_name, CRYPTO_MAX_ALG_NAME, + "rfc4543(%s)", alg->cra_driver_name) >= + CRYPTO_MAX_ALG_NAME) + goto out_drop_alg; + + inst->alg.cra_flags = CRYPTO_ALG_TYPE_AEAD; + inst->alg.cra_flags |= alg->cra_flags & CRYPTO_ALG_ASYNC; + inst->alg.cra_priority = alg->cra_priority; + inst->alg.cra_blocksize = 1; + inst->alg.cra_alignmask = alg->cra_alignmask; + inst->alg.cra_type = &crypto_nivaead_type; + + inst->alg.cra_aead.ivsize = 8; + inst->alg.cra_aead.maxauthsize = 16; + + inst->alg.cra_ctxsize = sizeof(struct crypto_rfc4543_ctx); + + inst->alg.cra_init = crypto_rfc4543_init_tfm; + inst->alg.cra_exit = crypto_rfc4543_exit_tfm; + + inst->alg.cra_aead.setkey = crypto_rfc4543_setkey; + inst->alg.cra_aead.setauthsize = crypto_rfc4543_setauthsize; + inst->alg.cra_aead.encrypt = crypto_rfc4543_encrypt; + inst->alg.cra_aead.decrypt = crypto_rfc4543_decrypt; + + inst->alg.cra_aead.geniv = "seqiv"; + +out: + return inst; + +out_drop_alg: + crypto_drop_aead(spawn); +out_free_inst: + kfree(inst); + inst = ERR_PTR(err); + goto out; +} + +static void crypto_rfc4543_free(struct crypto_instance *inst) +{ + crypto_drop_spawn(crypto_instance_ctx(inst)); + kfree(inst); +} + +static struct crypto_template crypto_rfc4543_tmpl = { + .name = "rfc4543", + .alloc = crypto_rfc4543_alloc, + .free = crypto_rfc4543_free, + .module = THIS_MODULE, +}; + static int __init crypto_gcm_module_init(void) { int err; @@ -1067,8 +1331,14 @@ static int __init crypto_gcm_module_init if (err) goto out_undo_gcm; + err = crypto_register_template(&crypto_rfc4543_tmpl); + if (err) + goto out_undo_rfc4106; + return 0; +out_undo_rfc4106: + crypto_unregister_template(&crypto_rfc4106_tmpl); out_undo_gcm: crypto_unregister_template(&crypto_gcm_tmpl); out_undo_base: @@ -1081,6 +1351,7 @@ out: static void __exit crypto_gcm_module_exit(void) { kfree(gcm_zeroes); + crypto_unregister_template(&crypto_rfc4543_tmpl); crypto_unregister_template(&crypto_rfc4106_tmpl); crypto_unregister_template(&crypto_gcm_tmpl); crypto_unregister_template(&crypto_gcm_base_tmpl); @@ -1094,3 +1365,4 @@ MODULE_DESCRIPTION("Galois/Counter Mode" MODULE_AUTHOR("Mikko Herranen "); MODULE_ALIAS("gcm_base"); MODULE_ALIAS("rfc4106"); +MODULE_ALIAS("rfc4543"); diff -uprN linux-2.6.32/crypto/gf128mul.c linux-2.6.32.ovz/crypto/gf128mul.c --- linux-2.6.32/crypto/gf128mul.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/gf128mul.c 2015-03-10 13:24:07.000000000 -0700 @@ -182,7 +182,7 @@ void gf128mul_lle(be128 *r, const be128 for (i = 0; i < 7; ++i) gf128mul_x_lle(&p[i + 1], &p[i]); - memset(r, 0, sizeof(r)); + memset(r, 0, sizeof(*r)); for (i = 0;;) { u8 ch = ((u8 *)b)[15 - i]; @@ -220,7 +220,7 @@ void gf128mul_bbe(be128 *r, const be128 for (i = 0; i < 7; ++i) gf128mul_x_bbe(&p[i + 1], &p[i]); - memset(r, 0, sizeof(r)); + memset(r, 0, sizeof(*r)); for (i = 0;;) { u8 ch = ((u8 *)b)[i]; diff -uprN linux-2.6.32/crypto/ghash-generic.c linux-2.6.32.ovz/crypto/ghash-generic.c --- linux-2.6.32/crypto/ghash-generic.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/ghash-generic.c 2015-03-10 13:22:51.000000000 -0700 @@ -67,6 +67,9 @@ static int ghash_update(struct shash_des struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); u8 *dst = dctx->buffer; + if (!ctx->gf128) + return -ENOKEY; + if (dctx->bytes) { int n = min(srclen, dctx->bytes); u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes); @@ -119,6 +122,9 @@ static int ghash_final(struct shash_desc struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm); u8 *buf = dctx->buffer; + if (!ctx->gf128) + return -ENOKEY; + ghash_flush(ctx, dctx); memcpy(dst, buf, GHASH_BLOCK_SIZE); diff -uprN linux-2.6.32/crypto/internal.h linux-2.6.32.ovz/crypto/internal.h --- linux-2.6.32/crypto/internal.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/internal.h 2015-03-10 13:23:57.000000000 -0700 @@ -83,7 +83,6 @@ void crypto_exit_compress_ops(struct cry struct crypto_larval *crypto_larval_alloc(const char *name, u32 type, u32 mask); void crypto_larval_kill(struct crypto_alg *alg); struct crypto_alg *crypto_larval_lookup(const char *name, u32 type, u32 mask); -void crypto_larval_error(const char *name, u32 type, u32 mask); void crypto_alg_tested(const char *name, int err); void crypto_shoot_alg(struct crypto_alg *alg); @@ -101,6 +100,12 @@ int crypto_register_notifier(struct noti int crypto_unregister_notifier(struct notifier_block *nb); int crypto_probing_notify(unsigned long val, void *v); +static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg) +{ + atomic_inc(&alg->cra_refcnt); + return alg; +} + static inline void crypto_alg_put(struct crypto_alg *alg) { if (atomic_dec_and_test(&alg->cra_refcnt) && alg->cra_destroy) diff -uprN linux-2.6.32/crypto/Kconfig linux-2.6.32.ovz/crypto/Kconfig --- linux-2.6.32/crypto/Kconfig 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/Kconfig 2015-06-23 04:16:16.000000000 -0700 @@ -23,7 +23,8 @@ comment "Crypto core or helper" config CRYPTO_FIPS bool "FIPS 200 compliance" - depends on CRYPTO_ANSI_CPRNG + depends on CRYPTO_ANSI_CPRNG || CRYTPO_DRBG + depends on MODULE_SIG help This options enables the fips boot option which is required if you want to system to operate in a FIPS 200 @@ -145,6 +146,16 @@ config CRYPTO_TEST help Quick & dirty crypto test module. +config CRYPTO_ABLK_HELPER_X86 + tristate + depends on X86 + select CRYPTO_CRYPTD + +config CRYPTO_GLUE_HELPER_X86 + tristate + depends on X86 + select CRYPTO_ALGAPI + comment "Authenticated Encryption with Associated Data" config CRYPTO_CCM @@ -212,8 +223,7 @@ config CRYPTO_ECB the input block by block. config CRYPTO_LRW - tristate "LRW support (EXPERIMENTAL)" - depends on EXPERIMENTAL + tristate "LRW support" select CRYPTO_BLKCIPHER select CRYPTO_MANAGER select CRYPTO_GF128MUL @@ -233,8 +243,7 @@ config CRYPTO_PCBC This block cipher algorithm is required for RxRPC. config CRYPTO_XTS - tristate "XTS support (EXPERIMENTAL)" - depends on EXPERIMENTAL + tristate "XTS support" select CRYPTO_BLKCIPHER select CRYPTO_MANAGER select CRYPTO_GF128MUL @@ -243,11 +252,6 @@ config CRYPTO_XTS key size 256, 384 or 512 bits. This implementation currently can't handle a sectorsize which is not a multiple of 16 bytes. -config CRYPTO_FPU - tristate - select CRYPTO_BLKCIPHER - select CRYPTO_MANAGER - comment "Hash modes" config CRYPTO_HMAC @@ -303,6 +307,25 @@ config CRYPTO_CRC32C_INTEL gain performance compared with software implementation. Module will be crc32c-intel. +config CRYPTO_CRCT10DIF + tristate "CRCT10DIF algorithm" + select CRYPTO_HASH + help + CRC T10 Data Integrity Field computation is being cast as + a crypto transform. This allows for faster crc t10 diff + transforms to be used if they are available. + +config CRYPTO_CRCT10DIF_PCLMUL + tristate "CRCT10DIF PCLMULQDQ hardware acceleration" + depends on X86 && 64BIT && CRYPTO_CRCT10DIF + select CRYPTO_HASH + help + For x86_64 processors with SSE4.2 and PCLMULQDQ supported, + CRC T10 DIF PCLMULQDQ computation can be hardware + accelerated PCLMULQDQ instruction. This option will create + 'crct10dif-plcmul' module, which is faster when computing the + crct10dif checksum as compared with the generic table implementation. + config CRYPTO_GHASH tristate "GHASH digest algorithm" select CRYPTO_SHASH @@ -391,6 +414,16 @@ config CRYPTO_SHA1 help SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2). +config CRYPTO_SHA1_SSSE3 + tristate "SHA1 digest algorithm (SSSE3/AVX)" + depends on X86 && 64BIT + select CRYPTO_SHA1 + select CRYPTO_HASH + help + SHA-1 secure hash standard (FIPS 180-1/DFIPS 180-2) implemented + using Supplemental SSE3 (SSSE3) instructions or Advanced Vector + Extensions (AVX), when available. + config CRYPTO_SHA256 tristate "SHA224 and SHA256 digest algorithm" select CRYPTO_HASH @@ -440,6 +473,15 @@ config CRYPTO_WP512 See also: +config CRYPTO_GHASH_CLMUL_NI_INTEL + tristate "GHASH digest algorithm (CLMUL-NI accelerated)" + depends on (X86 || UML_X86) && 64BIT + select CRYPTO_SHASH + select CRYPTO_CRYPTD + help + GHASH is message digest algorithm for GCM (Galois/Counter Mode). + The implementation is accelerated by CLMUL-NI of Intel. + comment "Ciphers" config CRYPTO_AES @@ -508,11 +550,15 @@ config CRYPTO_AES_X86_64 config CRYPTO_AES_NI_INTEL tristate "AES cipher algorithms (AES-NI)" - depends on (X86 || UML_X86) && 64BIT - select CRYPTO_AES_X86_64 + depends on (X86 || UML_X86) + select CRYPTO_AES_X86_64 if 64BIT + select CRYPTO_AES_586 if !64BIT select CRYPTO_CRYPTD + select CRYPTO_ABLK_HELPER_X86 select CRYPTO_ALGAPI - select CRYPTO_FPU + select CRYPTO_GLUE_HELPER_X86 if 64BIT + select CRYPTO_LRW + select CRYPTO_XTS help Use Intel AES-NI instructions for AES algorithm. @@ -532,9 +578,10 @@ config CRYPTO_AES_NI_INTEL See for more information. - In addition to AES cipher algorithm support, the - acceleration for some popular block cipher mode is supported - too, including ECB, CBC, CTR, LRW, PCBC, XTS. + In addition to AES cipher algorithm support, the acceleration + for some popular block cipher mode is supported too, including + ECB, CBC, LRW, PCBC, XTS. The 64 bit version has additional + acceleration for CTR. config CRYPTO_ANUBIS tristate "Anubis cipher algorithm" @@ -810,6 +857,59 @@ config CRYPTO_ANSI_CPRNG ANSI X9.31 A.2.4. Not this option must be enabled if CRYPTO_FIPS is selected +menuconfig CRYPTO_DRBG_MENU + tristate "NIST SP800-90A DRBG" + help + NIST SP800-90A compliant DRBG. In the following submenu, one or + more of the DRBG types must be selected. + +if CRYPTO_DRBG_MENU + +config CRYPTO_DRBG_HMAC + bool "Enable HMAC DRBG" + default y + select CRYPTO_HMAC + help + Enable the HMAC DRBG variant as defined in NIST SP800-90A. + +config CRYPTO_DRBG_HASH + bool "Enable Hash DRBG" + select CRYPTO_HASH + help + Enable the Hash DRBG variant as defined in NIST SP800-90A. + +config CRYPTO_DRBG_CTR + bool "Enable CTR DRBG" + select CRYPTO_AES + help + Enable the CTR DRBG variant as defined in NIST SP800-90A. + +config CRYPTO_DRBG + tristate + default CRYPTO_DRBG_MENU if (CRYPTO_DRBG_HMAC || CRYPTO_DRBG_HASH || CRYPTO_DRBG_CTR) + select CRYPTO_RNG + +endif # if CRYPTO_DRBG_MENU + +config CRYPTO_MPILIB + bool "Multiprecision maths library (EXPERIMENTAL)" + depends on CRYPTO + help + Multiprecision maths library from GnuPG + +config CRYPTO_SIGNATURE + bool "In-kernel signature checker (EXPERIMENTAL)" + depends on CRYPTO + help + Signature checker (used for module sig checking). + +config CRYPTO_SIGNATURE_DSA + bool "Handle DSA signatures (EXPERIMENTAL)" + depends on CRYPTO_SIGNATURE + select CRYPTO_MPILIB + help + DSA Signature checker. + source "drivers/crypto/Kconfig" endif # if CRYPTO diff -uprN linux-2.6.32/crypto/lrw.c linux-2.6.32.ovz/crypto/lrw.c --- linux-2.6.32/crypto/lrw.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/lrw.c 2015-03-10 13:23:55.000000000 -0700 @@ -3,7 +3,7 @@ * * Copyright (c) 2006 Rik Snel * - * Based om ecb.c + * Based on ecb.c * Copyright (c) 2006 Herbert Xu * * This program is free software; you can redistribute it and/or modify it @@ -16,6 +16,7 @@ * http://www.mail-archive.com/stds-p1619@listserv.ieee.org/msg00173.html * * The test vectors are included in the testing module tcrypt.[ch] */ + #include #include #include @@ -26,21 +27,11 @@ #include #include +#include struct priv { struct crypto_cipher *child; - /* optimizes multiplying a random (non incrementing, as at the - * start of a new sector) value with key2, we could also have - * used 4k optimization tables or no optimization at all. In the - * latter case we would have to store key2 here */ - struct gf128mul_64k *table; - /* stores: - * key2*{ 0,0,...0,0,0,0,1 }, key2*{ 0,0,...0,0,0,1,1 }, - * key2*{ 0,0,...0,0,1,1,1 }, key2*{ 0,0,...0,1,1,1,1 } - * key2*{ 0,0,...1,1,1,1,1 }, etc - * needed for optimized multiplication of incrementing values - * with key2 */ - be128 mulinc[128]; + struct lrw_table_ctx table; }; static inline void setbit128_bbe(void *b, int bit) @@ -54,28 +45,16 @@ static inline void setbit128_bbe(void *b ), b); } -static int setkey(struct crypto_tfm *parent, const u8 *key, - unsigned int keylen) +int lrw_init_table(struct lrw_table_ctx *ctx, const u8 *tweak) { - struct priv *ctx = crypto_tfm_ctx(parent); - struct crypto_cipher *child = ctx->child; - int err, i; be128 tmp = { 0 }; - int bsize = crypto_cipher_blocksize(child); - - crypto_cipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); - crypto_cipher_set_flags(child, crypto_tfm_get_flags(parent) & - CRYPTO_TFM_REQ_MASK); - if ((err = crypto_cipher_setkey(child, key, keylen - bsize))) - return err; - crypto_tfm_set_flags(parent, crypto_cipher_get_flags(child) & - CRYPTO_TFM_RES_MASK); + int i; if (ctx->table) gf128mul_free_64k(ctx->table); /* initialize multiplication table for Key2 */ - ctx->table = gf128mul_init_64k_bbe((be128 *)(key + keylen - bsize)); + ctx->table = gf128mul_init_64k_bbe((be128 *)tweak); if (!ctx->table) return -ENOMEM; @@ -88,6 +67,34 @@ static int setkey(struct crypto_tfm *par return 0; } +EXPORT_SYMBOL_GPL(lrw_init_table); + +void lrw_free_table(struct lrw_table_ctx *ctx) +{ + if (ctx->table) + gf128mul_free_64k(ctx->table); +} +EXPORT_SYMBOL_GPL(lrw_free_table); + +static int setkey(struct crypto_tfm *parent, const u8 *key, + unsigned int keylen) +{ + struct priv *ctx = crypto_tfm_ctx(parent); + struct crypto_cipher *child = ctx->child; + int err, bsize = LRW_BLOCK_SIZE; + const u8 *tweak = key + keylen - bsize; + + crypto_cipher_clear_flags(child, CRYPTO_TFM_REQ_MASK); + crypto_cipher_set_flags(child, crypto_tfm_get_flags(parent) & + CRYPTO_TFM_REQ_MASK); + err = crypto_cipher_setkey(child, key, keylen - bsize); + if (err) + return err; + crypto_tfm_set_flags(parent, crypto_cipher_get_flags(child) & + CRYPTO_TFM_RES_MASK); + + return lrw_init_table(&ctx->table, tweak); +} struct sinfo { be128 t; @@ -134,7 +141,7 @@ static int crypt(struct blkcipher_desc * { int err; unsigned int avail; - const int bs = crypto_cipher_blocksize(ctx->child); + const int bs = LRW_BLOCK_SIZE; struct sinfo s = { .tfm = crypto_cipher_tfm(ctx->child), .fn = fn @@ -155,7 +162,7 @@ static int crypt(struct blkcipher_desc * s.t = *iv; /* T <- I*Key2 */ - gf128mul_64k_bbe(&s.t, ctx->table); + gf128mul_64k_bbe(&s.t, ctx->table.table); goto first; @@ -163,7 +170,8 @@ static int crypt(struct blkcipher_desc * do { /* T <- I*Key2, using the optimization * discussed in the specification */ - be128_xor(&s.t, &s.t, &ctx->mulinc[get_index128(iv)]); + be128_xor(&s.t, &s.t, + &ctx->table.mulinc[get_index128(iv)]); inc(iv); first: @@ -206,6 +214,85 @@ static int decrypt(struct blkcipher_desc crypto_cipher_alg(ctx->child)->cia_decrypt); } +int lrw_crypt(struct blkcipher_desc *desc, struct scatterlist *sdst, + struct scatterlist *ssrc, unsigned int nbytes, + struct lrw_crypt_req *req) +{ + const unsigned int bsize = LRW_BLOCK_SIZE; + const unsigned int max_blks = req->tbuflen / bsize; + struct lrw_table_ctx *ctx = req->table_ctx; + struct blkcipher_walk walk; + unsigned int nblocks; + be128 *iv, *src, *dst, *t; + be128 *t_buf = req->tbuf; + int err, i; + + BUG_ON(max_blks < 1); + + blkcipher_walk_init(&walk, sdst, ssrc, nbytes); + + err = blkcipher_walk_virt(desc, &walk); + nbytes = walk.nbytes; + if (!nbytes) + return err; + + nblocks = min(walk.nbytes / bsize, max_blks); + src = (be128 *)walk.src.virt.addr; + dst = (be128 *)walk.dst.virt.addr; + + /* calculate first value of T */ + iv = (be128 *)walk.iv; + t_buf[0] = *iv; + + /* T <- I*Key2 */ + gf128mul_64k_bbe(&t_buf[0], ctx->table); + + i = 0; + goto first; + + for (;;) { + do { + for (i = 0; i < nblocks; i++) { + /* T <- I*Key2, using the optimization + * discussed in the specification */ + be128_xor(&t_buf[i], t, + &ctx->mulinc[get_index128(iv)]); + inc(iv); +first: + t = &t_buf[i]; + + /* PP <- T xor P */ + be128_xor(dst + i, t, src + i); + } + + /* CC <- E(Key2,PP) */ + req->crypt_fn(req->crypt_ctx, (u8 *)dst, + nblocks * bsize); + + /* C <- T xor CC */ + for (i = 0; i < nblocks; i++) + be128_xor(dst + i, dst + i, &t_buf[i]); + + src += nblocks; + dst += nblocks; + nbytes -= nblocks * bsize; + nblocks = min(nbytes / bsize, max_blks); + } while (nblocks > 0); + + err = blkcipher_walk_done(desc, &walk, nbytes); + nbytes = walk.nbytes; + if (!nbytes) + break; + + nblocks = min(nbytes / bsize, max_blks); + src = (be128 *)walk.src.virt.addr; + dst = (be128 *)walk.dst.virt.addr; + } + + return err; +} +EXPORT_SYMBOL_GPL(lrw_crypt); + static int init_tfm(struct crypto_tfm *tfm) { struct crypto_cipher *cipher; @@ -218,7 +305,7 @@ static int init_tfm(struct crypto_tfm *t if (IS_ERR(cipher)) return PTR_ERR(cipher); - if (crypto_cipher_blocksize(cipher) != 16) { + if (crypto_cipher_blocksize(cipher) != LRW_BLOCK_SIZE) { *flags |= CRYPTO_TFM_RES_BAD_BLOCK_LEN; return -EINVAL; } @@ -230,8 +317,8 @@ static int init_tfm(struct crypto_tfm *t static void exit_tfm(struct crypto_tfm *tfm) { struct priv *ctx = crypto_tfm_ctx(tfm); - if (ctx->table) - gf128mul_free_64k(ctx->table); + + lrw_free_table(&ctx->table); crypto_free_cipher(ctx->child); } diff -uprN linux-2.6.32/crypto/Makefile linux-2.6.32.ovz/crypto/Makefile --- linux-2.6.32/crypto/Makefile 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/Makefile 2015-03-10 13:24:13.000000000 -0700 @@ -78,12 +78,16 @@ obj-$(CONFIG_CRYPTO_ZLIB) += zlib.o obj-$(CONFIG_CRYPTO_MICHAEL_MIC) += michael_mic.o obj-$(CONFIG_CRYPTO_CRC32C) += crc32c.o obj-$(CONFIG_CRYPTO_AUTHENC) += authenc.o +obj-$(CONFIG_CRYPTO_CRCT10DIF) += crct10dif.o obj-$(CONFIG_CRYPTO_LZO) += lzo.o obj-$(CONFIG_CRYPTO_RNG2) += rng.o obj-$(CONFIG_CRYPTO_RNG2) += krng.o obj-$(CONFIG_CRYPTO_ANSI_CPRNG) += ansi_cprng.o +obj-$(CONFIG_CRYPTO_DRBG) += drbg.o obj-$(CONFIG_CRYPTO_TEST) += tcrypt.o obj-$(CONFIG_CRYPTO_GHASH) += ghash-generic.o +obj-$(CONFIG_CRYPTO_MPILIB) += mpi/ +obj-$(CONFIG_CRYPTO_SIGNATURE) += signature/ # # generic algorithms and the async_tx api diff -uprN linux-2.6.32/crypto/md5.c linux-2.6.32.ovz/crypto/md5.c --- linux-2.6.32/crypto/md5.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/md5.c 2015-03-10 13:24:13.000000000 -0700 @@ -16,114 +16,14 @@ * */ #include +#include #include #include #include #include +#include #include -#define MD5_DIGEST_SIZE 16 -#define MD5_HMAC_BLOCK_SIZE 64 -#define MD5_BLOCK_WORDS 16 -#define MD5_HASH_WORDS 4 - -#define F1(x, y, z) (z ^ (x & (y ^ z))) -#define F2(x, y, z) F1(z, x, y) -#define F3(x, y, z) (x ^ y ^ z) -#define F4(x, y, z) (y ^ (x | ~z)) - -#define MD5STEP(f, w, x, y, z, in, s) \ - (w += f(x, y, z) + in, w = (w<>(32-s)) + x) - -struct md5_ctx { - u32 hash[MD5_HASH_WORDS]; - u32 block[MD5_BLOCK_WORDS]; - u64 byte_count; -}; - -static void md5_transform(u32 *hash, u32 const *in) -{ - u32 a, b, c, d; - - a = hash[0]; - b = hash[1]; - c = hash[2]; - d = hash[3]; - - MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); - MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); - MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); - MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); - MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); - MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); - MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); - MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); - MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); - MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); - MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); - MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); - MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); - MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); - MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); - MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); - - MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); - MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); - MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); - MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); - MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); - MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); - MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); - MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); - MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); - MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); - MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); - MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); - MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); - MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); - MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); - MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); - - MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); - MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); - MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); - MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); - MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); - MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); - MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); - MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); - MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); - MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); - MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); - MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); - MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); - MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); - MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); - MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); - - MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); - MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); - MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); - MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); - MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); - MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); - MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); - MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); - MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); - MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); - MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); - MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); - MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); - MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); - MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); - MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); - - hash[0] += a; - hash[1] += b; - hash[2] += c; - hash[3] += d; -} - /* XXX: this stuff can be optimized */ static inline void le32_to_cpu_array(u32 *buf, unsigned int words) { @@ -141,7 +41,7 @@ static inline void cpu_to_le32_array(u32 } } -static inline void md5_transform_helper(struct md5_ctx *ctx) +static inline void md5_transform_helper(struct md5_state *ctx) { le32_to_cpu_array(ctx->block, sizeof(ctx->block) / sizeof(u32)); md5_transform(ctx->hash, ctx->block); @@ -149,7 +49,7 @@ static inline void md5_transform_helper( static int md5_init(struct shash_desc *desc) { - struct md5_ctx *mctx = shash_desc_ctx(desc); + struct md5_state *mctx = shash_desc_ctx(desc); mctx->hash[0] = 0x67452301; mctx->hash[1] = 0xefcdab89; @@ -162,7 +62,7 @@ static int md5_init(struct shash_desc *d static int md5_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - struct md5_ctx *mctx = shash_desc_ctx(desc); + struct md5_state *mctx = shash_desc_ctx(desc); const u32 avail = sizeof(mctx->block) - (mctx->byte_count & 0x3f); mctx->byte_count += len; @@ -194,7 +94,7 @@ static int md5_update(struct shash_desc static int md5_final(struct shash_desc *desc, u8 *out) { - struct md5_ctx *mctx = shash_desc_ctx(desc); + struct md5_state *mctx = shash_desc_ctx(desc); const unsigned int offset = mctx->byte_count & 0x3f; char *p = (char *)mctx->block + offset; int padding = 56 - (offset + 1); @@ -220,12 +120,31 @@ static int md5_final(struct shash_desc * return 0; } +static int md5_export(struct shash_desc *desc, void *out) +{ + struct md5_state *ctx = shash_desc_ctx(desc); + + memcpy(out, ctx, sizeof(*ctx)); + return 0; +} + +static int md5_import(struct shash_desc *desc, const void *in) +{ + struct md5_state *ctx = shash_desc_ctx(desc); + + memcpy(ctx, in, sizeof(*ctx)); + return 0; +} + static struct shash_alg alg = { .digestsize = MD5_DIGEST_SIZE, .init = md5_init, .update = md5_update, .final = md5_final, - .descsize = sizeof(struct md5_ctx), + .export = md5_export, + .import = md5_import, + .descsize = sizeof(struct md5_state), + .statesize = sizeof(struct md5_state), .base = { .cra_name = "md5", .cra_flags = CRYPTO_ALG_TYPE_SHASH, diff -uprN linux-2.6.32/crypto/mpi/generic_mpi-asm-defs.h linux-2.6.32.ovz/crypto/mpi/generic_mpi-asm-defs.h --- linux-2.6.32/crypto/mpi/generic_mpi-asm-defs.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/generic_mpi-asm-defs.h 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,10 @@ +/* This file defines some basic constants for the MPI machinery. We + * need to define the types on a per-CPU basis, so it is done with + * this file here. */ +#define BYTES_PER_MPI_LIMB (SIZEOF_UNSIGNED_LONG) + + + + + + diff -uprN linux-2.6.32/crypto/mpi/generic_mpih-add1.c linux-2.6.32.ovz/crypto/mpi/generic_mpih-add1.c --- linux-2.6.32/crypto/mpi/generic_mpih-add1.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/generic_mpih-add1.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,62 @@ +/* mpihelp-add_1.c - MPI helper functions + * Copyright (C) 1994, 1996, 1997, 1998, + * 2000 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + +#include "mpi-internal.h" +#include "longlong.h" + +mpi_limb_t +mpihelp_add_n( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, + mpi_ptr_t s2_ptr, mpi_size_t size) +{ + mpi_limb_t x, y, cy; + mpi_size_t j; + + /* The loop counter and index J goes from -SIZE to -1. This way + the loop becomes faster. */ + j = -size; + + /* Offset the base pointers to compensate for the negative indices. */ + s1_ptr -= j; + s2_ptr -= j; + res_ptr -= j; + + cy = 0; + do { + y = s2_ptr[j]; + x = s1_ptr[j]; + y += cy; /* add previous carry to one addend */ + cy = y < cy; /* get out carry from that addition */ + y += x; /* add other addend */ + cy += y < x; /* get out carry from that add, combine */ + res_ptr[j] = y; + } while( ++j ); + + return cy; +} + diff -uprN linux-2.6.32/crypto/mpi/generic_mpih-lshift.c linux-2.6.32.ovz/crypto/mpi/generic_mpih-lshift.c --- linux-2.6.32/crypto/mpi/generic_mpih-lshift.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/generic_mpih-lshift.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,66 @@ +/* mpihelp-lshift.c - MPI helper functions + * Copyright (C) 1994, 1996, 1998, 2001 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + +#include "mpi-internal.h" + +/* Shift U (pointed to by UP and USIZE digits long) CNT bits to the left + * and store the USIZE least significant digits of the result at WP. + * Return the bits shifted out from the most significant digit. + * + * Argument constraints: + * 1. 0 < CNT < BITS_PER_MP_LIMB + * 2. If the result is to be written over the input, WP must be >= UP. + */ + +mpi_limb_t +mpihelp_lshift( mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize, + unsigned int cnt) +{ + mpi_limb_t high_limb, low_limb; + unsigned sh_1, sh_2; + mpi_size_t i; + mpi_limb_t retval; + + sh_1 = cnt; + wp += 1; + sh_2 = BITS_PER_MPI_LIMB - sh_1; + i = usize - 1; + low_limb = up[i]; + retval = low_limb >> sh_2; + high_limb = low_limb; + while( --i >= 0 ) { + low_limb = up[i]; + wp[i] = (high_limb << sh_1) | (low_limb >> sh_2); + high_limb = low_limb; + } + wp[i] = high_limb << sh_1; + + return retval; +} + + diff -uprN linux-2.6.32/crypto/mpi/generic_mpih-mul1.c linux-2.6.32.ovz/crypto/mpi/generic_mpih-mul1.c --- linux-2.6.32/crypto/mpi/generic_mpih-mul1.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/generic_mpih-mul1.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,58 @@ +/* mpihelp-mul_1.c - MPI helper functions + * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + +#include "mpi-internal.h" +#include "longlong.h" + +mpi_limb_t +mpihelp_mul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, + mpi_limb_t s2_limb) +{ + mpi_limb_t cy_limb; + mpi_size_t j; + mpi_limb_t prod_high, prod_low; + + /* The loop counter and index J goes from -S1_SIZE to -1. This way + * the loop becomes faster. */ + j = -s1_size; + + /* Offset the base pointers to compensate for the negative indices. */ + s1_ptr -= j; + res_ptr -= j; + + cy_limb = 0; + do { + umul_ppmm( prod_high, prod_low, s1_ptr[j], s2_limb ); + prod_low += cy_limb; + cy_limb = (prod_low < cy_limb?1:0) + prod_high; + res_ptr[j] = prod_low; + } while( ++j ); + + return cy_limb; +} + diff -uprN linux-2.6.32/crypto/mpi/generic_mpih-mul2.c linux-2.6.32.ovz/crypto/mpi/generic_mpih-mul2.c --- linux-2.6.32/crypto/mpi/generic_mpih-mul2.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/generic_mpih-mul2.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,63 @@ +/* mpihelp-mul_2.c - MPI helper functions + * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + +#include "mpi-internal.h" +#include "longlong.h" + + +mpi_limb_t +mpihelp_addmul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, + mpi_size_t s1_size, mpi_limb_t s2_limb) +{ + mpi_limb_t cy_limb; + mpi_size_t j; + mpi_limb_t prod_high, prod_low; + mpi_limb_t x; + + /* The loop counter and index J goes from -SIZE to -1. This way + * the loop becomes faster. */ + j = -s1_size; + res_ptr -= j; + s1_ptr -= j; + + cy_limb = 0; + do { + umul_ppmm( prod_high, prod_low, s1_ptr[j], s2_limb ); + + prod_low += cy_limb; + cy_limb = (prod_low < cy_limb?1:0) + prod_high; + + x = res_ptr[j]; + prod_low = x + prod_low; + cy_limb += prod_low < x?1:0; + res_ptr[j] = prod_low; + } while ( ++j ); + return cy_limb; +} + + diff -uprN linux-2.6.32/crypto/mpi/generic_mpih-mul3.c linux-2.6.32.ovz/crypto/mpi/generic_mpih-mul3.c --- linux-2.6.32/crypto/mpi/generic_mpih-mul3.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/generic_mpih-mul3.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,64 @@ +/* mpihelp-mul_3.c - MPI helper functions + * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + +#include "mpi-internal.h" +#include "longlong.h" + + +mpi_limb_t +mpihelp_submul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, + mpi_size_t s1_size, mpi_limb_t s2_limb) +{ + mpi_limb_t cy_limb; + mpi_size_t j; + mpi_limb_t prod_high, prod_low; + mpi_limb_t x; + + /* The loop counter and index J goes from -SIZE to -1. This way + * the loop becomes faster. */ + j = -s1_size; + res_ptr -= j; + s1_ptr -= j; + + cy_limb = 0; + do { + umul_ppmm( prod_high, prod_low, s1_ptr[j], s2_limb); + + prod_low += cy_limb; + cy_limb = (prod_low < cy_limb?1:0) + prod_high; + + x = res_ptr[j]; + prod_low = x - prod_low; + cy_limb += prod_low > x?1:0; + res_ptr[j] = prod_low; + } while( ++j ); + + return cy_limb; +} + + diff -uprN linux-2.6.32/crypto/mpi/generic_mpih-rshift.c linux-2.6.32.ovz/crypto/mpi/generic_mpih-rshift.c --- linux-2.6.32/crypto/mpi/generic_mpih-rshift.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/generic_mpih-rshift.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,65 @@ +/* mpih-rshift.c - MPI helper functions + * Copyright (C) 1994, 1996, 1998, 1999, + * 2000, 2001 Free Software Foundation, Inc. + * + * This file is part of GNUPG + * + * GNUPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GNUPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + +#include "mpi-internal.h" + + +/* Shift U (pointed to by UP and USIZE limbs long) CNT bits to the right + * and store the USIZE least significant limbs of the result at WP. + * The bits shifted out to the right are returned. + * + * Argument constraints: + * 1. 0 < CNT < BITS_PER_MP_LIMB + * 2. If the result is to be written over the input, WP must be <= UP. + */ + +mpi_limb_t +mpihelp_rshift( mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize, unsigned cnt) +{ + mpi_limb_t high_limb, low_limb; + unsigned sh_1, sh_2; + mpi_size_t i; + mpi_limb_t retval; + + sh_1 = cnt; + wp -= 1; + sh_2 = BITS_PER_MPI_LIMB - sh_1; + high_limb = up[0]; + retval = high_limb << sh_2; + low_limb = high_limb; + for( i=1; i < usize; i++) { + high_limb = up[i]; + wp[i] = (low_limb >> sh_1) | (high_limb << sh_2); + low_limb = high_limb; + } + wp[i] = low_limb >> sh_1; + + return retval; +} + diff -uprN linux-2.6.32/crypto/mpi/generic_mpih-sub1.c linux-2.6.32.ovz/crypto/mpi/generic_mpih-sub1.c --- linux-2.6.32/crypto/mpi/generic_mpih-sub1.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/generic_mpih-sub1.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,62 @@ +/* mpihelp-add_2.c - MPI helper functions + * Copyright (C) 1994, 1996, 1997, 1998, 2001 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + +#include "mpi-internal.h" +#include "longlong.h" + +mpi_limb_t +mpihelp_sub_n( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, + mpi_ptr_t s2_ptr, mpi_size_t size) +{ + mpi_limb_t x, y, cy; + mpi_size_t j; + + /* The loop counter and index J goes from -SIZE to -1. This way + the loop becomes faster. */ + j = -size; + + /* Offset the base pointers to compensate for the negative indices. */ + s1_ptr -= j; + s2_ptr -= j; + res_ptr -= j; + + cy = 0; + do { + y = s2_ptr[j]; + x = s1_ptr[j]; + y += cy; /* add previous carry to subtrahend */ + cy = y < cy; /* get out carry from that addition */ + y = x - y; /* main subtract */ + cy += y > x; /* get out carry from the subtract, combine */ + res_ptr[j] = y; + } while( ++j ); + + return cy; +} + + diff -uprN linux-2.6.32/crypto/mpi/generic_udiv-w-sdiv.c linux-2.6.32.ovz/crypto/mpi/generic_udiv-w-sdiv.c --- linux-2.6.32/crypto/mpi/generic_udiv-w-sdiv.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/generic_udiv-w-sdiv.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,130 @@ +/* mpihelp_udiv_w_sdiv -- implement udiv_qrnnd on machines with only signed + * division. + * Copyright (C) 1992, 1994, 1996, 1998 Free Software Foundation, Inc. + * Contributed by Peter L. Montgomery. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include "mpi-internal.h" +#include "longlong.h" + + +#if 0 /* not yet ported to MPI */ + +mpi_limb_t +mpihelp_udiv_w_sdiv( mpi_limp_t *rp, + mpi_limp_t *a1, + mpi_limp_t *a0, + mpi_limp_t *d ) +{ + mp_limb_t q, r; + mp_limb_t c0, c1, b1; + + if ((mpi_limb_signed_t) d >= 0) + { + if (a1 < d - a1 - (a0 >> (BITS_PER_MP_LIMB - 1))) + { + /* dividend, divisor, and quotient are nonnegative */ + sdiv_qrnnd (q, r, a1, a0, d); + } + else + { + /* Compute c1*2^32 + c0 = a1*2^32 + a0 - 2^31*d */ + sub_ddmmss (c1, c0, a1, a0, d >> 1, d << (BITS_PER_MP_LIMB - 1)); + /* Divide (c1*2^32 + c0) by d */ + sdiv_qrnnd (q, r, c1, c0, d); + /* Add 2^31 to quotient */ + q += (mp_limb_t) 1 << (BITS_PER_MP_LIMB - 1); + } + } + else + { + b1 = d >> 1; /* d/2, between 2^30 and 2^31 - 1 */ + c1 = a1 >> 1; /* A/2 */ + c0 = (a1 << (BITS_PER_MP_LIMB - 1)) + (a0 >> 1); + + if (a1 < b1) /* A < 2^32*b1, so A/2 < 2^31*b1 */ + { + sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */ + + r = 2*r + (a0 & 1); /* Remainder from A/(2*b1) */ + if ((d & 1) != 0) + { + if (r >= q) + r = r - q; + else if (q - r <= d) + { + r = r - q + d; + q--; + } + else + { + r = r - q + 2*d; + q -= 2; + } + } + } + else if (c1 < b1) /* So 2^31 <= (A/2)/b1 < 2^32 */ + { + c1 = (b1 - 1) - c1; + c0 = ~c0; /* logical NOT */ + + sdiv_qrnnd (q, r, c1, c0, b1); /* (A/2) / (d/2) */ + + q = ~q; /* (A/2)/b1 */ + r = (b1 - 1) - r; + + r = 2*r + (a0 & 1); /* A/(2*b1) */ + + if ((d & 1) != 0) + { + if (r >= q) + r = r - q; + else if (q - r <= d) + { + r = r - q + d; + q--; + } + else + { + r = r - q + 2*d; + q -= 2; + } + } + } + else /* Implies c1 = b1 */ + { /* Hence a1 = d - 1 = 2*b1 - 1 */ + if (a0 >= -d) + { + q = -1; + r = a0 + d; + } + else + { + q = -2; + r = a0 + 2*d; + } + } + } + + *rp = r; + return q; +} + +#endif + diff -uprN linux-2.6.32/crypto/mpi/longlong.h linux-2.6.32.ovz/crypto/mpi/longlong.h --- linux-2.6.32/crypto/mpi/longlong.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/longlong.h 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,1502 @@ +/* longlong.h -- definitions for mixed size 32/64 bit arithmetic. + Note: I added some stuff for use with gnupg + +Copyright (C) 1991, 1992, 1993, 1994, 1996, 1998, + 2000, 2001, 2002, 2003 Free Software Foundation, Inc. + +This file is free software; you can redistribute it and/or modify +it under the terms of the GNU Library General Public License as published by +the Free Software Foundation; either version 2 of the License, or (at your +option) any later version. + +This file is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public +License for more details. + +You should have received a copy of the GNU Library General Public License +along with this file; see the file COPYING.LIB. If not, write to +the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, +MA 02111-1307, USA. */ + +/* You have to define the following before including this file: + + UWtype -- An unsigned type, default type for operations (typically a "word") + UHWtype -- An unsigned type, at least half the size of UWtype. + UDWtype -- An unsigned type, at least twice as large a UWtype + W_TYPE_SIZE -- size in bits of UWtype + + SItype, USItype -- Signed and unsigned 32 bit types. + DItype, UDItype -- Signed and unsigned 64 bit types. + + On a 32 bit machine UWtype should typically be USItype; + on a 64 bit machine, UWtype should typically be UDItype. +*/ + +#define __BITS4 (W_TYPE_SIZE / 4) +#define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2)) +#define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1)) +#define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2)) + +/* This is used to make sure no undesirable sharing between different libraries + that use this file takes place. */ +#ifndef __MPN +#define __MPN(x) __##x +#endif + +/* Define auxiliary asm macros. + + 1) umul_ppmm(high_prod, low_prod, multipler, multiplicand) multiplies two + UWtype integers MULTIPLER and MULTIPLICAND, and generates a two UWtype + word product in HIGH_PROD and LOW_PROD. + + 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a + UDWtype product. This is just a variant of umul_ppmm. + + 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator, + denominator) divides a UDWtype, composed by the UWtype integers + HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient + in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less + than DENOMINATOR for correct operation. If, in addition, the most + significant bit of DENOMINATOR must be 1, then the pre-processor symbol + UDIV_NEEDS_NORMALIZATION is defined to 1. + + 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator, + denominator). Like udiv_qrnnd but the numbers are signed. The quotient + is rounded towards 0. + + 5) count_leading_zeros(count, x) counts the number of zero-bits from the + msb to the first non-zero bit in the UWtype X. This is the number of + steps X needs to be shifted left to set the msb. Undefined for X == 0, + unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value. + + 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts + from the least significant end. + + 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1, + high_addend_2, low_addend_2) adds two UWtype integers, composed by + HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2 + respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow + (i.e. carry out) is not stored anywhere, and is lost. + + 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend, + high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers, + composed by HIGH_MINUEND_1 and LOW_MINUEND_1, and HIGH_SUBTRAHEND_2 and + LOW_SUBTRAHEND_2 respectively. The result is placed in HIGH_DIFFERENCE + and LOW_DIFFERENCE. Overflow (i.e. carry out) is not stored anywhere, + and is lost. + + If any of these macros are left undefined for a particular CPU, + C macros are used. */ + +/* The CPUs come in alphabetical order below. + + Please add support for more CPUs here, or improve the current support + for the CPUs below! */ + +#if defined (__GNUC__) && !defined (NO_ASM) + +/* We sometimes need to clobber "cc" with gcc2, but that would not be + understood by gcc1. Use cpp to avoid major code duplication. */ +#if __GNUC__ < 2 +#define __CLOBBER_CC +#define __AND_CLOBBER_CC +#else /* __GNUC__ >= 2 */ +#define __CLOBBER_CC : "cc" +#define __AND_CLOBBER_CC , "cc" +#endif /* __GNUC__ < 2 */ + + +/*************************************** + ************** A29K ***************** + ***************************************/ +#if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("add %1,%4,%5\n" \ + "addc %0,%2,%3" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "%r" ((USItype)(ah)), \ + "rI" ((USItype)(bh)), \ + "%r" ((USItype)(al)), \ + "rI" ((USItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("sub %1,%4,%5\n" \ + "subc %0,%2,%3" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "r" ((USItype)(ah)), \ + "rI" ((USItype)(bh)), \ + "r" ((USItype)(al)), \ + "rI" ((USItype)(bl))) +#define umul_ppmm(xh, xl, m0, m1) \ + do { \ + USItype __m0 = (m0), __m1 = (m1); \ + __asm__ ("multiplu %0,%1,%2" \ + : "=r" ((USItype)(xl)) \ + : "r" (__m0), \ + "r" (__m1)); \ + __asm__ ("multmu %0,%1,%2" \ + : "=r" ((USItype)(xh)) \ + : "r" (__m0), \ + "r" (__m1)); \ + } while (0) +#define udiv_qrnnd(q, r, n1, n0, d) \ + __asm__ ("dividu %0,%3,%4" \ + : "=r" ((USItype)(q)), \ + "=q" ((USItype)(r)) \ + : "1" ((USItype)(n1)), \ + "r" ((USItype)(n0)), \ + "r" ((USItype)(d))) + +#define count_leading_zeros(count, x) \ + __asm__ ("clz %0,%1" \ + : "=r" ((USItype)(count)) \ + : "r" ((USItype)(x))) +#define COUNT_LEADING_ZEROS_0 32 +#endif /* __a29k__ */ + + +#if defined (__alpha) && W_TYPE_SIZE == 64 +#define umul_ppmm(ph, pl, m0, m1) \ + do { \ + UDItype __m0 = (m0), __m1 = (m1); \ + __asm__ ("umulh %r1,%2,%0" \ + : "=r" ((UDItype) ph) \ + : "%rJ" (__m0), \ + "rI" (__m1)); \ + (pl) = __m0 * __m1; \ + } while (0) +#define UMUL_TIME 46 +#ifndef LONGLONG_STANDALONE +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { UDItype __r; \ + (q) = __udiv_qrnnd (&__r, (n1), (n0), (d)); \ + (r) = __r; \ + } while (0) +extern UDItype __udiv_qrnnd (); +#define UDIV_TIME 220 +#endif /* LONGLONG_STANDALONE */ +#endif /* __alpha */ + +/*************************************** + ************** ARM ****************** + ***************************************/ +#if defined (__arm__) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("adds %1, %4, %5\n" \ + "adc %0, %2, %3" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "%r" ((USItype)(ah)), \ + "rI" ((USItype)(bh)), \ + "%r" ((USItype)(al)), \ + "rI" ((USItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("subs %1, %4, %5\n" \ + "sbc %0, %2, %3" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "r" ((USItype)(ah)), \ + "rI" ((USItype)(bh)), \ + "r" ((USItype)(al)), \ + "rI" ((USItype)(bl))) +#if defined __ARM_ARCH_2__ || defined __ARM_ARCH_3__ +#define umul_ppmm(xh, xl, a, b) \ + __asm__ ("%@ Inlined umul_ppmm\n" \ + "mov %|r0, %2, lsr #16 @ AAAA\n" \ + "mov %|r2, %3, lsr #16 @ BBBB\n" \ + "bic %|r1, %2, %|r0, lsl #16 @ aaaa\n" \ + "bic %0, %3, %|r2, lsl #16 @ bbbb\n" \ + "mul %1, %|r1, %|r2 @ aaaa * BBBB\n" \ + "mul %|r2, %|r0, %|r2 @ AAAA * BBBB\n" \ + "mul %|r1, %0, %|r1 @ aaaa * bbbb\n" \ + "mul %0, %|r0, %0 @ AAAA * bbbb\n" \ + "adds %|r0, %1, %0 @ central sum\n" \ + "addcs %|r2, %|r2, #65536\n" \ + "adds %1, %|r1, %|r0, lsl #16\n" \ + "adc %0, %|r2, %|r0, lsr #16" \ + : "=&r" ((USItype)(xh)), \ + "=r" ((USItype)(xl)) \ + : "r" ((USItype)(a)), \ + "r" ((USItype)(b)) \ + : "r0", "r1", "r2") +#else +#define umul_ppmm(xh, xl, a, b) \ + __asm__ ("%@ Inlined umul_ppmm\n" \ + "umull %r1, %r0, %r2, %r3" \ + : "=&r" ((USItype)(xh)), \ + "=r" ((USItype)(xl)) \ + : "r" ((USItype)(a)), \ + "r" ((USItype)(b)) \ + : "r0", "r1") +#endif +#define UMUL_TIME 20 +#define UDIV_TIME 100 +#endif /* __arm__ */ + +/*************************************** + ************** CLIPPER ************** + ***************************************/ +#if defined (__clipper__) && W_TYPE_SIZE == 32 +#define umul_ppmm(w1, w0, u, v) \ + ({union {UDItype __ll; \ + struct {USItype __l, __h;} __i; \ + } __xx; \ + __asm__ ("mulwux %2,%0" \ + : "=r" (__xx.__ll) \ + : "%0" ((USItype)(u)), \ + "r" ((USItype)(v))); \ + (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;}) +#define smul_ppmm(w1, w0, u, v) \ + ({union {DItype __ll; \ + struct {SItype __l, __h;} __i; \ + } __xx; \ + __asm__ ("mulwx %2,%0" \ + : "=r" (__xx.__ll) \ + : "%0" ((SItype)(u)), \ + "r" ((SItype)(v))); \ + (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;}) +#define __umulsidi3(u, v) \ + ({UDItype __w; \ + __asm__ ("mulwux %2,%0" \ + : "=r" (__w) \ + : "%0" ((USItype)(u)), \ + "r" ((USItype)(v))); \ + __w; }) +#endif /* __clipper__ */ + + +/*************************************** + ************** GMICRO *************** + ***************************************/ +#if defined (__gmicro__) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("add.w %5,%1\n" \ + "addx %3,%0" \ + : "=g" ((USItype)(sh)), \ + "=&g" ((USItype)(sl)) \ + : "%0" ((USItype)(ah)), \ + "g" ((USItype)(bh)), \ + "%1" ((USItype)(al)), \ + "g" ((USItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("sub.w %5,%1\n" \ + "subx %3,%0" \ + : "=g" ((USItype)(sh)), \ + "=&g" ((USItype)(sl)) \ + : "0" ((USItype)(ah)), \ + "g" ((USItype)(bh)), \ + "1" ((USItype)(al)), \ + "g" ((USItype)(bl))) +#define umul_ppmm(ph, pl, m0, m1) \ + __asm__ ("mulx %3,%0,%1" \ + : "=g" ((USItype)(ph)), \ + "=r" ((USItype)(pl)) \ + : "%0" ((USItype)(m0)), \ + "g" ((USItype)(m1))) +#define udiv_qrnnd(q, r, nh, nl, d) \ + __asm__ ("divx %4,%0,%1" \ + : "=g" ((USItype)(q)), \ + "=r" ((USItype)(r)) \ + : "1" ((USItype)(nh)), \ + "0" ((USItype)(nl)), \ + "g" ((USItype)(d))) +#define count_leading_zeros(count, x) \ + __asm__ ("bsch/1 %1,%0" \ + : "=g" (count) \ + : "g" ((USItype)(x)), \ + "0" ((USItype)0)) +#endif + + +/*************************************** + ************** HPPA ***************** + ***************************************/ +#if defined (__hppa) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ (" add %4,%5,%1\n" \ + " addc %2,%3,%0" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "%rM" ((USItype)(ah)), \ + "rM" ((USItype)(bh)), \ + "%rM" ((USItype)(al)), \ + "rM" ((USItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ (" sub %4,%5,%1\n" \ + " subb %2,%3,%0" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "rM" ((USItype)(ah)), \ + "rM" ((USItype)(bh)), \ + "rM" ((USItype)(al)), \ + "rM" ((USItype)(bl))) +#if defined (_PA_RISC1_1) +#define umul_ppmm(wh, wl, u, v) \ + do { \ + union {UDItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __xx; \ + __asm__ (" xmpyu %1,%2,%0" \ + : "=*f" (__xx.__ll) \ + : "*f" ((USItype)(u)), \ + "*f" ((USItype)(v))); \ + (wh) = __xx.__i.__h; \ + (wl) = __xx.__i.__l; \ + } while (0) +#define UMUL_TIME 8 +#define UDIV_TIME 60 +#else +#define UMUL_TIME 40 +#define UDIV_TIME 80 +#endif +#ifndef LONGLONG_STANDALONE +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { USItype __r; \ + (q) = __udiv_qrnnd (&__r, (n1), (n0), (d)); \ + (r) = __r; \ + } while (0) +extern USItype __udiv_qrnnd (); +#endif /* LONGLONG_STANDALONE */ +#define count_leading_zeros(count, x) \ + do { \ + USItype __tmp; \ + __asm__ ( \ + " ldi 1,%0 \n" \ + " extru,= %1,15,16,%%r0 ; Bits 31..16 zero? \n" \ + " extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \ + " ldo 16(%0),%0 ; Yes. Perform add. \n" \ + " extru,= %1,23,8,%%r0 ; Bits 15..8 zero? \n" \ + " extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \ + " ldo 8(%0),%0 ; Yes. Perform add. \n" \ + " extru,= %1,27,4,%%r0 ; Bits 7..4 zero? \n" \ + " extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \ + " ldo 4(%0),%0 ; Yes. Perform add. \n" \ + " extru,= %1,29,2,%%r0 ; Bits 3..2 zero? \n" \ + " extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \ + " ldo 2(%0),%0 ; Yes. Perform add. \n" \ + " extru %1,30,1,%1 ; Extract bit 1. \n" \ + " sub %0,%1,%0 ; Subtract it. " \ + : "=r" (count), "=r" (__tmp) : "1" (x)); \ + } while (0) +#endif /* hppa */ + + +/*************************************** + ************** I370 ***************** + ***************************************/ +#if (defined (__i370__) || defined (__mvs__)) && W_TYPE_SIZE == 32 +#define umul_ppmm(xh, xl, m0, m1) \ + do { \ + union {UDItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __xx; \ + USItype __m0 = (m0), __m1 = (m1); \ + __asm__ ("mr %0,%3" \ + : "=r" (__xx.__i.__h), \ + "=r" (__xx.__i.__l) \ + : "%1" (__m0), \ + "r" (__m1)); \ + (xh) = __xx.__i.__h; (xl) = __xx.__i.__l; \ + (xh) += ((((SItype) __m0 >> 31) & __m1) \ + + (((SItype) __m1 >> 31) & __m0)); \ + } while (0) +#define smul_ppmm(xh, xl, m0, m1) \ + do { \ + union {DItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __xx; \ + __asm__ ("mr %0,%3" \ + : "=r" (__xx.__i.__h), \ + "=r" (__xx.__i.__l) \ + : "%1" (m0), \ + "r" (m1)); \ + (xh) = __xx.__i.__h; (xl) = __xx.__i.__l; \ + } while (0) +#define sdiv_qrnnd(q, r, n1, n0, d) \ + do { \ + union {DItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __xx; \ + __xx.__i.__h = n1; __xx.__i.__l = n0; \ + __asm__ ("dr %0,%2" \ + : "=r" (__xx.__ll) \ + : "0" (__xx.__ll), "r" (d)); \ + (q) = __xx.__i.__l; (r) = __xx.__i.__h; \ + } while (0) +#endif + + +/*************************************** + ************** I386 ***************** + ***************************************/ +#undef __i386__ +#if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("addl %5,%1\n" \ + "adcl %3,%0" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "%0" ((USItype)(ah)), \ + "g" ((USItype)(bh)), \ + "%1" ((USItype)(al)), \ + "g" ((USItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("subl %5,%1\n" \ + "sbbl %3,%0" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "0" ((USItype)(ah)), \ + "g" ((USItype)(bh)), \ + "1" ((USItype)(al)), \ + "g" ((USItype)(bl))) +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("mull %3" \ + : "=a" ((USItype)(w0)), \ + "=d" ((USItype)(w1)) \ + : "%0" ((USItype)(u)), \ + "rm" ((USItype)(v))) +#define udiv_qrnnd(q, r, n1, n0, d) \ + __asm__ ("divl %4" \ + : "=a" ((USItype)(q)), \ + "=d" ((USItype)(r)) \ + : "0" ((USItype)(n0)), \ + "1" ((USItype)(n1)), \ + "rm" ((USItype)(d))) +#define count_leading_zeros(count, x) \ + do { \ + USItype __cbtmp; \ + __asm__ ("bsrl %1,%0" \ + : "=r" (__cbtmp) : "rm" ((USItype)(x))); \ + (count) = __cbtmp ^ 31; \ + } while (0) +#define count_trailing_zeros(count, x) \ + __asm__ ("bsfl %1,%0" : "=r" (count) : "rm" ((USItype)(x))) +#ifndef UMUL_TIME +#define UMUL_TIME 40 +#endif +#ifndef UDIV_TIME +#define UDIV_TIME 40 +#endif +#endif /* 80x86 */ + + +/*************************************** + ************** I860 ***************** + ***************************************/ +#if defined (__i860__) && W_TYPE_SIZE == 32 +#define rshift_rhlc(r,h,l,c) \ + __asm__ ("shr %3,r0,r0\n" \ + "shrd %1,%2,%0" \ + "=r" (r) : "r" (h), "r" (l), "rn" (c)) +#endif /* i860 */ + +/*************************************** + ************** I960 ***************** + ***************************************/ +#if defined (__i960__) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("cmpo 1,0\n" \ + "addc %5,%4,%1\n" \ + "addc %3,%2,%0" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "%dI" ((USItype)(ah)), \ + "dI" ((USItype)(bh)), \ + "%dI" ((USItype)(al)), \ + "dI" ((USItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("cmpo 0,0\n" \ + "subc %5,%4,%1\n" \ + "subc %3,%2,%0" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "dI" ((USItype)(ah)), \ + "dI" ((USItype)(bh)), \ + "dI" ((USItype)(al)), \ + "dI" ((USItype)(bl))) +#define umul_ppmm(w1, w0, u, v) \ + ({union {UDItype __ll; \ + struct {USItype __l, __h;} __i; \ + } __xx; \ + __asm__ ("emul %2,%1,%0" \ + : "=d" (__xx.__ll) \ + : "%dI" ((USItype)(u)), \ + "dI" ((USItype)(v))); \ + (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;}) +#define __umulsidi3(u, v) \ + ({UDItype __w; \ + __asm__ ("emul %2,%1,%0" \ + : "=d" (__w) \ + : "%dI" ((USItype)(u)), \ + "dI" ((USItype)(v))); \ + __w; }) +#define udiv_qrnnd(q, r, nh, nl, d) \ + do { \ + union {UDItype __ll; \ + struct {USItype __l, __h;} __i; \ + } __nn; \ + __nn.__i.__h = (nh); __nn.__i.__l = (nl); \ + __asm__ ("ediv %d,%n,%0" \ + : "=d" (__rq.__ll) \ + : "dI" (__nn.__ll), \ + "dI" ((USItype)(d))); \ + (r) = __rq.__i.__l; (q) = __rq.__i.__h; \ + } while (0) +#define count_leading_zeros(count, x) \ + do { \ + USItype __cbtmp; \ + __asm__ ("scanbit %1,%0" \ + : "=r" (__cbtmp) \ + : "r" ((USItype)(x))); \ + (count) = __cbtmp ^ 31; \ + } while (0) +#define COUNT_LEADING_ZEROS_0 (-32) /* sic */ +#if defined (__i960mx) /* what is the proper symbol to test??? */ +#define rshift_rhlc(r,h,l,c) \ + do { \ + union {UDItype __ll; \ + struct {USItype __l, __h;} __i; \ + } __nn; \ + __nn.__i.__h = (h); __nn.__i.__l = (l); \ + __asm__ ("shre %2,%1,%0" \ + : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \ + } +#endif /* i960mx */ +#endif /* i960 */ + + +/*************************************** + ************** 68000 **************** + ***************************************/ +#if (defined (__mc68000__) || defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("add%.l %5,%1\n" \ + "addx%.l %3,%0" \ + : "=d" ((USItype)(sh)), \ + "=&d" ((USItype)(sl)) \ + : "%0" ((USItype)(ah)), \ + "d" ((USItype)(bh)), \ + "%1" ((USItype)(al)), \ + "g" ((USItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("sub%.l %5,%1\n" \ + "subx%.l %3,%0" \ + : "=d" ((USItype)(sh)), \ + "=&d" ((USItype)(sl)) \ + : "0" ((USItype)(ah)), \ + "d" ((USItype)(bh)), \ + "1" ((USItype)(al)), \ + "g" ((USItype)(bl))) +#if (defined (__mc68020__) || defined (__NeXT__) || defined(mc68020)) +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("mulu%.l %3,%1:%0" \ + : "=d" ((USItype)(w0)), \ + "=d" ((USItype)(w1)) \ + : "%0" ((USItype)(u)), \ + "dmi" ((USItype)(v))) +#define UMUL_TIME 45 +#define udiv_qrnnd(q, r, n1, n0, d) \ + __asm__ ("divu%.l %4,%1:%0" \ + : "=d" ((USItype)(q)), \ + "=d" ((USItype)(r)) \ + : "0" ((USItype)(n0)), \ + "1" ((USItype)(n1)), \ + "dmi" ((USItype)(d))) +#define UDIV_TIME 90 +#define sdiv_qrnnd(q, r, n1, n0, d) \ + __asm__ ("divs%.l %4,%1:%0" \ + : "=d" ((USItype)(q)), \ + "=d" ((USItype)(r)) \ + : "0" ((USItype)(n0)), \ + "1" ((USItype)(n1)), \ + "dmi" ((USItype)(d))) +#define count_leading_zeros(count, x) \ + __asm__ ("bfffo %1{%b2:%b2},%0" \ + : "=d" ((USItype)(count)) \ + : "od" ((USItype)(x)), "n" (0)) +#define COUNT_LEADING_ZEROS_0 32 +#else /* not mc68020 */ +#define umul_ppmm(xh, xl, a, b) \ + do { USItype __umul_tmp1, __umul_tmp2; \ + __asm__ ("| Inlined umul_ppmm \n" \ + " move%.l %5,%3 \n" \ + " move%.l %2,%0 \n" \ + " move%.w %3,%1 \n" \ + " swap %3 \n" \ + " swap %0 \n" \ + " mulu %2,%1 \n" \ + " mulu %3,%0 \n" \ + " mulu %2,%3 \n" \ + " swap %2 \n" \ + " mulu %5,%2 \n" \ + " add%.l %3,%2 \n" \ + " jcc 1f \n" \ + " add%.l %#0x10000,%0 \n" \ + "1: move%.l %2,%3 \n" \ + " clr%.w %2 \n" \ + " swap %2 \n" \ + " swap %3 \n" \ + " clr%.w %3 \n" \ + " add%.l %3,%1 \n" \ + " addx%.l %2,%0 \n" \ + " | End inlined umul_ppmm" \ + : "=&d" ((USItype)(xh)), "=&d" ((USItype)(xl)), \ + "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \ + : "%2" ((USItype)(a)), "d" ((USItype)(b))); \ + } while (0) +#define UMUL_TIME 100 +#define UDIV_TIME 400 +#endif /* not mc68020 */ +#endif /* mc68000 */ + + +/*************************************** + ************** 88000 **************** + ***************************************/ +#if defined (__m88000__) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("addu.co %1,%r4,%r5\n" \ + "addu.ci %0,%r2,%r3" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "%rJ" ((USItype)(ah)), \ + "rJ" ((USItype)(bh)), \ + "%rJ" ((USItype)(al)), \ + "rJ" ((USItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("subu.co %1,%r4,%r5\n" \ + "subu.ci %0,%r2,%r3" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "rJ" ((USItype)(ah)), \ + "rJ" ((USItype)(bh)), \ + "rJ" ((USItype)(al)), \ + "rJ" ((USItype)(bl))) +#define count_leading_zeros(count, x) \ + do { \ + USItype __cbtmp; \ + __asm__ ("ff1 %0,%1" \ + : "=r" (__cbtmp) \ + : "r" ((USItype)(x))); \ + (count) = __cbtmp ^ 31; \ + } while (0) +#define COUNT_LEADING_ZEROS_0 63 /* sic */ +#if defined (__m88110__) +#define umul_ppmm(wh, wl, u, v) \ + do { \ + union {UDItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __x; \ + __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \ + (wh) = __x.__i.__h; \ + (wl) = __x.__i.__l; \ + } while (0) +#define udiv_qrnnd(q, r, n1, n0, d) \ + ({union {UDItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __x, __q; \ + __x.__i.__h = (n1); __x.__i.__l = (n0); \ + __asm__ ("divu.d %0,%1,%2" \ + : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \ + (r) = (n0) - __q.__l * (d); (q) = __q.__l; }) +#define UMUL_TIME 5 +#define UDIV_TIME 25 +#else +#define UMUL_TIME 17 +#define UDIV_TIME 150 +#endif /* __m88110__ */ +#endif /* __m88000__ */ + +/*************************************** + ************** MIPS ***************** + ***************************************/ +#if defined (__mips__) && W_TYPE_SIZE == 32 +#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7 +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("multu %2,%3" \ + : "=l" ((USItype)(w0)), \ + "=h" ((USItype)(w1)) \ + : "d" ((USItype)(u)), \ + "d" ((USItype)(v))) +#else +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("multu %2,%3 \n" \ + "mflo %0 \n" \ + "mfhi %1" \ + : "=d" ((USItype)(w0)), \ + "=d" ((USItype)(w1)) \ + : "d" ((USItype)(u)), \ + "d" ((USItype)(v))) +#endif +#define UMUL_TIME 10 +#define UDIV_TIME 100 +#endif /* __mips__ */ + +/*************************************** + ************** MIPS/64 ************** + ***************************************/ +#if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64 +#if __GNUC__ > 2 || __GNUC_MINOR__ >= 7 +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("dmultu %2,%3" \ + : "=l" ((UDItype)(w0)), \ + "=h" ((UDItype)(w1)) \ + : "d" ((UDItype)(u)), \ + "d" ((UDItype)(v))) +#else +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("dmultu %2,%3 \n" \ + "mflo %0 \n" \ + "mfhi %1" \ + : "=d" ((UDItype)(w0)), \ + "=d" ((UDItype)(w1)) \ + : "d" ((UDItype)(u)), \ + "d" ((UDItype)(v))) +#endif +#define UMUL_TIME 20 +#define UDIV_TIME 140 +#endif /* __mips__ */ + + +/*************************************** + ************** 32000 **************** + ***************************************/ +#if defined (__ns32000__) && W_TYPE_SIZE == 32 +#define umul_ppmm(w1, w0, u, v) \ + ({union {UDItype __ll; \ + struct {USItype __l, __h;} __i; \ + } __xx; \ + __asm__ ("meid %2,%0" \ + : "=g" (__xx.__ll) \ + : "%0" ((USItype)(u)), \ + "g" ((USItype)(v))); \ + (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;}) +#define __umulsidi3(u, v) \ + ({UDItype __w; \ + __asm__ ("meid %2,%0" \ + : "=g" (__w) \ + : "%0" ((USItype)(u)), \ + "g" ((USItype)(v))); \ + __w; }) +#define udiv_qrnnd(q, r, n1, n0, d) \ + ({union {UDItype __ll; \ + struct {USItype __l, __h;} __i; \ + } __xx; \ + __xx.__i.__h = (n1); __xx.__i.__l = (n0); \ + __asm__ ("deid %2,%0" \ + : "=g" (__xx.__ll) \ + : "0" (__xx.__ll), \ + "g" ((USItype)(d))); \ + (r) = __xx.__i.__l; (q) = __xx.__i.__h; }) +#define count_trailing_zeros(count,x) \ + do { + __asm__ ("ffsd %2,%0" \ + : "=r" ((USItype) (count)) \ + : "0" ((USItype) 0), \ + "r" ((USItype) (x))); \ + } while (0) +#endif /* __ns32000__ */ + + +/*************************************** + ************** PPC ****************** + ***************************************/ +#if (defined (_ARCH_PPC) || defined (_IBMR2)) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + do { \ + if (__builtin_constant_p (bh) && (bh) == 0) \ + __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{aze|addze} %0,%2" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "%r" ((USItype)(ah)), \ + "%r" ((USItype)(al)), \ + "rI" ((USItype)(bl))); \ + else if (__builtin_constant_p (bh) && (bh) ==~(USItype) 0) \ + __asm__ ("{a%I4|add%I4c} %1,%3,%4\n\t{ame|addme} %0,%2" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "%r" ((USItype)(ah)), \ + "%r" ((USItype)(al)), \ + "rI" ((USItype)(bl))); \ + else \ + __asm__ ("{a%I5|add%I5c} %1,%4,%5\n\t{ae|adde} %0,%2,%3" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "%r" ((USItype)(ah)), \ + "r" ((USItype)(bh)), \ + "%r" ((USItype)(al)), \ + "rI" ((USItype)(bl))); \ + } while (0) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + do { \ + if (__builtin_constant_p (ah) && (ah) == 0) \ + __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfze|subfze} %0,%2" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "r" ((USItype)(bh)), \ + "rI" ((USItype)(al)), \ + "r" ((USItype)(bl))); \ + else if (__builtin_constant_p (ah) && (ah) ==~(USItype) 0) \ + __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{sfme|subfme} %0,%2" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "r" ((USItype)(bh)), \ + "rI" ((USItype)(al)), \ + "r" ((USItype)(bl))); \ + else if (__builtin_constant_p (bh) && (bh) == 0) \ + __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{ame|addme} %0,%2" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "r" ((USItype)(ah)), \ + "rI" ((USItype)(al)), \ + "r" ((USItype)(bl))); \ + else if (__builtin_constant_p (bh) && (bh) ==~(USItype) 0) \ + __asm__ ("{sf%I3|subf%I3c} %1,%4,%3\n\t{aze|addze} %0,%2" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "r" ((USItype)(ah)), \ + "rI" ((USItype)(al)), \ + "r" ((USItype)(bl))); \ + else \ + __asm__ ("{sf%I4|subf%I4c} %1,%5,%4\n\t{sfe|subfe} %0,%3,%2" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "r" ((USItype)(ah)), \ + "r" ((USItype)(bh)), \ + "rI" ((USItype)(al)), \ + "r" ((USItype)(bl))); \ + } while (0) +#define count_leading_zeros(count, x) \ + __asm__ ("{cntlz|cntlzw} %0,%1" \ + : "=r" ((USItype)(count)) \ + : "r" ((USItype)(x))) +#define COUNT_LEADING_ZEROS_0 32 +#if defined (_ARCH_PPC) +#define umul_ppmm(ph, pl, m0, m1) \ + do { \ + USItype __m0 = (m0), __m1 = (m1); \ + __asm__ ("mulhwu %0,%1,%2" \ + : "=r" ((USItype) ph) \ + : "%r" (__m0), \ + "r" (__m1)); \ + (pl) = __m0 * __m1; \ + } while (0) +#define UMUL_TIME 15 +#define smul_ppmm(ph, pl, m0, m1) \ + do { \ + SItype __m0 = (m0), __m1 = (m1); \ + __asm__ ("mulhw %0,%1,%2" \ + : "=r" ((SItype) ph) \ + : "%r" (__m0), \ + "r" (__m1)); \ + (pl) = __m0 * __m1; \ + } while (0) +#define SMUL_TIME 14 +#define UDIV_TIME 120 +#else +#define umul_ppmm(xh, xl, m0, m1) \ + do { \ + USItype __m0 = (m0), __m1 = (m1); \ + __asm__ ("mul %0,%2,%3" \ + : "=r" ((USItype)(xh)), \ + "=q" ((USItype)(xl)) \ + : "r" (__m0), \ + "r" (__m1)); \ + (xh) += ((((SItype) __m0 >> 31) & __m1) \ + + (((SItype) __m1 >> 31) & __m0)); \ + } while (0) +#define UMUL_TIME 8 +#define smul_ppmm(xh, xl, m0, m1) \ + __asm__ ("mul %0,%2,%3" \ + : "=r" ((SItype)(xh)), \ + "=q" ((SItype)(xl)) \ + : "r" (m0), \ + "r" (m1)) +#define SMUL_TIME 4 +#define sdiv_qrnnd(q, r, nh, nl, d) \ + __asm__ ("div %0,%2,%4" \ + : "=r" ((SItype)(q)), "=q" ((SItype)(r)) \ + : "r" ((SItype)(nh)), "1" ((SItype)(nl)), "r" ((SItype)(d))) +#define UDIV_TIME 100 +#endif +#endif /* Power architecture variants. */ + + +/*************************************** + ************** PYR ****************** + ***************************************/ +#if defined (__pyr__) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("addw %5,%1 \n" \ + "addwc %3,%0" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "%0" ((USItype)(ah)), \ + "g" ((USItype)(bh)), \ + "%1" ((USItype)(al)), \ + "g" ((USItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("subw %5,%1 \n" \ + "subwb %3,%0" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "0" ((USItype)(ah)), \ + "g" ((USItype)(bh)), \ + "1" ((USItype)(al)), \ + "g" ((USItype)(bl))) +/* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */ +#define umul_ppmm(w1, w0, u, v) \ + ({union {UDItype __ll; \ + struct {USItype __h, __l;} __i; \ + } __xx; \ + __asm__ ("movw %1,%R0 \n" \ + "uemul %2,%0" \ + : "=&r" (__xx.__ll) \ + : "g" ((USItype) (u)), \ + "g" ((USItype)(v))); \ + (w1) = __xx.__i.__h; (w0) = __xx.__i.__l;}) +#endif /* __pyr__ */ + + +/*************************************** + ************** RT/ROMP ************** + ***************************************/ +#if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("a %1,%5 \n" \ + "ae %0,%3" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "%0" ((USItype)(ah)), \ + "r" ((USItype)(bh)), \ + "%1" ((USItype)(al)), \ + "r" ((USItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("s %1,%5\n" \ + "se %0,%3" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "0" ((USItype)(ah)), \ + "r" ((USItype)(bh)), \ + "1" ((USItype)(al)), \ + "r" ((USItype)(bl))) +#define umul_ppmm(ph, pl, m0, m1) \ + do { \ + USItype __m0 = (m0), __m1 = (m1); \ + __asm__ ( \ + "s r2,r2 \n" \ + "mts r10,%2 \n" \ + "m r2,%3 \n" \ + "m r2,%3 \n" \ + "m r2,%3 \n" \ + "m r2,%3 \n" \ + "m r2,%3 \n" \ + "m r2,%3 \n" \ + "m r2,%3 \n" \ + "m r2,%3 \n" \ + "m r2,%3 \n" \ + "m r2,%3 \n" \ + "m r2,%3 \n" \ + "m r2,%3 \n" \ + "m r2,%3 \n" \ + "m r2,%3 \n" \ + "m r2,%3 \n" \ + "m r2,%3 \n" \ + "cas %0,r2,r0 \n" \ + "mfs r10,%1" \ + : "=r" ((USItype)(ph)), \ + "=r" ((USItype)(pl)) \ + : "%r" (__m0), \ + "r" (__m1) \ + : "r2"); \ + (ph) += ((((SItype) __m0 >> 31) & __m1) \ + + (((SItype) __m1 >> 31) & __m0)); \ + } while (0) +#define UMUL_TIME 20 +#define UDIV_TIME 200 +#define count_leading_zeros(count, x) \ + do { \ + if ((x) >= 0x10000) \ + __asm__ ("clz %0,%1" \ + : "=r" ((USItype)(count)) \ + : "r" ((USItype)(x) >> 16)); \ + else \ + { \ + __asm__ ("clz %0,%1" \ + : "=r" ((USItype)(count)) \ + : "r" ((USItype)(x))); \ + (count) += 16; \ + } \ + } while (0) +#endif /* RT/ROMP */ + + +/*************************************** + ************** SH2 ****************** + ***************************************/ +#if (defined (__sh2__) || defined(__sh3__) || defined(__SH4__) ) \ + && W_TYPE_SIZE == 32 +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ( \ + "dmulu.l %2,%3\n" \ + "sts macl,%1\n" \ + "sts mach,%0" \ + : "=r" ((USItype)(w1)), \ + "=r" ((USItype)(w0)) \ + : "r" ((USItype)(u)), \ + "r" ((USItype)(v)) \ + : "macl", "mach") +#define UMUL_TIME 5 +#endif + +/*************************************** + ************** SPARC **************** + ***************************************/ +#if defined (__sparc__) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("addcc %r4,%5,%1\n" \ + "addx %r2,%3,%0" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "%rJ" ((USItype)(ah)), \ + "rI" ((USItype)(bh)), \ + "%rJ" ((USItype)(al)), \ + "rI" ((USItype)(bl)) \ + __CLOBBER_CC) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("subcc %r4,%5,%1\n" \ + "subx %r2,%3,%0" \ + : "=r" ((USItype)(sh)), \ + "=&r" ((USItype)(sl)) \ + : "rJ" ((USItype)(ah)), \ + "rI" ((USItype)(bh)), \ + "rJ" ((USItype)(al)), \ + "rI" ((USItype)(bl)) \ + __CLOBBER_CC) +#if defined (__sparc_v8__) +/* Don't match immediate range because, 1) it is not often useful, + 2) the 'I' flag thinks of the range as a 13 bit signed interval, + while we want to match a 13 bit interval, sign extended to 32 bits, + but INTERPRETED AS UNSIGNED. */ +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("umul %2,%3,%1;rd %%y,%0" \ + : "=r" ((USItype)(w1)), \ + "=r" ((USItype)(w0)) \ + : "r" ((USItype)(u)), \ + "r" ((USItype)(v))) +#define UMUL_TIME 5 +#ifndef SUPERSPARC /* SuperSPARC's udiv only handles 53 bit dividends */ +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { \ + USItype __q; \ + __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \ + : "=r" ((USItype)(__q)) \ + : "r" ((USItype)(n1)), \ + "r" ((USItype)(n0)), \ + "r" ((USItype)(d))); \ + (r) = (n0) - __q * (d); \ + (q) = __q; \ + } while (0) +#define UDIV_TIME 25 +#endif /* SUPERSPARC */ +#else /* ! __sparc_v8__ */ +#if defined (__sparclite__) +/* This has hardware multiply but not divide. It also has two additional + instructions scan (ffs from high bit) and divscc. */ +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("umul %2,%3,%1;rd %%y,%0" \ + : "=r" ((USItype)(w1)), \ + "=r" ((USItype)(w0)) \ + : "r" ((USItype)(u)), \ + "r" ((USItype)(v))) +#define UMUL_TIME 5 +#define udiv_qrnnd(q, r, n1, n0, d) \ + __asm__ ("! Inlined udiv_qrnnd \n" \ + " wr %%g0,%2,%%y ! Not a delayed write for sparclite \n" \ + " tst %%g0 \n" \ + " divscc %3,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%%g1 \n" \ + " divscc %%g1,%4,%0 \n" \ + " rd %%y,%1 \n" \ + " bl,a 1f \n" \ + " add %1,%4,%1 \n" \ + "1: ! End of inline udiv_qrnnd" \ + : "=r" ((USItype)(q)), \ + "=r" ((USItype)(r)) \ + : "r" ((USItype)(n1)), \ + "r" ((USItype)(n0)), \ + "rI" ((USItype)(d)) \ + : "%g1" __AND_CLOBBER_CC) +#define UDIV_TIME 37 +#define count_leading_zeros(count, x) \ + __asm__ ("scan %1,0,%0" \ + : "=r" ((USItype)(x)) \ + : "r" ((USItype)(count))) +/* Early sparclites return 63 for an argument of 0, but they warn that future + implementations might change this. Therefore, leave COUNT_LEADING_ZEROS_0 + undefined. */ +#endif /* __sparclite__ */ +#endif /* __sparc_v8__ */ +/* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */ +#ifndef umul_ppmm +#define umul_ppmm(w1, w0, u, v) \ + __asm__ ("! Inlined umul_ppmm \n" \ + " wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr \n" \ + " sra %3,31,%%g2 ! Don't move this insn \n" \ + " and %2,%%g2,%%g2 ! Don't move this insn \n" \ + " andcc %%g0,0,%%g1 ! Don't move this insn \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,%3,%%g1 \n" \ + " mulscc %%g1,0,%%g1 \n" \ + " add %%g1,%%g2,%0 \n" \ + " rd %%y,%1" \ + : "=r" ((USItype)(w1)), \ + "=r" ((USItype)(w0)) \ + : "%rI" ((USItype)(u)), \ + "r" ((USItype)(v)) \ + : "%g1", "%g2" __AND_CLOBBER_CC) +#define UMUL_TIME 39 /* 39 instructions */ +#endif +#ifndef udiv_qrnnd +#ifndef LONGLONG_STANDALONE +#define udiv_qrnnd(q, r, n1, n0, d) \ + do { USItype __r; \ + (q) = __udiv_qrnnd (&__r, (n1), (n0), (d)); \ + (r) = __r; \ + } while (0) +extern USItype __udiv_qrnnd (); +#define UDIV_TIME 140 +#endif /* LONGLONG_STANDALONE */ +#endif /* udiv_qrnnd */ +#endif /* __sparc__ */ + + +/*************************************** + ************** VAX ****************** + ***************************************/ +#if defined (__vax__) && W_TYPE_SIZE == 32 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("addl2 %5,%1\n" \ + "adwc %3,%0" \ + : "=g" ((USItype)(sh)), \ + "=&g" ((USItype)(sl)) \ + : "%0" ((USItype)(ah)), \ + "g" ((USItype)(bh)), \ + "%1" ((USItype)(al)), \ + "g" ((USItype)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("subl2 %5,%1\n" \ + "sbwc %3,%0" \ + : "=g" ((USItype)(sh)), \ + "=&g" ((USItype)(sl)) \ + : "0" ((USItype)(ah)), \ + "g" ((USItype)(bh)), \ + "1" ((USItype)(al)), \ + "g" ((USItype)(bl))) +#define umul_ppmm(xh, xl, m0, m1) \ + do { \ + union {UDItype __ll; \ + struct {USItype __l, __h;} __i; \ + } __xx; \ + USItype __m0 = (m0), __m1 = (m1); \ + __asm__ ("emul %1,%2,$0,%0" \ + : "=g" (__xx.__ll) \ + : "g" (__m0), \ + "g" (__m1)); \ + (xh) = __xx.__i.__h; (xl) = __xx.__i.__l; \ + (xh) += ((((SItype) __m0 >> 31) & __m1) \ + + (((SItype) __m1 >> 31) & __m0)); \ + } while (0) +#define sdiv_qrnnd(q, r, n1, n0, d) \ + do { \ + union {DItype __ll; \ + struct {SItype __l, __h;} __i; \ + } __xx; \ + __xx.__i.__h = n1; __xx.__i.__l = n0; \ + __asm__ ("ediv %3,%2,%0,%1" \ + : "=g" (q), "=g" (r) \ + : "g" (__xx.__ll), "g" (d)); \ + } while (0) +#endif /* __vax__ */ + + +/*************************************** + ************** Z8000 **************** + ***************************************/ +#if defined (__z8000__) && W_TYPE_SIZE == 16 +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \ + : "=r" ((unsigned int)(sh)), \ + "=&r" ((unsigned int)(sl)) \ + : "%0" ((unsigned int)(ah)), \ + "r" ((unsigned int)(bh)), \ + "%1" ((unsigned int)(al)), \ + "rQR" ((unsigned int)(bl))) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \ + : "=r" ((unsigned int)(sh)), \ + "=&r" ((unsigned int)(sl)) \ + : "0" ((unsigned int)(ah)), \ + "r" ((unsigned int)(bh)), \ + "1" ((unsigned int)(al)), \ + "rQR" ((unsigned int)(bl))) +#define umul_ppmm(xh, xl, m0, m1) \ + do { \ + union {long int __ll; \ + struct {unsigned int __h, __l;} __i; \ + } __xx; \ + unsigned int __m0 = (m0), __m1 = (m1); \ + __asm__ ("mult %S0,%H3" \ + : "=r" (__xx.__i.__h), \ + "=r" (__xx.__i.__l) \ + : "%1" (__m0), \ + "rQR" (__m1)); \ + (xh) = __xx.__i.__h; (xl) = __xx.__i.__l; \ + (xh) += ((((signed int) __m0 >> 15) & __m1) \ + + (((signed int) __m1 >> 15) & __m0)); \ + } while (0) +#endif /* __z8000__ */ + +#endif /* __GNUC__ */ + + +/*************************************** + *********** Generic Versions ******** + ***************************************/ +#if !defined (umul_ppmm) && defined (__umulsidi3) +#define umul_ppmm(ph, pl, m0, m1) \ + { \ + UDWtype __ll = __umulsidi3 (m0, m1); \ + ph = (UWtype) (__ll >> W_TYPE_SIZE); \ + pl = (UWtype) __ll; \ + } +#endif + +#if !defined (__umulsidi3) +#define __umulsidi3(u, v) \ + ({UWtype __hi, __lo; \ + umul_ppmm (__hi, __lo, u, v); \ + ((UDWtype) __hi << W_TYPE_SIZE) | __lo; }) +#endif + +/* If this machine has no inline assembler, use C macros. */ + +#if !defined (add_ssaaaa) +#define add_ssaaaa(sh, sl, ah, al, bh, bl) \ + do { \ + UWtype __x; \ + __x = (al) + (bl); \ + (sh) = (ah) + (bh) + (__x < (al)); \ + (sl) = __x; \ + } while (0) +#endif + +#if !defined (sub_ddmmss) +#define sub_ddmmss(sh, sl, ah, al, bh, bl) \ + do { \ + UWtype __x; \ + __x = (al) - (bl); \ + (sh) = (ah) - (bh) - (__x > (al)); \ + (sl) = __x; \ + } while (0) +#endif + +#if !defined (umul_ppmm) +#define umul_ppmm(w1, w0, u, v) \ + do { \ + UWtype __x0, __x1, __x2, __x3; \ + UHWtype __ul, __vl, __uh, __vh; \ + UWtype __u = (u), __v = (v); \ + \ + __ul = __ll_lowpart (__u); \ + __uh = __ll_highpart (__u); \ + __vl = __ll_lowpart (__v); \ + __vh = __ll_highpart (__v); \ + \ + __x0 = (UWtype) __ul * __vl; \ + __x1 = (UWtype) __ul * __vh; \ + __x2 = (UWtype) __uh * __vl; \ + __x3 = (UWtype) __uh * __vh; \ + \ + __x1 += __ll_highpart (__x0);/* this can't give carry */ \ + __x1 += __x2; /* but this indeed can */ \ + if (__x1 < __x2) /* did we get it? */ \ + __x3 += __ll_B; /* yes, add it in the proper pos. */ \ + \ + (w1) = __x3 + __ll_highpart (__x1); \ + (w0) = (__ll_lowpart (__x1) << W_TYPE_SIZE/2) + __ll_lowpart (__x0);\ + } while (0) +#endif + +#if !defined (umul_ppmm) +#define smul_ppmm(w1, w0, u, v) \ + do { \ + UWtype __w1; \ + UWtype __m0 = (u), __m1 = (v); \ + umul_ppmm (__w1, w0, __m0, __m1); \ + (w1) = __w1 - (-(__m0 >> (W_TYPE_SIZE - 1)) & __m1) \ + - (-(__m1 >> (W_TYPE_SIZE - 1)) & __m0); \ + } while (0) +#endif + +/* Define this unconditionally, so it can be used for debugging. */ +#define __udiv_qrnnd_c(q, r, n1, n0, d) \ + do { \ + UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \ + __d1 = __ll_highpart (d); \ + __d0 = __ll_lowpart (d); \ + \ + __r1 = (n1) % __d1; \ + __q1 = (n1) / __d1; \ + __m = (UWtype) __q1 * __d0; \ + __r1 = __r1 * __ll_B | __ll_highpart (n0); \ + if (__r1 < __m) \ + { \ + __q1--, __r1 += (d); \ + if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\ + if (__r1 < __m) \ + __q1--, __r1 += (d); \ + } \ + __r1 -= __m; \ + \ + __r0 = __r1 % __d1; \ + __q0 = __r1 / __d1; \ + __m = (UWtype) __q0 * __d0; \ + __r0 = __r0 * __ll_B | __ll_lowpart (n0); \ + if (__r0 < __m) \ + { \ + __q0--, __r0 += (d); \ + if (__r0 >= (d)) \ + if (__r0 < __m) \ + __q0--, __r0 += (d); \ + } \ + __r0 -= __m; \ + \ + (q) = (UWtype) __q1 * __ll_B | __q0; \ + (r) = __r0; \ + } while (0) + +/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through + __udiv_w_sdiv (defined in libgcc or elsewhere). */ +#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd) +#define udiv_qrnnd(q, r, nh, nl, d) \ + do { \ + UWtype __r; \ + (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \ + (r) = __r; \ + } while (0) +#endif + +/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c. */ +#if !defined (udiv_qrnnd) +#define UDIV_NEEDS_NORMALIZATION 1 +#define udiv_qrnnd __udiv_qrnnd_c +#endif + +#undef count_leading_zeros +#if !defined (count_leading_zeros) +extern +#ifdef __STDC__ +const +#endif +unsigned char __clz_tab[]; +#define count_leading_zeros(count, x) \ + do { \ + UWtype __xr = (x); \ + UWtype __a; \ + \ + if (W_TYPE_SIZE <= 32) \ + { \ + __a = __xr < ((UWtype) 1 << 2*__BITS4) \ + ? (__xr < ((UWtype) 1 << __BITS4) ? 0 : __BITS4) \ + : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 : 3*__BITS4);\ + } \ + else \ + { \ + for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \ + if (((__xr >> __a) & 0xff) != 0) \ + break; \ + } \ + \ + (count) = W_TYPE_SIZE - (__clz_tab[__xr >> __a] + __a); \ + } while (0) +/* This version gives a well-defined value for zero. */ +#define COUNT_LEADING_ZEROS_0 W_TYPE_SIZE +#endif + +#if !defined (count_trailing_zeros) +/* Define count_trailing_zeros using count_leading_zeros. The latter might be + defined in asm, but if it is not, the C version above is good enough. */ +#define count_trailing_zeros(count, x) \ + do { \ + UWtype __ctz_x = (x); \ + UWtype __ctz_c; \ + count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \ + (count) = W_TYPE_SIZE - 1 - __ctz_c; \ + } while (0) +#endif + +#ifndef UDIV_NEEDS_NORMALIZATION +#define UDIV_NEEDS_NORMALIZATION 0 +#endif diff -uprN linux-2.6.32/crypto/mpi/Makefile linux-2.6.32.ovz/crypto/mpi/Makefile --- linux-2.6.32/crypto/mpi/Makefile 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/Makefile 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,30 @@ +# +# MPI multiprecision maths library (from gpg) +# + +obj-$(CONFIG_CRYPTO_MPILIB) = \ + generic_mpih-lshift.o \ + generic_mpih-mul1.o \ + generic_mpih-mul2.o \ + generic_mpih-mul3.o \ + generic_mpih-rshift.o \ + generic_mpih-sub1.o \ + generic_mpih-add1.o \ + generic_udiv-w-sdiv.o \ + mpicoder.o \ + mpi-add.o \ + mpi-bit.o \ + mpi-div.o \ + mpi-cmp.o \ + mpi-gcd.o \ + mpih-cmp.o \ + mpih-div.o \ + mpih-mul.o \ + mpi-inline.o \ + mpi-inv.o \ + mpi-mpow.o \ + mpi-mul.o \ + mpi-pow.o \ + mpi-scan.o \ + mpiutil.o + diff -uprN linux-2.6.32/crypto/mpi/mpi-add.c linux-2.6.32.ovz/crypto/mpi/mpi-add.c --- linux-2.6.32/crypto/mpi/mpi-add.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/mpi-add.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,258 @@ +/* mpi-add.c - MPI functions + * Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc. + * Copyright (C) 1994, 1996 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + +#include "mpi-internal.h" + +/**************** + * Add the unsigned integer V to the mpi-integer U and store the + * result in W. U and V may be the same. + */ +int +mpi_add_ui(MPI w, const MPI u, unsigned long v ) +{ + mpi_ptr_t wp, up; + mpi_size_t usize, wsize; + int usign, wsign; + + usize = u->nlimbs; + usign = u->sign; + wsign = 0; + + /* If not space for W (and possible carry), increase space. */ + wsize = usize + 1; + if( w->alloced < wsize ) + if (mpi_resize(w, wsize) < 0) + return -ENOMEM; + + /* These must be after realloc (U may be the same as W). */ + up = u->d; + wp = w->d; + + if( !usize ) { /* simple */ + wp[0] = v; + wsize = v? 1:0; + } + else if( !usign ) { /* mpi is not negative */ + mpi_limb_t cy; + cy = mpihelp_add_1(wp, up, usize, v); + wp[usize] = cy; + wsize = usize + cy; + } + else { /* The signs are different. Need exact comparison to determine + * which operand to subtract from which. */ + if( usize == 1 && up[0] < v ) { + wp[0] = v - up[0]; + wsize = 1; + } + else { + mpihelp_sub_1(wp, up, usize, v); + /* Size can decrease with at most one limb. */ + wsize = usize - (wp[usize-1]==0); + wsign = 1; + } + } + + w->nlimbs = wsize; + w->sign = wsign; + return 0; +} + + +int +mpi_add(MPI w, MPI u, MPI v) +{ + mpi_ptr_t wp, up, vp; + mpi_size_t usize, vsize, wsize; + int usign, vsign, wsign; + + if( u->nlimbs < v->nlimbs ) { /* Swap U and V. */ + usize = v->nlimbs; + usign = v->sign; + vsize = u->nlimbs; + vsign = u->sign; + wsize = usize + 1; + if (RESIZE_IF_NEEDED(w, wsize) < 0) + return -ENOMEM; + /* These must be after realloc (u or v may be the same as w). */ + up = v->d; + vp = u->d; + } + else { + usize = u->nlimbs; + usign = u->sign; + vsize = v->nlimbs; + vsign = v->sign; + wsize = usize + 1; + if (RESIZE_IF_NEEDED(w, wsize) < 0) + return -ENOMEM; + /* These must be after realloc (u or v may be the same as w). */ + up = u->d; + vp = v->d; + } + wp = w->d; + wsign = 0; + + if( !vsize ) { /* simple */ + MPN_COPY(wp, up, usize ); + wsize = usize; + wsign = usign; + } + else if( usign != vsign ) { /* different sign */ + /* This test is right since USIZE >= VSIZE */ + if( usize != vsize ) { + mpihelp_sub(wp, up, usize, vp, vsize); + wsize = usize; + MPN_NORMALIZE(wp, wsize); + wsign = usign; + } + else if( mpihelp_cmp(up, vp, usize) < 0 ) { + mpihelp_sub_n(wp, vp, up, usize); + wsize = usize; + MPN_NORMALIZE(wp, wsize); + if( !usign ) + wsign = 1; + } + else { + mpihelp_sub_n(wp, up, vp, usize); + wsize = usize; + MPN_NORMALIZE(wp, wsize); + if( usign ) + wsign = 1; + } + } + else { /* U and V have same sign. Add them. */ + mpi_limb_t cy = mpihelp_add(wp, up, usize, vp, vsize); + wp[usize] = cy; + wsize = usize + cy; + if( usign ) + wsign = 1; + } + + w->nlimbs = wsize; + w->sign = wsign; + return 0; +} + + +/**************** + * Subtract the unsigned integer V from the mpi-integer U and store the + * result in W. + */ +int +mpi_sub_ui(MPI w, MPI u, unsigned long v ) +{ + mpi_ptr_t wp, up; + mpi_size_t usize, wsize; + int usign, wsign; + + usize = u->nlimbs; + usign = u->sign; + wsign = 0; + + /* If not space for W (and possible carry), increase space. */ + wsize = usize + 1; + if( w->alloced < wsize ) + if (mpi_resize(w, wsize) < 0) + return -ENOMEM; + + /* These must be after realloc (U may be the same as W). */ + up = u->d; + wp = w->d; + + if( !usize ) { /* simple */ + wp[0] = v; + wsize = v? 1:0; + wsign = 1; + } + else if( usign ) { /* mpi and v are negative */ + mpi_limb_t cy; + cy = mpihelp_add_1(wp, up, usize, v); + wp[usize] = cy; + wsize = usize + cy; + } + else { /* The signs are different. Need exact comparison to determine + * which operand to subtract from which. */ + if( usize == 1 && up[0] < v ) { + wp[0] = v - up[0]; + wsize = 1; + wsign = 1; + } + else { + mpihelp_sub_1(wp, up, usize, v); + /* Size can decrease with at most one limb. */ + wsize = usize - (wp[usize-1]==0); + } + } + + w->nlimbs = wsize; + w->sign = wsign; + return 0; +} + +int +mpi_sub(MPI w, MPI u, MPI v) +{ + int rc; + + if( w == v ) { + MPI vv; + if (mpi_copy(&vv, v) < 0) + return -ENOMEM; + vv->sign = !vv->sign; + rc = mpi_add( w, u, vv ); + mpi_free(vv); + } + else { + /* fixme: this is not thread-save (we temp. modify v) */ + v->sign = !v->sign; + rc = mpi_add( w, u, v ); + v->sign = !v->sign; + } + return rc; +} + + +int +mpi_addm( MPI w, MPI u, MPI v, MPI m) +{ + if (mpi_add(w, u, v) < 0 || + mpi_fdiv_r( w, w, m ) < 0) + return -ENOMEM; + return 0; +} + +int +mpi_subm( MPI w, MPI u, MPI v, MPI m) +{ + if (mpi_sub(w, u, v) < 0 || + mpi_fdiv_r( w, w, m ) < 0) + return -ENOMEM; + return 0; +} + diff -uprN linux-2.6.32/crypto/mpi/mpi-bit.c linux-2.6.32.ovz/crypto/mpi/mpi-bit.c --- linux-2.6.32/crypto/mpi/mpi-bit.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/mpi-bit.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,245 @@ +/* mpi-bit.c - MPI bit level fucntions + * Copyright (C) 1998, 1999 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include "mpi-internal.h" +#include "longlong.h" + +const unsigned char __clz_tab[] = { + 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, +}; + +#define A_LIMB_1 ((mpi_limb_t) 1) + + +/**************** + * Sometimes we have MSL (most significant limbs) which are 0; + * this is for some reasons not good, so this function removes them. + */ +void +mpi_normalize( MPI a ) +{ + for( ; a->nlimbs && !a->d[a->nlimbs-1]; a->nlimbs-- ) + ; +} + + + +/**************** + * Return the number of bits in A. + */ +unsigned +mpi_get_nbits( MPI a ) +{ + unsigned n; + + mpi_normalize( a ); + + if( a->nlimbs ) { + mpi_limb_t alimb = a->d[a->nlimbs-1]; + if( alimb ) { + count_leading_zeros( n, alimb ); + } + else + n = BITS_PER_MPI_LIMB; + n = BITS_PER_MPI_LIMB - n + (a->nlimbs-1) * BITS_PER_MPI_LIMB; + } + else + n = 0; + return n; +} + + +/**************** + * Test whether bit N is set. + */ +int +mpi_test_bit( MPI a, unsigned n ) +{ + unsigned limbno, bitno; + mpi_limb_t limb; + + limbno = n / BITS_PER_MPI_LIMB; + bitno = n % BITS_PER_MPI_LIMB; + + if( limbno >= a->nlimbs ) + return 0; /* too far left: this is a 0 */ + limb = a->d[limbno]; + return (limb & (A_LIMB_1 << bitno))? 1: 0; +} + + +/**************** + * Set bit N of A. + */ +int +mpi_set_bit( MPI a, unsigned n ) +{ + unsigned limbno, bitno; + + limbno = n / BITS_PER_MPI_LIMB; + bitno = n % BITS_PER_MPI_LIMB; + + if( limbno >= a->nlimbs ) { /* resize */ + if( a->alloced >= limbno ) + if (mpi_resize(a, limbno+1 ) < 0) return -ENOMEM; + a->nlimbs = limbno+1; + } + a->d[limbno] |= (A_LIMB_1<= a->nlimbs ) { /* resize */ + if( a->alloced >= limbno ) + if (mpi_resize(a, limbno+1 ) < 0) return -ENOMEM; + a->nlimbs = limbno+1; + } + a->d[limbno] |= (A_LIMB_1<d[limbno] &= ~(A_LIMB_1 << bitno); + a->nlimbs = limbno+1; + return 0; +} + +/**************** + * clear bit N of A and all bits above + */ +void +mpi_clear_highbit( MPI a, unsigned n ) +{ + unsigned limbno, bitno; + + limbno = n / BITS_PER_MPI_LIMB; + bitno = n % BITS_PER_MPI_LIMB; + + if( limbno >= a->nlimbs ) + return; /* not allocated, so need to clear bits :-) */ + + for( ; bitno < BITS_PER_MPI_LIMB; bitno++ ) + a->d[limbno] &= ~(A_LIMB_1 << bitno); + a->nlimbs = limbno+1; +} + +/**************** + * Clear bit N of A. + */ +void +mpi_clear_bit( MPI a, unsigned n ) +{ + unsigned limbno, bitno; + + limbno = n / BITS_PER_MPI_LIMB; + bitno = n % BITS_PER_MPI_LIMB; + + if( limbno >= a->nlimbs ) + return; /* don't need to clear this bit, it's to far to left */ + a->d[limbno] &= ~(A_LIMB_1 << bitno); +} + + +/**************** + * Shift A by N bits to the right + * FIXME: should use alloc_limb if X and A are same. + */ +int +mpi_rshift( MPI x, MPI a, unsigned n ) +{ + mpi_ptr_t xp; + mpi_size_t xsize; + + xsize = a->nlimbs; + x->sign = a->sign; + if (RESIZE_IF_NEEDED(x, (size_t)xsize) < 0) return -ENOMEM; + xp = x->d; + + if( xsize ) { + mpihelp_rshift( xp, a->d, xsize, n); + MPN_NORMALIZE( xp, xsize); + } + x->nlimbs = xsize; + return 0; +} + + +/**************** + * Shift A by COUNT limbs to the left + * This is used only within the MPI library + */ +int +mpi_lshift_limbs( MPI a, unsigned int count ) +{ + mpi_ptr_t ap = a->d; + int n = a->nlimbs; + int i; + + if( !count || !n ) + return 0; + + if (RESIZE_IF_NEEDED( a, n+count ) < 0) return -ENOMEM; + + for( i = n-1; i >= 0; i-- ) + ap[i+count] = ap[i]; + for(i=0; i < count; i++ ) + ap[i] = 0; + a->nlimbs += count; + return 0; +} + + +/**************** + * Shift A by COUNT limbs to the right + * This is used only within the MPI library + */ +void +mpi_rshift_limbs( MPI a, unsigned int count ) +{ + mpi_ptr_t ap = a->d; + mpi_size_t n = a->nlimbs; + unsigned int i; + + if( count >= n ) { + a->nlimbs = 0; + return; + } + + for( i = 0; i < n - count; i++ ) + ap[i] = ap[i+count]; + ap[i] = 0; + a->nlimbs -= count; +} + + diff -uprN linux-2.6.32/crypto/mpi/mpi-cmp.c linux-2.6.32.ovz/crypto/mpi/mpi-cmp.c --- linux-2.6.32/crypto/mpi/mpi-cmp.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/mpi-cmp.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,71 @@ +/* mpi-cmp.c - MPI functions + * Copyright (C) 1998, 1999 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include "mpi-internal.h" + +int +mpi_cmp_ui( MPI u, unsigned long v ) +{ + mpi_limb_t limb = v; + + mpi_normalize( u ); + if( !u->nlimbs && !limb ) + return 0; + if( u->sign ) + return -1; + if( u->nlimbs > 1 ) + return 1; + + if( u->d[0] == limb ) + return 0; + else if( u->d[0] > limb ) + return 1; + else + return -1; +} + +int +mpi_cmp( MPI u, MPI v ) +{ + mpi_size_t usize, vsize; + int cmp; + + mpi_normalize( u ); + mpi_normalize( v ); + usize = u->nlimbs; + vsize = v->nlimbs; + if( !u->sign && v->sign ) + return 1; + if( u->sign && !v->sign ) + return -1; + if( usize != vsize && !u->sign && !v->sign ) + return usize - vsize; + if( usize != vsize && u->sign && v->sign ) + return vsize + usize; + if( !usize ) + return 0; + if( !(cmp=mpihelp_cmp( u->d, v->d, usize )) ) + return 0; + if( (cmp < 0?1:0) == (u->sign?1:0)) + return 1; + return -1; +} + + diff -uprN linux-2.6.32/crypto/mpi/mpicoder.c linux-2.6.32.ovz/crypto/mpi/mpicoder.c --- linux-2.6.32/crypto/mpi/mpicoder.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/mpicoder.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,359 @@ +/* mpicoder.c - Coder for the external representation of MPIs + * Copyright (C) 1998, 1999 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include "mpi-internal.h" + +#define DIM(v) (sizeof(v)/sizeof((v)[0])) +#define MAX_EXTERN_MPI_BITS 16384 + + +static uint8_t asn[15] = /* Object ID is 1.3.14.3.2.26 */ + { 0x30, 0x21, 0x30, 0x09, 0x06, 0x05, 0x2b, 0x0e, 0x03, + 0x02, 0x1a, 0x05, 0x00, 0x04, 0x14 }; + + + +MPI +do_encode_md(const void *sha_buffer, unsigned nbits) +{ + int nframe = (nbits+7) / 8; + uint8_t *frame, *fr_pt; + int i = 0, n; + size_t asnlen = DIM(asn); + MPI a = MPI_NULL; + + if(SHA1_DIGEST_LENGTH + asnlen + 4 > nframe ) + printk("MPI: can't encode a %d bit MD into a %d bits frame\n", + (int)(SHA1_DIGEST_LENGTH*8), (int)nbits); + + /* We encode the MD in this way: + * + * 0 A PAD(n bytes) 0 ASN(asnlen bytes) MD(len bytes) + * + * PAD consists of FF bytes. + */ + frame = kmalloc(nframe, GFP_KERNEL); + if (!frame) + return MPI_NULL; + n = 0; + frame[n++] = 0; + frame[n++] = 1; /* block type */ + i = nframe - SHA1_DIGEST_LENGTH - asnlen -3 ; + + if(i <= 1) { + printk("MPI: message digest encoding failed\n"); + kfree(frame); + return a; + } + + memset( frame+n, 0xff, i ); n += i; + frame[n++] = 0; + memcpy( frame+n, &asn, asnlen ); n += asnlen; + memcpy( frame+n, sha_buffer, SHA1_DIGEST_LENGTH ); n += SHA1_DIGEST_LENGTH; + + i = nframe; + fr_pt = frame; + + if (n != nframe) { + printk("MPI: message digest encoding failed, frame length is wrong\n"); + kfree(frame); + return a; + } + + a = mpi_alloc( (nframe+BYTES_PER_MPI_LIMB-1) / BYTES_PER_MPI_LIMB ); + mpi_set_buffer( a, frame, nframe, 0 ); + kfree(frame); + + return a; +} + + +MPI +mpi_read_from_buffer(const void *xbuffer, unsigned *ret_nread) +{ + const uint8_t *buffer = xbuffer; + int i, j; + unsigned nbits, nbytes, nlimbs, nread=0; + mpi_limb_t a; + MPI val = MPI_NULL; + + if( *ret_nread < 2 ) + goto leave; + nbits = buffer[0] << 8 | buffer[1]; + + if( nbits > MAX_EXTERN_MPI_BITS ) { + printk("MPI: mpi too large (%u bits)\n", nbits); + goto leave; + } + buffer += 2; + nread = 2; + + nbytes = (nbits+7) / 8; + nlimbs = (nbytes+BYTES_PER_MPI_LIMB-1) / BYTES_PER_MPI_LIMB; + val = mpi_alloc( nlimbs ); + if (!val) + return MPI_NULL; + i = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; + i %= BYTES_PER_MPI_LIMB; + val->nbits = nbits; + j= val->nlimbs = nlimbs; + val->sign = 0; + for( ; j > 0; j-- ) { + a = 0; + for(; i < BYTES_PER_MPI_LIMB; i++ ) { + if( ++nread > *ret_nread ) { + printk("MPI: mpi larger than buffer nread=%d ret_nread=%d\n", nread, *ret_nread); + goto leave; + } + a <<= 8; + a |= *buffer++; + } + i = 0; + val->d[j-1] = a; + } + + leave: + *ret_nread = nread; + return val; +} + + +/**************** + * Make an mpi from a character string. + */ +int +mpi_fromstr(MPI val, const char *str) +{ + int hexmode=0, sign=0, prepend_zero=0, i, j, c, c1, c2; + unsigned nbits, nbytes, nlimbs; + mpi_limb_t a; + + if( *str == '-' ) { + sign = 1; + str++; + } + if( *str == '0' && str[1] == 'x' ) + hexmode = 1; + else + return -EINVAL; /* other bases are not yet supported */ + str += 2; + + nbits = strlen(str)*4; + if( nbits % 8 ) + prepend_zero = 1; + nbytes = (nbits+7) / 8; + nlimbs = (nbytes+BYTES_PER_MPI_LIMB-1) / BYTES_PER_MPI_LIMB; + if( val->alloced < nlimbs ) + if (!mpi_resize(val, nlimbs )) + return -ENOMEM; + i = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB; + i %= BYTES_PER_MPI_LIMB; + j= val->nlimbs = nlimbs; + val->sign = sign; + for( ; j > 0; j-- ) { + a = 0; + for(; i < BYTES_PER_MPI_LIMB; i++ ) { + if( prepend_zero ) { + c1 = '0'; + prepend_zero = 0; + } + else + c1 = *str++; + assert(c1); + c2 = *str++; + assert(c2); + if( c1 >= '0' && c1 <= '9' ) + c = c1 - '0'; + else if( c1 >= 'a' && c1 <= 'f' ) + c = c1 - 'a' + 10; + else if( c1 >= 'A' && c1 <= 'F' ) + c = c1 - 'A' + 10; + else { + mpi_clear(val); + return 1; + } + c <<= 4; + if( c2 >= '0' && c2 <= '9' ) + c |= c2 - '0'; + else if( c2 >= 'a' && c2 <= 'f' ) + c |= c2 - 'a' + 10; + else if( c2 >= 'A' && c2 <= 'F' ) + c |= c2 - 'A' + 10; + else { + mpi_clear(val); + return 1; + } + a <<= 8; + a |= c; + } + i = 0; + + val->d[j-1] = a; + } + + return 0; +} + + +/**************** + * Special function to get the low 8 bytes from an mpi. + * This can be used as a keyid; KEYID is an 2 element array. + * Return the low 4 bytes. + */ +u32 +mpi_get_keyid( const MPI a, u32 *keyid ) +{ +#if BYTES_PER_MPI_LIMB == 4 + if( keyid ) { + keyid[0] = a->nlimbs >= 2? a->d[1] : 0; + keyid[1] = a->nlimbs >= 1? a->d[0] : 0; + } + return a->nlimbs >= 1? a->d[0] : 0; +#elif BYTES_PER_MPI_LIMB == 8 + if( keyid ) { + keyid[0] = a->nlimbs? (u32)(a->d[0] >> 32) : 0; + keyid[1] = a->nlimbs? (u32)(a->d[0] & 0xffffffff) : 0; + } + return a->nlimbs? (u32)(a->d[0] & 0xffffffff) : 0; +#else + #error Make this function work with other LIMB sizes +#endif +} + + +/**************** + * Return an allocated buffer with the MPI (msb first). + * NBYTES receives the length of this buffer. Caller must free the + * return string (This function does return a 0 byte buffer with NBYTES + * set to zero if the value of A is zero. If sign is not NULL, it will + * be set to the sign of the A. + */ +void * +mpi_get_buffer( MPI a, unsigned *nbytes, int *sign ) +{ + uint8_t *p, *buffer; + mpi_limb_t alimb; + int i; + unsigned int n; + + if( sign ) + *sign = a->sign; + *nbytes = n = a->nlimbs * BYTES_PER_MPI_LIMB; + if (!n) + n++; /* avoid zero length allocation */ + p = buffer = kmalloc(n, GFP_KERNEL); + + for(i=a->nlimbs-1; i >= 0; i-- ) { + alimb = a->d[i]; +#if BYTES_PER_MPI_LIMB == 4 + *p++ = alimb >> 24; + *p++ = alimb >> 16; + *p++ = alimb >> 8; + *p++ = alimb ; +#elif BYTES_PER_MPI_LIMB == 8 + *p++ = alimb >> 56; + *p++ = alimb >> 48; + *p++ = alimb >> 40; + *p++ = alimb >> 32; + *p++ = alimb >> 24; + *p++ = alimb >> 16; + *p++ = alimb >> 8; + *p++ = alimb ; +#else +#error please implement for this limb size. +#endif + } + + /* this is sub-optimal but we need to do the shift operation + * because the caller has to free the returned buffer */ + for(p=buffer; !*p && *nbytes; p++, --*nbytes ) + ; + if( p != buffer ) + memmove(buffer,p, *nbytes); + + return buffer; +} + + +/**************** + * Use BUFFER to update MPI. + */ +int +mpi_set_buffer( MPI a, const void *xbuffer, unsigned nbytes, int sign ) +{ + const uint8_t *buffer = xbuffer, *p; + mpi_limb_t alimb; + int nlimbs; + int i; + + nlimbs = (nbytes + BYTES_PER_MPI_LIMB - 1) / BYTES_PER_MPI_LIMB; + if (RESIZE_IF_NEEDED(a, nlimbs) < 0) + return -ENOMEM; + a->sign = sign; + + for(i=0, p = buffer+nbytes-1; p >= buffer+BYTES_PER_MPI_LIMB; ) { + #if BYTES_PER_MPI_LIMB == 4 + alimb = (mpi_limb_t)*p-- ; + alimb |= (mpi_limb_t)*p-- << 8 ; + alimb |= (mpi_limb_t)*p-- << 16 ; + alimb |= (mpi_limb_t)*p-- << 24 ; + #elif BYTES_PER_MPI_LIMB == 8 + alimb = (mpi_limb_t)*p-- ; + alimb |= (mpi_limb_t)*p-- << 8 ; + alimb |= (mpi_limb_t)*p-- << 16 ; + alimb |= (mpi_limb_t)*p-- << 24 ; + alimb |= (mpi_limb_t)*p-- << 32 ; + alimb |= (mpi_limb_t)*p-- << 40 ; + alimb |= (mpi_limb_t)*p-- << 48 ; + alimb |= (mpi_limb_t)*p-- << 56 ; + #else + #error please implement for this limb size. + #endif + a->d[i++] = alimb; + } + if( p >= buffer ) { + #if BYTES_PER_MPI_LIMB == 4 + alimb = *p-- ; + if( p >= buffer ) alimb |= (mpi_limb_t)*p-- << 8 ; + if( p >= buffer ) alimb |= (mpi_limb_t)*p-- << 16 ; + if( p >= buffer ) alimb |= (mpi_limb_t)*p-- << 24 ; + #elif BYTES_PER_MPI_LIMB == 8 + alimb = (mpi_limb_t)*p-- ; + if( p >= buffer ) alimb |= (mpi_limb_t)*p-- << 8 ; + if( p >= buffer ) alimb |= (mpi_limb_t)*p-- << 16 ; + if( p >= buffer ) alimb |= (mpi_limb_t)*p-- << 24 ; + if( p >= buffer ) alimb |= (mpi_limb_t)*p-- << 32 ; + if( p >= buffer ) alimb |= (mpi_limb_t)*p-- << 40 ; + if( p >= buffer ) alimb |= (mpi_limb_t)*p-- << 48 ; + if( p >= buffer ) alimb |= (mpi_limb_t)*p-- << 56 ; + #else + #error please implement for this limb size. + #endif + a->d[i++] = alimb; + } + a->nlimbs = i; + + if (i != nlimbs) { + printk("MPI: mpi_set_buffer: Assertion failed (%d != %d)", i, nlimbs); + BUG(); + } + return 0; +} + diff -uprN linux-2.6.32/crypto/mpi/mpi-div.c linux-2.6.32.ovz/crypto/mpi/mpi-div.c --- linux-2.6.32/crypto/mpi/mpi-div.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/mpi-div.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,345 @@ +/* mpi-div.c - MPI functions + * Copyright (C) 1994, 1996 Free Software Foundation, Inc. + * Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + +#include +#include "mpi-internal.h" +#include "longlong.h" + + +int +mpi_fdiv_r( MPI rem, MPI dividend, MPI divisor ) +{ + int rc = -ENOMEM; + int divisor_sign = divisor->sign; + MPI temp_divisor = NULL; + + /* We need the original value of the divisor after the remainder has been + * preliminary calculated. We have to copy it to temporary space if it's + * the same variable as REM. */ + if( rem == divisor ) { + if (mpi_copy( &temp_divisor, divisor ) < 0) goto nomem; + divisor = temp_divisor; + } + + if (mpi_tdiv_qr(NULL, rem, dividend, divisor ) < 0) goto nomem; + if( ((divisor_sign?1:0) ^ (dividend->sign?1:0)) && rem->nlimbs ) + if (mpi_add( rem, rem, divisor) < 0) goto nomem; + + rc = 0; + + nomem: + if( temp_divisor ) + mpi_free(temp_divisor); + return rc; +} + + +/**************** + * Division rounding the quotient towards -infinity. + * The remainder gets the same sign as the denominator. + * rem is optional + */ + +ulong +mpi_fdiv_r_ui( MPI rem, MPI dividend, ulong divisor ) +{ + mpi_limb_t rlimb; + + rlimb = mpihelp_mod_1( dividend->d, dividend->nlimbs, divisor ); + if( rlimb && dividend->sign ) + rlimb = divisor - rlimb; + + if( rem ) { + rem->d[0] = rlimb; + rem->nlimbs = rlimb? 1:0; + } + return rlimb; +} + + +int +mpi_fdiv_q( MPI quot, MPI dividend, MPI divisor ) +{ + MPI tmp = mpi_alloc( mpi_get_nlimbs(quot) ); + if (!tmp) + return -ENOMEM; + mpi_fdiv_qr( quot, tmp, dividend, divisor); + mpi_free(tmp); + return 0; +} + +int +mpi_fdiv_qr( MPI quot, MPI rem, MPI dividend, MPI divisor ) +{ + int divisor_sign = divisor->sign; + MPI temp_divisor = NULL; + + if( quot == divisor || rem == divisor ) { + if (mpi_copy( &temp_divisor, divisor ) < 0) + return -ENOMEM; + divisor = temp_divisor; + } + + if (mpi_tdiv_qr( quot, rem, dividend, divisor ) < 0) + goto nomem; + + if( (divisor_sign ^ dividend->sign) && rem->nlimbs ) { + if (mpi_sub_ui( quot, quot, 1 ) < 0) + goto nomem; + if (mpi_add( rem, rem, divisor) < 0) + goto nomem; + } + + if( temp_divisor ) + mpi_free(temp_divisor); + + return 0; + + nomem: + mpi_free(temp_divisor); + return -ENOMEM; +} + + +/* If den == quot, den needs temporary storage. + * If den == rem, den needs temporary storage. + * If num == quot, num needs temporary storage. + * If den has temporary storage, it can be normalized while being copied, + * i.e no extra storage should be allocated. + */ + +int +mpi_tdiv_r( MPI rem, MPI num, MPI den) +{ + return mpi_tdiv_qr(NULL, rem, num, den ); +} + +int +mpi_tdiv_qr( MPI quot, MPI rem, MPI num, MPI den) +{ + int rc = -ENOMEM; + mpi_ptr_t np, dp; + mpi_ptr_t qp, rp; + mpi_size_t nsize = num->nlimbs; + mpi_size_t dsize = den->nlimbs; + mpi_size_t qsize, rsize; + mpi_size_t sign_remainder = num->sign; + mpi_size_t sign_quotient = num->sign ^ den->sign; + unsigned normalization_steps; + mpi_limb_t q_limb; + mpi_ptr_t marker[5]; + int markidx=0; + + memset(marker,0,sizeof(marker)); + + /* Ensure space is enough for quotient and remainder. + * We need space for an extra limb in the remainder, because it's + * up-shifted (normalized) below. */ + rsize = nsize + 1; + if (mpi_resize( rem, rsize) < 0) goto nomem; + + qsize = rsize - dsize; /* qsize cannot be bigger than this. */ + if( qsize <= 0 ) { + if( num != rem ) { + rem->nlimbs = num->nlimbs; + rem->sign = num->sign; + MPN_COPY(rem->d, num->d, nsize); + } + if( quot ) { + /* This needs to follow the assignment to rem, in case the + * numerator and quotient are the same. */ + quot->nlimbs = 0; + quot->sign = 0; + } + return 0; + } + + if( quot ) + if (mpi_resize( quot, qsize) < 0) goto nomem; + + /* Read pointers here, when reallocation is finished. */ + np = num->d; + dp = den->d; + rp = rem->d; + + /* Optimize division by a single-limb divisor. */ + if( dsize == 1 ) { + mpi_limb_t rlimb; + if( quot ) { + qp = quot->d; + rlimb = mpihelp_divmod_1( qp, np, nsize, dp[0] ); + qsize -= qp[qsize - 1] == 0; + quot->nlimbs = qsize; + quot->sign = sign_quotient; + } + else + rlimb = mpihelp_mod_1( np, nsize, dp[0] ); + rp[0] = rlimb; + rsize = rlimb != 0?1:0; + rem->nlimbs = rsize; + rem->sign = sign_remainder; + return 0; + } + + + if( quot ) { + qp = quot->d; + /* Make sure QP and NP point to different objects. Otherwise the + * numerator would be gradually overwritten by the quotient limbs. */ + if(qp == np) { /* Copy NP object to temporary space. */ + np = marker[markidx++] = mpi_alloc_limb_space(nsize); + MPN_COPY(np, qp, nsize); + } + } + else /* Put quotient at top of remainder. */ + qp = rp + dsize; + + count_leading_zeros( normalization_steps, dp[dsize - 1] ); + + /* Normalize the denominator, i.e. make its most significant bit set by + * shifting it NORMALIZATION_STEPS bits to the left. Also shift the + * numerator the same number of steps (to keep the quotient the same!). + */ + if( normalization_steps ) { + mpi_ptr_t tp; + mpi_limb_t nlimb; + + /* Shift up the denominator setting the most significant bit of + * the most significant word. Use temporary storage not to clobber + * the original contents of the denominator. */ + tp = marker[markidx++] = mpi_alloc_limb_space(dsize); + if (!tp) goto nomem; + mpihelp_lshift( tp, dp, dsize, normalization_steps ); + dp = tp; + + /* Shift up the numerator, possibly introducing a new most + * significant word. Move the shifted numerator in the remainder + * meanwhile. */ + nlimb = mpihelp_lshift(rp, np, nsize, normalization_steps); + if( nlimb ) { + rp[nsize] = nlimb; + rsize = nsize + 1; + } + else + rsize = nsize; + } + else { + /* The denominator is already normalized, as required. Copy it to + * temporary space if it overlaps with the quotient or remainder. */ + if( dp == rp || (quot && (dp == qp))) { + mpi_ptr_t tp; + + tp = marker[markidx++] = mpi_alloc_limb_space(dsize); + if (!tp) goto nomem; + MPN_COPY( tp, dp, dsize ); + dp = tp; + } + + /* Move the numerator to the remainder. */ + if( rp != np ) + MPN_COPY(rp, np, nsize); + + rsize = nsize; + } + + q_limb = mpihelp_divrem( qp, 0, rp, rsize, dp, dsize ); + + if( quot ) { + qsize = rsize - dsize; + if(q_limb) { + qp[qsize] = q_limb; + qsize += 1; + } + + quot->nlimbs = qsize; + quot->sign = sign_quotient; + } + + rsize = dsize; + MPN_NORMALIZE (rp, rsize); + + if( normalization_steps && rsize ) { + mpihelp_rshift(rp, rp, rsize, normalization_steps); + rsize -= rp[rsize - 1] == 0?1:0; + } + + rem->nlimbs = rsize; + rem->sign = sign_remainder; + + rc = 0; + nomem: + while( markidx ) + mpi_free_limb_space(marker[--markidx]); + return rc; +} + +int +mpi_tdiv_q_2exp( MPI w, MPI u, unsigned count ) +{ + mpi_size_t usize, wsize; + mpi_size_t limb_cnt; + + usize = u->nlimbs; + limb_cnt = count / BITS_PER_MPI_LIMB; + wsize = usize - limb_cnt; + if( limb_cnt >= usize ) + w->nlimbs = 0; + else { + mpi_ptr_t wp; + mpi_ptr_t up; + + if (RESIZE_IF_NEEDED( w, wsize ) < 0) + return -ENOMEM; + wp = w->d; + up = u->d; + + count %= BITS_PER_MPI_LIMB; + if( count ) { + mpihelp_rshift( wp, up + limb_cnt, wsize, count ); + wsize -= !wp[wsize - 1]; + } + else { + MPN_COPY_INCR( wp, up + limb_cnt, wsize); + } + + w->nlimbs = wsize; + } + return 0; +} + +/**************** + * Check whether dividend is divisible by divisor + * (note: divisor must fit into a limb) + */ +int +mpi_divisible_ui(MPI dividend, ulong divisor ) +{ + return !mpihelp_mod_1( dividend->d, dividend->nlimbs, divisor ); +} + diff -uprN linux-2.6.32/crypto/mpi/mpi-gcd.c linux-2.6.32.ovz/crypto/mpi/mpi-gcd.c --- linux-2.6.32/crypto/mpi/mpi-gcd.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/mpi-gcd.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,60 @@ +/* mpi-gcd.c - MPI functions + * Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include "mpi-internal.h" + +/**************** + * Find the greatest common divisor G of A and B. + * Return: true if this 1, false in all other cases + */ +int +mpi_gcd( MPI g, const MPI xa, const MPI xb ) +{ + MPI a = NULL, b = NULL; + + if (mpi_copy(&a, xa) < 0) + goto nomem; + + if (mpi_copy(&b, xb) < 0) + goto nomem; + + /* TAOCP Vol II, 4.5.2, Algorithm A */ + a->sign = 0; + b->sign = 0; + while( mpi_cmp_ui( b, 0 ) ) { + if (mpi_fdiv_r( g, a, b ) < 0) /* g used as temorary variable */ + goto nomem; + if (mpi_set(a,b) < 0) + goto nomem; + if (mpi_set(b,g) < 0) + goto nomem; + } + if (mpi_set(g, a) < 0) + goto nomem; + + mpi_free(a); + mpi_free(b); + return !mpi_cmp_ui( g, 1); + + nomem: + mpi_free(a); + mpi_free(b); + return -ENOMEM; +} diff -uprN linux-2.6.32/crypto/mpi/mpih-cmp.c linux-2.6.32.ovz/crypto/mpi/mpih-cmp.c --- linux-2.6.32/crypto/mpi/mpih-cmp.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/mpih-cmp.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,58 @@ +/* mpihelp-sub.c - MPI helper functions + * Copyright (C) 1994, 1996 Free Software Foundation, Inc. + * Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + +#include "mpi-internal.h" + +/**************** + * Compare OP1_PTR/OP1_SIZE with OP2_PTR/OP2_SIZE. + * There are no restrictions on the relative sizes of + * the two arguments. + * Return 1 if OP1 > OP2, 0 if they are equal, and -1 if OP1 < OP2. + */ +int +mpihelp_cmp( mpi_ptr_t op1_ptr, mpi_ptr_t op2_ptr, mpi_size_t size ) +{ + mpi_size_t i; + mpi_limb_t op1_word, op2_word; + + for( i = size - 1; i >= 0 ; i--) { + op1_word = op1_ptr[i]; + op2_word = op2_ptr[i]; + if( op1_word != op2_word ) + goto diff; + } + return 0; + + diff: + /* This can *not* be simplified to + * op2_word - op2_word + * since that expression might give signed overflow. */ + return (op1_word > op2_word) ? 1 : -1; +} + diff -uprN linux-2.6.32/crypto/mpi/mpih-div.c linux-2.6.32.ovz/crypto/mpi/mpih-div.c --- linux-2.6.32/crypto/mpi/mpih-div.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/mpih-div.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,534 @@ +/* mpihelp-div.c - MPI helper functions + * Copyright (C) 1994, 1996 Free Software Foundation, Inc. + * Copyright (C) 1998, 1999 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + +#include "mpi-internal.h" +#include "longlong.h" + +#ifndef UMUL_TIME + #define UMUL_TIME 1 +#endif +#ifndef UDIV_TIME + #define UDIV_TIME UMUL_TIME +#endif + +/* FIXME: We should be using invert_limb (or invert_normalized_limb) + * here (not udiv_qrnnd). + */ + +mpi_limb_t +mpihelp_mod_1(mpi_ptr_t dividend_ptr, mpi_size_t dividend_size, + mpi_limb_t divisor_limb) +{ + mpi_size_t i; + mpi_limb_t n1, n0, r; + int dummy; + + /* Botch: Should this be handled at all? Rely on callers? */ + if( !dividend_size ) + return 0; + + /* If multiplication is much faster than division, and the + * dividend is large, pre-invert the divisor, and use + * only multiplications in the inner loop. + * + * This test should be read: + * Does it ever help to use udiv_qrnnd_preinv? + * && Does what we save compensate for the inversion overhead? + */ + if( UDIV_TIME > (2 * UMUL_TIME + 6) + && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME ) { + int normalization_steps; + + count_leading_zeros( normalization_steps, divisor_limb ); + if( normalization_steps ) { + mpi_limb_t divisor_limb_inverted; + + divisor_limb <<= normalization_steps; + + /* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB. The + * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the + * most significant bit (with weight 2**N) implicit. + * + * Special case for DIVISOR_LIMB == 100...000. + */ + if( !(divisor_limb << 1) ) + divisor_limb_inverted = ~(mpi_limb_t)0; + else + udiv_qrnnd(divisor_limb_inverted, dummy, + -divisor_limb, 0, divisor_limb); + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps); + + /* Possible optimization: + * if (r == 0 + * && divisor_limb > ((n1 << normalization_steps) + * | (dividend_ptr[dividend_size - 2] >> ...))) + * ...one division less... + */ + for( i = dividend_size - 2; i >= 0; i--) { + n0 = dividend_ptr[i]; + UDIV_QRNND_PREINV(dummy, r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MPI_LIMB - normalization_steps))), + divisor_limb, divisor_limb_inverted); + n1 = n0; + } + UDIV_QRNND_PREINV(dummy, r, r, + n1 << normalization_steps, + divisor_limb, divisor_limb_inverted); + return r >> normalization_steps; + } + else { + mpi_limb_t divisor_limb_inverted; + + /* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB. The + * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the + * most significant bit (with weight 2**N) implicit. + * + * Special case for DIVISOR_LIMB == 100...000. + */ + if( !(divisor_limb << 1) ) + divisor_limb_inverted = ~(mpi_limb_t)0; + else + udiv_qrnnd(divisor_limb_inverted, dummy, + -divisor_limb, 0, divisor_limb); + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if( r >= divisor_limb ) + r = 0; + else + i--; + + for( ; i >= 0; i--) { + n0 = dividend_ptr[i]; + UDIV_QRNND_PREINV(dummy, r, r, + n0, divisor_limb, divisor_limb_inverted); + } + return r; + } + } + else { + if( UDIV_NEEDS_NORMALIZATION ) { + int normalization_steps; + + count_leading_zeros(normalization_steps, divisor_limb); + if( normalization_steps ) { + divisor_limb <<= normalization_steps; + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps); + + /* Possible optimization: + * if (r == 0 + * && divisor_limb > ((n1 << normalization_steps) + * | (dividend_ptr[dividend_size - 2] >> ...))) + * ...one division less... + */ + for(i = dividend_size - 2; i >= 0; i--) { + n0 = dividend_ptr[i]; + udiv_qrnnd (dummy, r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MPI_LIMB - normalization_steps))), + divisor_limb); + n1 = n0; + } + udiv_qrnnd (dummy, r, r, + n1 << normalization_steps, + divisor_limb); + return r >> normalization_steps; + } + } + /* No normalization needed, either because udiv_qrnnd doesn't require + * it, or because DIVISOR_LIMB is already normalized. */ + i = dividend_size - 1; + r = dividend_ptr[i]; + + if(r >= divisor_limb) + r = 0; + else + i--; + + for(; i >= 0; i--) { + n0 = dividend_ptr[i]; + udiv_qrnnd (dummy, r, r, n0, divisor_limb); + } + return r; + } +} + +/* Divide num (NP/NSIZE) by den (DP/DSIZE) and write + * the NSIZE-DSIZE least significant quotient limbs at QP + * and the DSIZE long remainder at NP. If QEXTRA_LIMBS is + * non-zero, generate that many fraction bits and append them after the + * other quotient limbs. + * Return the most significant limb of the quotient, this is always 0 or 1. + * + * Preconditions: + * 0. NSIZE >= DSIZE. + * 1. The most significant bit of the divisor must be set. + * 2. QP must either not overlap with the input operands at all, or + * QP + DSIZE >= NP must hold true. (This means that it's + * possible to put the quotient in the high part of NUM, right after the + * remainder in NUM. + * 3. NSIZE >= DSIZE, even if QEXTRA_LIMBS is non-zero. + */ + +mpi_limb_t +mpihelp_divrem( mpi_ptr_t qp, mpi_size_t qextra_limbs, + mpi_ptr_t np, mpi_size_t nsize, + mpi_ptr_t dp, mpi_size_t dsize) +{ + mpi_limb_t most_significant_q_limb = 0; + + switch(dsize) { + case 0: + /* We are asked to divide by zero, so go ahead and do it! (To make + the compiler not remove this statement, return the value.) */ + return 1 / dsize; + + case 1: + { + mpi_size_t i; + mpi_limb_t n1; + mpi_limb_t d; + + d = dp[0]; + n1 = np[nsize - 1]; + + if( n1 >= d ) { + n1 -= d; + most_significant_q_limb = 1; + } + + qp += qextra_limbs; + for( i = nsize - 2; i >= 0; i--) + udiv_qrnnd( qp[i], n1, n1, np[i], d ); + qp -= qextra_limbs; + + for( i = qextra_limbs - 1; i >= 0; i-- ) + udiv_qrnnd (qp[i], n1, n1, 0, d); + + np[0] = n1; + } + break; + + case 2: + { + mpi_size_t i; + mpi_limb_t n1, n0, n2; + mpi_limb_t d1, d0; + + np += nsize - 2; + d1 = dp[1]; + d0 = dp[0]; + n1 = np[1]; + n0 = np[0]; + + if( n1 >= d1 && (n1 > d1 || n0 >= d0) ) { + sub_ddmmss (n1, n0, n1, n0, d1, d0); + most_significant_q_limb = 1; + } + + for( i = qextra_limbs + nsize - 2 - 1; i >= 0; i-- ) { + mpi_limb_t q; + mpi_limb_t r; + + if( i >= qextra_limbs ) + np--; + else + np[0] = 0; + + if( n1 == d1 ) { + /* Q should be either 111..111 or 111..110. Need special + * treatment of this rare case as normal division would + * give overflow. */ + q = ~(mpi_limb_t)0; + + r = n0 + d1; + if( r < d1 ) { /* Carry in the addition? */ + add_ssaaaa( n1, n0, r - d0, np[0], 0, d0 ); + qp[i] = q; + continue; + } + n1 = d0 - (d0 != 0?1:0); + n0 = -d0; + } + else { + udiv_qrnnd (q, r, n1, n0, d1); + umul_ppmm (n1, n0, d0, q); + } + + n2 = np[0]; + q_test: + if( n1 > r || (n1 == r && n0 > n2) ) { + /* The estimated Q was too large. */ + q--; + sub_ddmmss (n1, n0, n1, n0, 0, d0); + r += d1; + if( r >= d1 ) /* If not carry, test Q again. */ + goto q_test; + } + + qp[i] = q; + sub_ddmmss (n1, n0, r, n2, n1, n0); + } + np[1] = n1; + np[0] = n0; + } + break; + + default: + { + mpi_size_t i; + mpi_limb_t dX, d1, n0; + + np += nsize - dsize; + dX = dp[dsize - 1]; + d1 = dp[dsize - 2]; + n0 = np[dsize - 1]; + + if( n0 >= dX ) { + if(n0 > dX || mpihelp_cmp(np, dp, dsize - 1) >= 0 ) { + mpihelp_sub_n(np, np, dp, dsize); + n0 = np[dsize - 1]; + most_significant_q_limb = 1; + } + } + + for( i = qextra_limbs + nsize - dsize - 1; i >= 0; i--) { + mpi_limb_t q; + mpi_limb_t n1, n2; + mpi_limb_t cy_limb; + + if( i >= qextra_limbs ) { + np--; + n2 = np[dsize]; + } + else { + n2 = np[dsize - 1]; + MPN_COPY_DECR (np + 1, np, dsize - 1); + np[0] = 0; + } + + if( n0 == dX ) { + /* This might over-estimate q, but it's probably not worth + * the extra code here to find out. */ + q = ~(mpi_limb_t)0; + } + else { + mpi_limb_t r; + + udiv_qrnnd(q, r, n0, np[dsize - 1], dX); + umul_ppmm(n1, n0, d1, q); + + while( n1 > r || (n1 == r && n0 > np[dsize - 2])) { + q--; + r += dX; + if( r < dX ) /* I.e. "carry in previous addition?" */ + break; + n1 -= n0 < d1; + n0 -= d1; + } + } + + /* Possible optimization: We already have (q * n0) and (1 * n1) + * after the calculation of q. Taking advantage of that, we + * could make this loop make two iterations less. */ + cy_limb = mpihelp_submul_1(np, dp, dsize, q); + + if( n2 != cy_limb ) { + mpihelp_add_n(np, np, dp, dsize); + q--; + } + + qp[i] = q; + n0 = np[dsize - 1]; + } + } + } + + return most_significant_q_limb; +} + + +/**************** + * Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB. + * Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR. + * Return the single-limb remainder. + * There are no constraints on the value of the divisor. + * + * QUOT_PTR and DIVIDEND_PTR might point to the same limb. + */ + +mpi_limb_t +mpihelp_divmod_1( mpi_ptr_t quot_ptr, + mpi_ptr_t dividend_ptr, mpi_size_t dividend_size, + mpi_limb_t divisor_limb) +{ + mpi_size_t i; + mpi_limb_t n1, n0, r; + int dummy; + + if( !dividend_size ) + return 0; + + /* If multiplication is much faster than division, and the + * dividend is large, pre-invert the divisor, and use + * only multiplications in the inner loop. + * + * This test should be read: + * Does it ever help to use udiv_qrnnd_preinv? + * && Does what we save compensate for the inversion overhead? + */ + if( UDIV_TIME > (2 * UMUL_TIME + 6) + && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME ) { + int normalization_steps; + + count_leading_zeros( normalization_steps, divisor_limb ); + if( normalization_steps ) { + mpi_limb_t divisor_limb_inverted; + + divisor_limb <<= normalization_steps; + + /* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB. The + * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the + * most significant bit (with weight 2**N) implicit. + */ + /* Special case for DIVISOR_LIMB == 100...000. */ + if( !(divisor_limb << 1) ) + divisor_limb_inverted = ~(mpi_limb_t)0; + else + udiv_qrnnd(divisor_limb_inverted, dummy, + -divisor_limb, 0, divisor_limb); + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps); + + /* Possible optimization: + * if (r == 0 + * && divisor_limb > ((n1 << normalization_steps) + * | (dividend_ptr[dividend_size - 2] >> ...))) + * ...one division less... + */ + for( i = dividend_size - 2; i >= 0; i--) { + n0 = dividend_ptr[i]; + UDIV_QRNND_PREINV( quot_ptr[i + 1], r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MPI_LIMB - normalization_steps))), + divisor_limb, divisor_limb_inverted); + n1 = n0; + } + UDIV_QRNND_PREINV( quot_ptr[0], r, r, + n1 << normalization_steps, + divisor_limb, divisor_limb_inverted); + return r >> normalization_steps; + } + else { + mpi_limb_t divisor_limb_inverted; + + /* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB. The + * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the + * most significant bit (with weight 2**N) implicit. + */ + /* Special case for DIVISOR_LIMB == 100...000. */ + if( !(divisor_limb << 1) ) + divisor_limb_inverted = ~(mpi_limb_t) 0; + else + udiv_qrnnd(divisor_limb_inverted, dummy, + -divisor_limb, 0, divisor_limb); + + i = dividend_size - 1; + r = dividend_ptr[i]; + + if( r >= divisor_limb ) + r = 0; + else + quot_ptr[i--] = 0; + + for( ; i >= 0; i-- ) { + n0 = dividend_ptr[i]; + UDIV_QRNND_PREINV( quot_ptr[i], r, r, + n0, divisor_limb, divisor_limb_inverted); + } + return r; + } + } + else { + if(UDIV_NEEDS_NORMALIZATION) { + int normalization_steps; + + count_leading_zeros (normalization_steps, divisor_limb); + if( normalization_steps ) { + divisor_limb <<= normalization_steps; + + n1 = dividend_ptr[dividend_size - 1]; + r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps); + + /* Possible optimization: + * if (r == 0 + * && divisor_limb > ((n1 << normalization_steps) + * | (dividend_ptr[dividend_size - 2] >> ...))) + * ...one division less... + */ + for( i = dividend_size - 2; i >= 0; i--) { + n0 = dividend_ptr[i]; + udiv_qrnnd (quot_ptr[i + 1], r, r, + ((n1 << normalization_steps) + | (n0 >> (BITS_PER_MPI_LIMB - normalization_steps))), + divisor_limb); + n1 = n0; + } + udiv_qrnnd (quot_ptr[0], r, r, + n1 << normalization_steps, + divisor_limb); + return r >> normalization_steps; + } + } + /* No normalization needed, either because udiv_qrnnd doesn't require + * it, or because DIVISOR_LIMB is already normalized. */ + i = dividend_size - 1; + r = dividend_ptr[i]; + + if(r >= divisor_limb) + r = 0; + else + quot_ptr[i--] = 0; + + for(; i >= 0; i--) { + n0 = dividend_ptr[i]; + udiv_qrnnd( quot_ptr[i], r, r, n0, divisor_limb ); + } + return r; + } +} + + diff -uprN linux-2.6.32/crypto/mpi/mpih-mul.c linux-2.6.32.ovz/crypto/mpi/mpih-mul.c --- linux-2.6.32/crypto/mpi/mpih-mul.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/mpih-mul.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,546 @@ +/* mpihelp-mul.c - MPI helper functions + * Copyright (C) 1994, 1996, 1998, 1999, + * 2000 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + +#include +#include "mpi-internal.h" +#include "longlong.h" + + + +#define MPN_MUL_N_RECURSE(prodp, up, vp, size, tspace) \ + do { \ + if( (size) < KARATSUBA_THRESHOLD ) \ + mul_n_basecase (prodp, up, vp, size); \ + else \ + mul_n (prodp, up, vp, size, tspace); \ + } while (0); + +#define MPN_SQR_N_RECURSE(prodp, up, size, tspace) \ + do { \ + if ((size) < KARATSUBA_THRESHOLD) \ + mpih_sqr_n_basecase (prodp, up, size); \ + else \ + mpih_sqr_n (prodp, up, size, tspace); \ + } while (0); + + + + +/* Multiply the natural numbers u (pointed to by UP) and v (pointed to by VP), + * both with SIZE limbs, and store the result at PRODP. 2 * SIZE limbs are + * always stored. Return the most significant limb. + * + * Argument constraints: + * 1. PRODP != UP and PRODP != VP, i.e. the destination + * must be distinct from the multiplier and the multiplicand. + * + * + * Handle simple cases with traditional multiplication. + * + * This is the most critical code of multiplication. All multiplies rely + * on this, both small and huge. Small ones arrive here immediately. Huge + * ones arrive here as this is the base case for Karatsuba's recursive + * algorithm below. + */ + +static mpi_limb_t +mul_n_basecase( mpi_ptr_t prodp, mpi_ptr_t up, + mpi_ptr_t vp, mpi_size_t size) +{ + mpi_size_t i; + mpi_limb_t cy; + mpi_limb_t v_limb; + + /* Multiply by the first limb in V separately, as the result can be + * stored (not added) to PROD. We also avoid a loop for zeroing. */ + v_limb = vp[0]; + if( v_limb <= 1 ) { + if( v_limb == 1 ) + MPN_COPY( prodp, up, size ); + else + MPN_ZERO( prodp, size ); + cy = 0; + } + else + cy = mpihelp_mul_1( prodp, up, size, v_limb ); + + prodp[size] = cy; + prodp++; + + /* For each iteration in the outer loop, multiply one limb from + * U with one limb from V, and add it to PROD. */ + for( i = 1; i < size; i++ ) { + v_limb = vp[i]; + if( v_limb <= 1 ) { + cy = 0; + if( v_limb == 1 ) + cy = mpihelp_add_n(prodp, prodp, up, size); + } + else + cy = mpihelp_addmul_1(prodp, up, size, v_limb); + + prodp[size] = cy; + prodp++; + } + + return cy; +} + + +static void +mul_n( mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp, + mpi_size_t size, mpi_ptr_t tspace ) +{ + if( size & 1 ) { + /* The size is odd, and the code below doesn't handle that. + * Multiply the least significant (size - 1) limbs with a recursive + * call, and handle the most significant limb of S1 and S2 + * separately. + * A slightly faster way to do this would be to make the Karatsuba + * code below behave as if the size were even, and let it check for + * odd size in the end. I.e., in essence move this code to the end. + * Doing so would save us a recursive call, and potentially make the + * stack grow a lot less. + */ + mpi_size_t esize = size - 1; /* even size */ + mpi_limb_t cy_limb; + + MPN_MUL_N_RECURSE( prodp, up, vp, esize, tspace ); + cy_limb = mpihelp_addmul_1( prodp + esize, up, esize, vp[esize] ); + prodp[esize + esize] = cy_limb; + cy_limb = mpihelp_addmul_1( prodp + esize, vp, size, up[esize] ); + prodp[esize + size] = cy_limb; + } + else { + /* Anatolij Alekseevich Karatsuba's divide-and-conquer algorithm. + * + * Split U in two pieces, U1 and U0, such that + * U = U0 + U1*(B**n), + * and V in V1 and V0, such that + * V = V0 + V1*(B**n). + * + * UV is then computed recursively using the identity + * + * 2n n n n + * UV = (B + B )U V + B (U -U )(V -V ) + (B + 1)U V + * 1 1 1 0 0 1 0 0 + * + * Where B = 2**BITS_PER_MP_LIMB. + */ + mpi_size_t hsize = size >> 1; + mpi_limb_t cy; + int negflg; + + /* Product H. ________________ ________________ + * |_____U1 x V1____||____U0 x V0_____| + * Put result in upper part of PROD and pass low part of TSPACE + * as new TSPACE. + */ + MPN_MUL_N_RECURSE(prodp + size, up + hsize, vp + hsize, hsize, tspace); + + /* Product M. ________________ + * |_(U1-U0)(V0-V1)_| + */ + if( mpihelp_cmp(up + hsize, up, hsize) >= 0 ) { + mpihelp_sub_n(prodp, up + hsize, up, hsize); + negflg = 0; + } + else { + mpihelp_sub_n(prodp, up, up + hsize, hsize); + negflg = 1; + } + if( mpihelp_cmp(vp + hsize, vp, hsize) >= 0 ) { + mpihelp_sub_n(prodp + hsize, vp + hsize, vp, hsize); + negflg ^= 1; + } + else { + mpihelp_sub_n(prodp + hsize, vp, vp + hsize, hsize); + /* No change of NEGFLG. */ + } + /* Read temporary operands from low part of PROD. + * Put result in low part of TSPACE using upper part of TSPACE + * as new TSPACE. + */ + MPN_MUL_N_RECURSE(tspace, prodp, prodp + hsize, hsize, tspace + size); + + /* Add/copy product H. */ + MPN_COPY (prodp + hsize, prodp + size, hsize); + cy = mpihelp_add_n( prodp + size, prodp + size, + prodp + size + hsize, hsize); + + /* Add product M (if NEGFLG M is a negative number) */ + if(negflg) + cy -= mpihelp_sub_n(prodp + hsize, prodp + hsize, tspace, size); + else + cy += mpihelp_add_n(prodp + hsize, prodp + hsize, tspace, size); + + /* Product L. ________________ ________________ + * |________________||____U0 x V0_____| + * Read temporary operands from low part of PROD. + * Put result in low part of TSPACE using upper part of TSPACE + * as new TSPACE. + */ + MPN_MUL_N_RECURSE(tspace, up, vp, hsize, tspace + size); + + /* Add/copy Product L (twice) */ + + cy += mpihelp_add_n(prodp + hsize, prodp + hsize, tspace, size); + if( cy ) + mpihelp_add_1(prodp + hsize + size, prodp + hsize + size, hsize, cy); + + MPN_COPY(prodp, tspace, hsize); + cy = mpihelp_add_n(prodp + hsize, prodp + hsize, tspace + hsize, hsize); + if( cy ) + mpihelp_add_1(prodp + size, prodp + size, size, 1); + } +} + + +void +mpih_sqr_n_basecase( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size ) +{ + mpi_size_t i; + mpi_limb_t cy_limb; + mpi_limb_t v_limb; + + /* Multiply by the first limb in V separately, as the result can be + * stored (not added) to PROD. We also avoid a loop for zeroing. */ + v_limb = up[0]; + if( v_limb <= 1 ) { + if( v_limb == 1 ) + MPN_COPY( prodp, up, size ); + else + MPN_ZERO(prodp, size); + cy_limb = 0; + } + else + cy_limb = mpihelp_mul_1( prodp, up, size, v_limb ); + + prodp[size] = cy_limb; + prodp++; + + /* For each iteration in the outer loop, multiply one limb from + * U with one limb from V, and add it to PROD. */ + for( i=1; i < size; i++) { + v_limb = up[i]; + if( v_limb <= 1 ) { + cy_limb = 0; + if( v_limb == 1 ) + cy_limb = mpihelp_add_n(prodp, prodp, up, size); + } + else + cy_limb = mpihelp_addmul_1(prodp, up, size, v_limb); + + prodp[size] = cy_limb; + prodp++; + } +} + + +void +mpih_sqr_n( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size, mpi_ptr_t tspace) +{ + if( size & 1 ) { + /* The size is odd, and the code below doesn't handle that. + * Multiply the least significant (size - 1) limbs with a recursive + * call, and handle the most significant limb of S1 and S2 + * separately. + * A slightly faster way to do this would be to make the Karatsuba + * code below behave as if the size were even, and let it check for + * odd size in the end. I.e., in essence move this code to the end. + * Doing so would save us a recursive call, and potentially make the + * stack grow a lot less. + */ + mpi_size_t esize = size - 1; /* even size */ + mpi_limb_t cy_limb; + + MPN_SQR_N_RECURSE( prodp, up, esize, tspace ); + cy_limb = mpihelp_addmul_1( prodp + esize, up, esize, up[esize] ); + prodp[esize + esize] = cy_limb; + cy_limb = mpihelp_addmul_1( prodp + esize, up, size, up[esize] ); + + prodp[esize + size] = cy_limb; + } + else { + mpi_size_t hsize = size >> 1; + mpi_limb_t cy; + + /* Product H. ________________ ________________ + * |_____U1 x U1____||____U0 x U0_____| + * Put result in upper part of PROD and pass low part of TSPACE + * as new TSPACE. + */ + MPN_SQR_N_RECURSE(prodp + size, up + hsize, hsize, tspace); + + /* Product M. ________________ + * |_(U1-U0)(U0-U1)_| + */ + if( mpihelp_cmp( up + hsize, up, hsize) >= 0 ) + mpihelp_sub_n( prodp, up + hsize, up, hsize); + else + mpihelp_sub_n (prodp, up, up + hsize, hsize); + + /* Read temporary operands from low part of PROD. + * Put result in low part of TSPACE using upper part of TSPACE + * as new TSPACE. */ + MPN_SQR_N_RECURSE(tspace, prodp, hsize, tspace + size); + + /* Add/copy product H */ + MPN_COPY(prodp + hsize, prodp + size, hsize); + cy = mpihelp_add_n(prodp + size, prodp + size, + prodp + size + hsize, hsize); + + /* Add product M (if NEGFLG M is a negative number). */ + cy -= mpihelp_sub_n (prodp + hsize, prodp + hsize, tspace, size); + + /* Product L. ________________ ________________ + * |________________||____U0 x U0_____| + * Read temporary operands from low part of PROD. + * Put result in low part of TSPACE using upper part of TSPACE + * as new TSPACE. */ + MPN_SQR_N_RECURSE (tspace, up, hsize, tspace + size); + + /* Add/copy Product L (twice). */ + cy += mpihelp_add_n (prodp + hsize, prodp + hsize, tspace, size); + if( cy ) + mpihelp_add_1(prodp + hsize + size, prodp + hsize + size, + hsize, cy); + + MPN_COPY(prodp, tspace, hsize); + cy = mpihelp_add_n (prodp + hsize, prodp + hsize, tspace + hsize, hsize); + if( cy ) + mpihelp_add_1 (prodp + size, prodp + size, size, 1); + } +} + + +/* This should be made into an inline function in gmp.h. */ +int +mpihelp_mul_n( mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp, mpi_size_t size) +{ + if( up == vp ) { + if( size < KARATSUBA_THRESHOLD ) + mpih_sqr_n_basecase( prodp, up, size ); + else { + mpi_ptr_t tspace; + tspace = mpi_alloc_limb_space( 2 * size ); + if (!tspace) + return -ENOMEM; + mpih_sqr_n( prodp, up, size, tspace ); + mpi_free_limb_space( tspace ); + } + } + else { + if( size < KARATSUBA_THRESHOLD ) + mul_n_basecase( prodp, up, vp, size ); + else { + mpi_ptr_t tspace; + tspace = mpi_alloc_limb_space( 2 * size ); + if (!tspace) + return -ENOMEM; + mul_n (prodp, up, vp, size, tspace); + mpi_free_limb_space( tspace ); + } + } + + return 0; +} + + + +int +mpihelp_mul_karatsuba_case( mpi_ptr_t prodp, + mpi_ptr_t up, mpi_size_t usize, + mpi_ptr_t vp, mpi_size_t vsize, + struct karatsuba_ctx *ctx ) +{ + mpi_limb_t cy; + + if( !ctx->tspace || ctx->tspace_size < vsize ) { + if( ctx->tspace ) + mpi_free_limb_space( ctx->tspace ); + ctx->tspace = mpi_alloc_limb_space( 2 * vsize); + if (!ctx->tspace) + return -ENOMEM; + ctx->tspace_size = vsize; + } + + MPN_MUL_N_RECURSE( prodp, up, vp, vsize, ctx->tspace ); + + prodp += vsize; + up += vsize; + usize -= vsize; + if( usize >= vsize ) { + if( !ctx->tp || ctx->tp_size < vsize ) { + if( ctx->tp ) + mpi_free_limb_space( ctx->tp ); + ctx->tp = mpi_alloc_limb_space( 2 * vsize ); + if (!ctx->tp) { + if( ctx->tspace ) + mpi_free_limb_space( ctx->tspace ); + ctx->tspace = NULL; + return -ENOMEM; + } + ctx->tp_size = vsize; + } + + do { + MPN_MUL_N_RECURSE( ctx->tp, up, vp, vsize, ctx->tspace ); + cy = mpihelp_add_n( prodp, prodp, ctx->tp, vsize ); + mpihelp_add_1( prodp + vsize, ctx->tp + vsize, vsize, cy ); + prodp += vsize; + up += vsize; + usize -= vsize; + } while( usize >= vsize ); + } + + if( usize ) { + if( usize < KARATSUBA_THRESHOLD ) { + mpi_limb_t tmp; + if (mpihelp_mul( ctx->tspace, vp, vsize, up, usize, &tmp) < 0) + return -ENOMEM; + } + else { + if( !ctx->next ) { + ctx->next = kzalloc( sizeof *ctx, GFP_KERNEL ); + if (!ctx->next) + return -ENOMEM; + } + if (mpihelp_mul_karatsuba_case( ctx->tspace, + vp, vsize, + up, usize, + ctx->next ) < 0) + return -ENOMEM; + } + + cy = mpihelp_add_n( prodp, prodp, ctx->tspace, vsize); + mpihelp_add_1( prodp + vsize, ctx->tspace + vsize, usize, cy ); + } + + return 0; +} + + +void +mpihelp_release_karatsuba_ctx( struct karatsuba_ctx *ctx ) +{ + struct karatsuba_ctx *ctx2; + + if( ctx->tp ) + mpi_free_limb_space( ctx->tp ); + if( ctx->tspace ) + mpi_free_limb_space( ctx->tspace ); + for( ctx=ctx->next; ctx; ctx = ctx2 ) { + ctx2 = ctx->next; + if( ctx->tp ) + mpi_free_limb_space( ctx->tp ); + if( ctx->tspace ) + mpi_free_limb_space( ctx->tspace ); + kfree( ctx ); + } +} + +/* Multiply the natural numbers u (pointed to by UP, with USIZE limbs) + * and v (pointed to by VP, with VSIZE limbs), and store the result at + * PRODP. USIZE + VSIZE limbs are always stored, but if the input + * operands are normalized. Return the most significant limb of the + * result. + * + * NOTE: The space pointed to by PRODP is overwritten before finished + * with U and V, so overlap is an error. + * + * Argument constraints: + * 1. USIZE >= VSIZE. + * 2. PRODP != UP and PRODP != VP, i.e. the destination + * must be distinct from the multiplier and the multiplicand. + */ + +int +mpihelp_mul( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize, + mpi_ptr_t vp, mpi_size_t vsize, + mpi_limb_t *_result) +{ + mpi_ptr_t prod_endp = prodp + usize + vsize - 1; + mpi_limb_t cy; + struct karatsuba_ctx ctx; + + if( vsize < KARATSUBA_THRESHOLD ) { + mpi_size_t i; + mpi_limb_t v_limb; + + if( !vsize ) { + *_result = 0; + return 0; + } + + /* Multiply by the first limb in V separately, as the result can be + * stored (not added) to PROD. We also avoid a loop for zeroing. */ + v_limb = vp[0]; + if( v_limb <= 1 ) { + if( v_limb == 1 ) + MPN_COPY( prodp, up, usize ); + else + MPN_ZERO( prodp, usize ); + cy = 0; + } + else + cy = mpihelp_mul_1( prodp, up, usize, v_limb ); + + prodp[usize] = cy; + prodp++; + + /* For each iteration in the outer loop, multiply one limb from + * U with one limb from V, and add it to PROD. */ + for( i = 1; i < vsize; i++ ) { + v_limb = vp[i]; + if( v_limb <= 1 ) { + cy = 0; + if( v_limb == 1 ) + cy = mpihelp_add_n(prodp, prodp, up, usize); + } + else + cy = mpihelp_addmul_1(prodp, up, usize, v_limb); + + prodp[usize] = cy; + prodp++; + } + + *_result = cy; + return 0; + } + + memset( &ctx, 0, sizeof ctx ); + if (mpihelp_mul_karatsuba_case( prodp, up, usize, vp, vsize, &ctx ) < 0) + return -ENOMEM; + mpihelp_release_karatsuba_ctx( &ctx ); + *_result = *prod_endp; + return 0; +} + + diff -uprN linux-2.6.32/crypto/mpi/mpi-inline.c linux-2.6.32.ovz/crypto/mpi/mpi-inline.c --- linux-2.6.32/crypto/mpi/mpi-inline.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/mpi-inline.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,33 @@ +/* mpi-inline.c + * Copyright (C) 1999, 2000, 2001 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + + +/* put the inline functions as real functions into the lib */ +#define G10_MPI_INLINE_DECL + +#include "mpi-internal.h" + +/* always include the header becuase it is only + * included by mpi-internal if __GCC__ is defined but we + * need it here in all cases and the above definition of + * of the macro allows us to do so + */ +#include "mpi-inline.h" + diff -uprN linux-2.6.32/crypto/mpi/mpi-inline.h linux-2.6.32.ovz/crypto/mpi/mpi-inline.h --- linux-2.6.32/crypto/mpi/mpi-inline.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/mpi-inline.h 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,128 @@ +/* mpi-inline.h - Internal to the Multi Precision Integers + * Copyright (C) 1994, 1996, 1998, 1999 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + +#ifndef G10_MPI_INLINE_H +#define G10_MPI_INLINE_H + +#ifndef G10_MPI_INLINE_DECL + #define G10_MPI_INLINE_DECL extern __inline__ +#endif + +G10_MPI_INLINE_DECL mpi_limb_t +mpihelp_add_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, + mpi_size_t s1_size, mpi_limb_t s2_limb) +{ + mpi_limb_t x; + + x = *s1_ptr++; + s2_limb += x; + *res_ptr++ = s2_limb; + if( s2_limb < x ) { /* sum is less than the left operand: handle carry */ + while( --s1_size ) { + x = *s1_ptr++ + 1; /* add carry */ + *res_ptr++ = x; /* and store */ + if( x ) /* not 0 (no overflow): we can stop */ + goto leave; + } + return 1; /* return carry (size of s1 to small) */ + } + + leave: + if( res_ptr != s1_ptr ) { /* not the same variable */ + mpi_size_t i; /* copy the rest */ + for( i=0; i < s1_size-1; i++ ) + res_ptr[i] = s1_ptr[i]; + } + return 0; /* no carry */ +} + + + +G10_MPI_INLINE_DECL mpi_limb_t +mpihelp_add(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, + mpi_ptr_t s2_ptr, mpi_size_t s2_size) +{ + mpi_limb_t cy = 0; + + if( s2_size ) + cy = mpihelp_add_n( res_ptr, s1_ptr, s2_ptr, s2_size ); + + if( s1_size - s2_size ) + cy = mpihelp_add_1( res_ptr + s2_size, s1_ptr + s2_size, + s1_size - s2_size, cy); + return cy; +} + + +G10_MPI_INLINE_DECL mpi_limb_t +mpihelp_sub_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, + mpi_size_t s1_size, mpi_limb_t s2_limb ) +{ + mpi_limb_t x; + + x = *s1_ptr++; + s2_limb = x - s2_limb; + *res_ptr++ = s2_limb; + if( s2_limb > x ) { + while( --s1_size ) { + x = *s1_ptr++; + *res_ptr++ = x - 1; + if( x ) + goto leave; + } + return 1; + } + + leave: + if( res_ptr != s1_ptr ) { + mpi_size_t i; + for( i=0; i < s1_size-1; i++ ) + res_ptr[i] = s1_ptr[i]; + } + return 0; +} + + + +G10_MPI_INLINE_DECL mpi_limb_t +mpihelp_sub( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, + mpi_ptr_t s2_ptr, mpi_size_t s2_size) +{ + mpi_limb_t cy = 0; + + if( s2_size ) + cy = mpihelp_sub_n(res_ptr, s1_ptr, s2_ptr, s2_size); + + if( s1_size - s2_size ) + cy = mpihelp_sub_1(res_ptr + s2_size, s1_ptr + s2_size, + s1_size - s2_size, cy); + return cy; +} + + +#endif /*G10_MPI_INLINE_H*/ diff -uprN linux-2.6.32/crypto/mpi/mpi-internal.h linux-2.6.32.ovz/crypto/mpi/mpi-internal.h --- linux-2.6.32/crypto/mpi/mpi-internal.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/mpi-internal.h 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,265 @@ +/* mpi-internal.h - Internal to the Multi Precision Integers + * Copyright (C) 1994, 1996 Free Software Foundation, Inc. + * Copyright (C) 1998, 2000 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + +#ifndef G10_MPI_INTERNAL_H +#define G10_MPI_INTERNAL_H + +#include +#include +#include +#include +#include + +#define log_debug printk +#define log_bug printk + +#define assert(x) do { \ + if (!x) log_bug("failed assertion\n"); \ + } while(0); + +/* If KARATSUBA_THRESHOLD is not already defined, define it to a + * value which is good on most machines. */ + +/* tested 4, 16, 32 and 64, where 16 gave the best performance when + * checking a 768 and a 1024 bit ElGamal signature. + * (wk 22.12.97) */ +#ifndef KARATSUBA_THRESHOLD + #define KARATSUBA_THRESHOLD 16 +#endif + +/* The code can't handle KARATSUBA_THRESHOLD smaller than 2. */ +#if KARATSUBA_THRESHOLD < 2 + #undef KARATSUBA_THRESHOLD + #define KARATSUBA_THRESHOLD 2 +#endif + + +typedef mpi_limb_t *mpi_ptr_t; /* pointer to a limb */ +typedef int mpi_size_t; /* (must be a signed type) */ + +#define ABS(x) (x >= 0 ? x : -x) +#define MIN(l,o) ((l) < (o) ? (l) : (o)) +#define MAX(h,i) ((h) > (i) ? (h) : (i)) + +static inline int RESIZE_IF_NEEDED(MPI a, unsigned b) +{ + if (a->alloced < b) + return mpi_resize(a,b); + return 0; +} + +/* Copy N limbs from S to D. */ +#define MPN_COPY( d, s, n) \ + do { \ + mpi_size_t _i; \ + for( _i = 0; _i < (n); _i++ ) \ + (d)[_i] = (s)[_i]; \ + } while(0) + +#define MPN_COPY_INCR( d, s, n) \ + do { \ + mpi_size_t _i; \ + for( _i = 0; _i < (n); _i++ ) \ + (d)[_i] = (d)[_i]; \ + } while (0) + +#define MPN_COPY_DECR( d, s, n ) \ + do { \ + mpi_size_t _i; \ + for( _i = (n)-1; _i >= 0; _i--) \ + (d)[_i] = (s)[_i]; \ + } while(0) + +/* Zero N limbs at D */ +#define MPN_ZERO(d, n) \ + do { \ + int _i; \ + for( _i = 0; _i < (n); _i++ ) \ + (d)[_i] = 0; \ + } while (0) + +#define MPN_NORMALIZE(d, n) \ + do { \ + while( (n) > 0 ) { \ + if( (d)[(n)-1] ) \ + break; \ + (n)--; \ + } \ + } while(0) + +#define MPN_NORMALIZE_NOT_ZERO(d, n) \ + do { \ + for(;;) { \ + if( (d)[(n)-1] ) \ + break; \ + (n)--; \ + } \ + } while(0) + +#define MPN_MUL_N_RECURSE(prodp, up, vp, size, tspace) \ + do { \ + if( (size) < KARATSUBA_THRESHOLD ) \ + mul_n_basecase (prodp, up, vp, size); \ + else \ + mul_n (prodp, up, vp, size, tspace); \ + } while (0); + + +/* Divide the two-limb number in (NH,,NL) by D, with DI being the largest + * limb not larger than (2**(2*BITS_PER_MP_LIMB))/D - (2**BITS_PER_MP_LIMB). + * If this would yield overflow, DI should be the largest possible number + * (i.e., only ones). For correct operation, the most significant bit of D + * has to be set. Put the quotient in Q and the remainder in R. + */ +#define UDIV_QRNND_PREINV(q, r, nh, nl, d, di) \ + do { \ + mpi_limb_t _q, _ql, _r; \ + mpi_limb_t _xh, _xl; \ + umul_ppmm (_q, _ql, (nh), (di)); \ + _q += (nh); /* DI is 2**BITS_PER_MPI_LIMB too small */ \ + umul_ppmm (_xh, _xl, _q, (d)); \ + sub_ddmmss (_xh, _r, (nh), (nl), _xh, _xl); \ + if( _xh ) { \ + sub_ddmmss (_xh, _r, _xh, _r, 0, (d)); \ + _q++; \ + if( _xh) { \ + sub_ddmmss (_xh, _r, _xh, _r, 0, (d)); \ + _q++; \ + } \ + } \ + if( _r >= (d) ) { \ + _r -= (d); \ + _q++; \ + } \ + (r) = _r; \ + (q) = _q; \ + } while (0) + + +/*-- mpiutil.c --*/ +mpi_ptr_t mpi_alloc_limb_space( unsigned nlimbs ); +void mpi_free_limb_space( mpi_ptr_t a ); +void mpi_assign_limb_space( MPI a, mpi_ptr_t ap, unsigned nlimbs ); + +/*-- mpi-bit.c --*/ +void mpi_rshift_limbs( MPI a, unsigned int count ); +int mpi_lshift_limbs( MPI a, unsigned int count ); + + +/*-- mpihelp-add.c --*/ +mpi_limb_t mpihelp_add_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, + mpi_size_t s1_size, mpi_limb_t s2_limb ); +mpi_limb_t mpihelp_add_n( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, + mpi_ptr_t s2_ptr, mpi_size_t size); +mpi_limb_t mpihelp_add(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, + mpi_ptr_t s2_ptr, mpi_size_t s2_size); + +/*-- mpihelp-sub.c --*/ +mpi_limb_t mpihelp_sub_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, + mpi_size_t s1_size, mpi_limb_t s2_limb ); +mpi_limb_t mpihelp_sub_n( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, + mpi_ptr_t s2_ptr, mpi_size_t size); +mpi_limb_t mpihelp_sub(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, + mpi_ptr_t s2_ptr, mpi_size_t s2_size); + +/*-- mpihelp-cmp.c --*/ +int mpihelp_cmp( mpi_ptr_t op1_ptr, mpi_ptr_t op2_ptr, mpi_size_t size ); + +/*-- mpihelp-mul.c --*/ + +struct karatsuba_ctx { + struct karatsuba_ctx *next; + mpi_ptr_t tspace; + mpi_size_t tspace_size; + mpi_ptr_t tp; + mpi_size_t tp_size; +}; + +void mpihelp_release_karatsuba_ctx( struct karatsuba_ctx *ctx ); + +mpi_limb_t mpihelp_addmul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, + mpi_size_t s1_size, mpi_limb_t s2_limb); +mpi_limb_t mpihelp_submul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, + mpi_size_t s1_size, mpi_limb_t s2_limb); +int mpihelp_mul_n( mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp, + mpi_size_t size); +int mpihelp_mul( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize, + mpi_ptr_t vp, mpi_size_t vsize, mpi_limb_t *_result); +void mpih_sqr_n_basecase( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size ); +void mpih_sqr_n( mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size, + mpi_ptr_t tspace); + +int mpihelp_mul_karatsuba_case( mpi_ptr_t prodp, + mpi_ptr_t up, mpi_size_t usize, + mpi_ptr_t vp, mpi_size_t vsize, + struct karatsuba_ctx *ctx ); + + +/*-- mpihelp-mul_1.c (or xxx/cpu/ *.S) --*/ +mpi_limb_t mpihelp_mul_1( mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, + mpi_size_t s1_size, mpi_limb_t s2_limb); + +/*-- mpihelp-div.c --*/ +mpi_limb_t mpihelp_mod_1(mpi_ptr_t dividend_ptr, mpi_size_t dividend_size, + mpi_limb_t divisor_limb); +mpi_limb_t mpihelp_divrem( mpi_ptr_t qp, mpi_size_t qextra_limbs, + mpi_ptr_t np, mpi_size_t nsize, + mpi_ptr_t dp, mpi_size_t dsize); +mpi_limb_t mpihelp_divmod_1( mpi_ptr_t quot_ptr, + mpi_ptr_t dividend_ptr, mpi_size_t dividend_size, + mpi_limb_t divisor_limb); + +/*-- mpihelp-shift.c --*/ +mpi_limb_t mpihelp_lshift( mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize, + unsigned cnt); +mpi_limb_t mpihelp_rshift( mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize, + unsigned cnt); + + +/* Define stuff for longlong.h. */ +#define W_TYPE_SIZE BITS_PER_MPI_LIMB + typedef mpi_limb_t UWtype; + typedef unsigned int UHWtype; +#if defined (__GNUC__) + typedef unsigned int UQItype __attribute__ ((mode (QI))); + typedef int SItype __attribute__ ((mode (SI))); + typedef unsigned int USItype __attribute__ ((mode (SI))); + typedef int DItype __attribute__ ((mode (DI))); + typedef unsigned int UDItype __attribute__ ((mode (DI))); +#else + typedef unsigned char UQItype; + typedef long SItype; + typedef unsigned long USItype; +#endif + +#ifdef __GNUC__ + #include "mpi-inline.h" +#endif + +#endif /*G10_MPI_INTERNAL_H*/ diff -uprN linux-2.6.32/crypto/mpi/mpi-inv.c linux-2.6.32.ovz/crypto/mpi/mpi-inv.c --- linux-2.6.32/crypto/mpi/mpi-inv.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/mpi-inv.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,148 @@ +/* mpi-inv.c - MPI functions + * Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include "mpi-internal.h" + + +/**************** + * Calculate the multiplicative inverse X of A mod N + * That is: Find the solution x for + * 1 = (a*x) mod n + */ +int +mpi_invm( MPI x, const MPI a, const MPI n ) +{ + /* Extended Euclid's algorithm (See TAOPC Vol II, 4.5.2, Alg X) + * modified according to Michael Penk's solution for Exercice 35 + * with further enhancement */ + MPI u = NULL, v = NULL; + MPI u1 = NULL, u2 = NULL, u3 = NULL; + MPI v1 = NULL, v2 = NULL, v3 = NULL; + MPI t1 = NULL, t2 = NULL, t3 = NULL; + unsigned k; + int sign; + int odd = 0; + int rc = -ENOMEM; + + if (mpi_copy(&u, a) < 0) goto cleanup; + if (mpi_copy(&v, n) < 0) goto cleanup; + + for(k=0; !mpi_test_bit(u,0) && !mpi_test_bit(v,0); k++ ) { + if (mpi_rshift(u, u, 1) < 0) goto cleanup; + if (mpi_rshift(v, v, 1) < 0) goto cleanup; + } + odd = mpi_test_bit(v,0); + + u1 = mpi_alloc_set_ui(1); if (!u1) goto cleanup; + if( !odd ) { + u2 = mpi_alloc_set_ui(0); + if (!u2) goto cleanup; + } + if (mpi_copy(&u3, u) < 0) goto cleanup; + if (mpi_copy(&v1, v) < 0) goto cleanup; + if( !odd ) { + v2 = mpi_alloc( mpi_get_nlimbs(u) ); if (!v2) goto cleanup; + if (mpi_sub( v2, u1, u ) < 0) goto cleanup; /* U is used as const 1 */ + } + if (mpi_copy(&v3, v) < 0) goto cleanup; + if( mpi_test_bit(u, 0) ) { /* u is odd */ + t1 = mpi_alloc_set_ui(0); if (!t1) goto cleanup; + if( !odd ) { + t2 = mpi_alloc_set_ui(1); if (!t2) goto cleanup; + t2->sign = 1; + } + if (mpi_copy(&t3, v) < 0) goto cleanup; + t3->sign = !t3->sign; + goto Y4; + } + else { + t1 = mpi_alloc_set_ui(1); if (!t1) goto cleanup; + if( !odd ) { + t2 = mpi_alloc_set_ui(0); if (!t2) goto cleanup; + } + if (mpi_copy(&t3, u) < 0) goto cleanup; + } + do { + do { + if( !odd ) { + if( mpi_test_bit(t1, 0) || mpi_test_bit(t2, 0) ) { /* one is odd */ + if (mpi_add(t1, t1, v) < 0) goto cleanup; + if (mpi_sub(t2, t2, u) < 0) goto cleanup; + } + if (mpi_rshift(t1, t1, 1) < 0) goto cleanup; + if (mpi_rshift(t2, t2, 1) < 0) goto cleanup; + if (mpi_rshift(t3, t3, 1) < 0) goto cleanup; + } + else { + if( mpi_test_bit(t1, 0) ) + if (mpi_add(t1, t1, v) < 0) goto cleanup; + if (mpi_rshift(t1, t1, 1) < 0) goto cleanup; + if (mpi_rshift(t3, t3, 1) < 0) goto cleanup; + } + Y4: + ; + } while( !mpi_test_bit( t3, 0 ) ); /* while t3 is even */ + + if( !t3->sign ) { + if (mpi_set(u1, t1) < 0) goto cleanup; + if( !odd ) + if (mpi_set(u2, t2) < 0) goto cleanup; + if (mpi_set(u3, t3) < 0) goto cleanup; + } + else { + if (mpi_sub(v1, v, t1) < 0) goto cleanup; + sign = u->sign; u->sign = !u->sign; + if( !odd ) + if (mpi_sub(v2, u, t2) < 0) goto cleanup; + u->sign = sign; + sign = t3->sign; t3->sign = !t3->sign; + if (mpi_set(v3, t3) < 0) goto cleanup; + t3->sign = sign; + } + if (mpi_sub(t1, u1, v1) < 0) goto cleanup; + if( !odd ) + if (mpi_sub(t2, u2, v2) < 0) goto cleanup; + if (mpi_sub(t3, u3, v3) < 0) goto cleanup; + if( t1->sign ) { + if (mpi_add(t1, t1, v) < 0) goto cleanup; + if( !odd ) + if (mpi_sub(t2, t2, u) < 0) goto cleanup; + } + } while( mpi_cmp_ui( t3, 0 ) ); /* while t3 != 0 */ + /* mpi_lshift( u3, k ); */ + rc = mpi_set(x, u1); + + cleanup: + mpi_free(u1); + mpi_free(v1); + mpi_free(t1); + if( !odd ) { + mpi_free(u2); + mpi_free(v2); + mpi_free(t2); + } + mpi_free(u3); + mpi_free(v3); + mpi_free(t3); + + mpi_free(u); + mpi_free(v); + return rc; +} diff -uprN linux-2.6.32/crypto/mpi/mpi-mpow.c linux-2.6.32.ovz/crypto/mpi/mpi-mpow.c --- linux-2.6.32/crypto/mpi/mpi-mpow.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/mpi-mpow.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,113 @@ +/* mpi-mpow.c - MPI functions + * Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include "mpi-internal.h" +#include "longlong.h" + + +static int +build_index(const MPI *exparray, int k, int i, int t ) +{ + int j, bitno; + int index = 0; + + bitno = t-i; + for(j=k-1; j >= 0; j-- ) { + index <<= 1; + if( mpi_test_bit( exparray[j], bitno ) ) + index |= 1; + } + return index; +} + +/**************** + * RES = (BASE[0] ^ EXP[0]) * (BASE[1] ^ EXP[1]) * ... * mod M + */ +int +mpi_mulpowm( MPI res, MPI *basearray, MPI *exparray, MPI m) +{ + int rc = -ENOMEM; + int k; /* number of elements */ + int t; /* bit size of largest exponent */ + int i, j, idx; + MPI *G = NULL; /* table with precomputed values of size 2^k */ + MPI tmp = NULL; + + for(k=0; basearray[k]; k++ ) + ; + if (!k) { printk("mpi_mulpowm: assert(k) failed\n"); BUG(); } + for(t=0, i=0; (tmp=exparray[i]); i++ ) { + j = mpi_get_nbits(tmp); + if( j > t ) + t = j; + } + if (i!=k) { printk("mpi_mulpowm: assert(i==k) failed\n"); BUG(); } + if (!t) { printk("mpi_mulpowm: assert(t) failed\n"); BUG(); } + if (k>=10) { printk("mpi_mulpowm: assert(k<10) failed\n"); BUG(); } + + G = kzalloc( (1<= 0 && idx < (1<= 0 && idx < (1<nlimbs; + sign = mult->sign; + + if( !size || !small_mult ) { + prod->nlimbs = 0; + prod->sign = 0; + return 0; + } + + prod_size = size + 1; + if( prod->alloced < prod_size ) + if (mpi_resize( prod, prod_size ) < 0) + return -ENOMEM; + prod_ptr = prod->d; + + cy = mpihelp_mul_1( prod_ptr, mult->d, size, (mpi_limb_t)small_mult ); + if( cy ) + prod_ptr[size++] = cy; + prod->nlimbs = size; + prod->sign = sign; + return 0; +} + + +int +mpi_mul_2exp( MPI w, MPI u, unsigned long cnt) +{ + mpi_size_t usize, wsize, limb_cnt; + mpi_ptr_t wp; + mpi_limb_t wlimb; + int usign, wsign; + + usize = u->nlimbs; + usign = u->sign; + + if( !usize ) { + w->nlimbs = 0; + w->sign = 0; + return 0; + } + + limb_cnt = cnt / BITS_PER_MPI_LIMB; + wsize = usize + limb_cnt + 1; + if( w->alloced < wsize ) + if (mpi_resize(w, wsize ) < 0) + return -ENOMEM; + wp = w->d; + wsize = usize + limb_cnt; + wsign = usign; + + cnt %= BITS_PER_MPI_LIMB; + if( cnt ) { + wlimb = mpihelp_lshift( wp + limb_cnt, u->d, usize, cnt ); + if( wlimb ) { + wp[wsize] = wlimb; + wsize++; + } + } + else { + MPN_COPY_DECR( wp + limb_cnt, u->d, usize ); + } + + /* Zero all whole limbs at low end. Do it here and not before calling + * mpn_lshift, not to lose for U == W. */ + MPN_ZERO( wp, limb_cnt ); + + w->nlimbs = wsize; + w->sign = wsign; + return 0; +} + +int +mpi_mul( MPI w, MPI u, MPI v) +{ + int rc = -ENOMEM; + mpi_size_t usize, vsize, wsize; + mpi_ptr_t up, vp, wp; + mpi_limb_t cy; + int usign, vsign, sign_product; + int assign_wp=0; + mpi_ptr_t tmp_limb=NULL; + + + if( u->nlimbs < v->nlimbs ) { /* Swap U and V. */ + usize = v->nlimbs; + usign = v->sign; + up = v->d; + vsize = u->nlimbs; + vsign = u->sign; + vp = u->d; + } + else { + usize = u->nlimbs; + usign = u->sign; + up = u->d; + vsize = v->nlimbs; + vsign = v->sign; + vp = v->d; + } + sign_product = usign ^ vsign; + wp = w->d; + + /* Ensure W has space enough to store the result. */ + wsize = usize + vsize; + if( w->alloced < (size_t)wsize ) { + if( wp == up || wp == vp ) { + wp = mpi_alloc_limb_space(wsize); + if (!wp) goto nomem; + assign_wp = 1; + } + else { + if (mpi_resize(w, wsize ) < 0) goto nomem; + wp = w->d; + } + } + else { /* Make U and V not overlap with W. */ + if( wp == up ) { + /* W and U are identical. Allocate temporary space for U. */ + up = tmp_limb = mpi_alloc_limb_space( usize); + if (!up) goto nomem; + /* Is V identical too? Keep it identical with U. */ + if( wp == vp ) + vp = up; + /* Copy to the temporary space. */ + MPN_COPY( up, wp, usize ); + } + else if( wp == vp ) { + /* W and V are identical. Allocate temporary space for V. */ + vp = tmp_limb = mpi_alloc_limb_space( vsize); + if (!vp) goto nomem; + /* Copy to the temporary space. */ + MPN_COPY( vp, wp, vsize ); + } + } + + if( !vsize ) + wsize = 0; + else { + if (mpihelp_mul( wp, up, usize, vp, vsize, &cy) < 0) + goto nomem; + wsize -= cy? 0:1; + } + + if( assign_wp ) + mpi_assign_limb_space( w, wp, wsize ); + + w->nlimbs = wsize; + w->sign = sign_product; + rc = 0; + nomem: + if( tmp_limb ) + mpi_free_limb_space( tmp_limb ); + return rc; +} + +int +mpi_mulm( MPI w, MPI u, MPI v, MPI m) +{ + if (mpi_mul(w, u, v) < 0) + return -ENOMEM; + return mpi_fdiv_r( w, w, m ); +} diff -uprN linux-2.6.32/crypto/mpi/mpi-pow.c linux-2.6.32.ovz/crypto/mpi/mpi-pow.c --- linux-2.6.32/crypto/mpi/mpi-pow.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/mpi-pow.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,312 @@ +/* mpi-pow.c - MPI functions + * Copyright (C) 1994, 1996, 1998, 2000 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + * Note: This code is heavily based on the GNU MP Library. + * Actually it's the same code with only minor changes in the + * way the data is stored; this is to support the abstraction + * of an optional secure memory allocation which may be used + * to avoid revealing of sensitive data due to paging etc. + * The GNU MP Library itself is published under the LGPL; + * however I decided to publish this code under the plain GPL. + */ + +#include +#include "mpi-internal.h" +#include "longlong.h" + + +/**************** + * RES = BASE ^ EXP mod MOD + */ +int +mpi_powm( MPI res, MPI base, MPI exp, MPI mod) +{ + mpi_ptr_t mp_marker=NULL, bp_marker=NULL, ep_marker=NULL; + mpi_ptr_t xp_marker=NULL; + mpi_ptr_t tspace = NULL; + mpi_ptr_t rp, ep, mp, bp; + mpi_size_t esize, msize, bsize, rsize; + int esign, msign, bsign, rsign; + mpi_size_t size; + int mod_shift_cnt; + int negative_result; + int assign_rp=0; + mpi_size_t tsize=0; /* to avoid compiler warning */ + /* fixme: we should check that the warning is void*/ + int rc = -ENOMEM; + + esize = exp->nlimbs; + msize = mod->nlimbs; + size = 2 * msize; + esign = exp->sign; + msign = mod->sign; + + rp = res->d; + ep = exp->d; + + if( !msize ) + msize = 1 / msize; /* provoke a signal */ + + if( !esize ) { + /* Exponent is zero, result is 1 mod MOD, i.e., 1 or 0 + * depending on if MOD equals 1. */ + rp[0] = 1; + res->nlimbs = (msize == 1 && mod->d[0] == 1) ? 0 : 1; + res->sign = 0; + goto leave; + } + + /* Normalize MOD (i.e. make its most significant bit set) as required by + * mpn_divrem. This will make the intermediate values in the calculation + * slightly larger, but the correct result is obtained after a final + * reduction using the original MOD value. */ + mp = mp_marker = mpi_alloc_limb_space(msize); + if (!mp) + goto enomem; + count_leading_zeros( mod_shift_cnt, mod->d[msize-1] ); + if( mod_shift_cnt ) + mpihelp_lshift( mp, mod->d, msize, mod_shift_cnt ); + else + MPN_COPY( mp, mod->d, msize ); + + bsize = base->nlimbs; + bsign = base->sign; + if( bsize > msize ) { /* The base is larger than the module. Reduce it. */ + /* Allocate (BSIZE + 1) with space for remainder and quotient. + * (The quotient is (bsize - msize + 1) limbs.) */ + bp = bp_marker = mpi_alloc_limb_space( bsize + 1); + if (!bp) + goto enomem; + MPN_COPY( bp, base->d, bsize ); + /* We don't care about the quotient, store it above the remainder, + * at BP + MSIZE. */ + mpihelp_divrem( bp + msize, 0, bp, bsize, mp, msize ); + bsize = msize; + /* Canonicalize the base, since we are going to multiply with it + * quite a few times. */ + MPN_NORMALIZE( bp, bsize ); + } + else + bp = base->d; + + if( !bsize ) { + res->nlimbs = 0; + res->sign = 0; + goto leave; + } + + if( res->alloced < size ) { + /* We have to allocate more space for RES. If any of the input + * parameters are identical to RES, defer deallocation of the old + * space. */ + if( rp == ep || rp == mp || rp == bp ) { + rp = mpi_alloc_limb_space(size); + if (!rp) + goto enomem; + assign_rp = 1; + } + else { + if (mpi_resize( res, size ) < 0) + goto enomem; + rp = res->d; + } + } + else { /* Make BASE, EXP and MOD not overlap with RES. */ + if( rp == bp ) { + /* RES and BASE are identical. Allocate temp. space for BASE. */ + BUG_ON(bp_marker); + bp = bp_marker = mpi_alloc_limb_space(bsize); + if (!bp) + goto enomem; + MPN_COPY(bp, rp, bsize); + } + if( rp == ep ) { + /* RES and EXP are identical. Allocate temp. space for EXP. */ + ep = ep_marker = mpi_alloc_limb_space(esize); + if (!ep) + goto enomem; + MPN_COPY(ep, rp, esize); + } + if( rp == mp ) { + /* RES and MOD are identical. Allocate temporary space for MOD.*/ + BUG_ON(mp_marker); + mp = mp_marker = mpi_alloc_limb_space(msize); + if (!mp) + goto enomem; + MPN_COPY(mp, rp, msize); + } + } + + MPN_COPY( rp, bp, bsize ); + rsize = bsize; + rsign = bsign; + + { + mpi_size_t i; + mpi_ptr_t xp; + int c; + mpi_limb_t e; + mpi_limb_t carry_limb; + struct karatsuba_ctx karactx; + + xp = xp_marker = mpi_alloc_limb_space(2 * (msize + 1)); + if (xp) + goto enomem; + + memset( &karactx, 0, sizeof karactx ); + negative_result = (ep[0] & 1) && base->sign; + + i = esize - 1; + e = ep[i]; + count_leading_zeros (c, e); + e = (e << c) << 1; /* shift the exp bits to the left, lose msb */ + c = BITS_PER_MPI_LIMB - 1 - c; + + /* Main loop. + * + * Make the result be pointed to alternately by XP and RP. This + * helps us avoid block copying, which would otherwise be necessary + * with the overlap restrictions of mpihelp_divmod. With 50% probability + * the result after this loop will be in the area originally pointed + * by RP (==RES->d), and with 50% probability in the area originally + * pointed to by XP. + */ + + for(;;) { + while( c ) { + mpi_ptr_t tp; + mpi_size_t xsize; + + /*if (mpihelp_mul_n(xp, rp, rp, rsize) < 0) goto enomem */ + if( rsize < KARATSUBA_THRESHOLD ) + mpih_sqr_n_basecase( xp, rp, rsize ); + else { + if( !tspace ) { + tsize = 2 * rsize; + tspace = mpi_alloc_limb_space(tsize); + if (!tspace) + goto enomem; + } + else if( tsize < (2*rsize) ) { + mpi_free_limb_space( tspace ); + tsize = 2 * rsize; + tspace = mpi_alloc_limb_space(tsize); + if (!tspace) + goto enomem; + } + mpih_sqr_n( xp, rp, rsize, tspace ); + } + + xsize = 2 * rsize; + if( xsize > msize ) { + mpihelp_divrem(xp + msize, 0, xp, xsize, mp, msize); + xsize = msize; + } + + tp = rp; rp = xp; xp = tp; + rsize = xsize; + + if( (mpi_limb_signed_t)e < 0 ) { + /*mpihelp_mul( xp, rp, rsize, bp, bsize );*/ + if( bsize < KARATSUBA_THRESHOLD ) { + mpi_limb_t tmp; + if (mpihelp_mul( xp, rp, rsize, bp, bsize, &tmp ) < 0) + goto enomem; + } + else { + if (mpihelp_mul_karatsuba_case( + xp, rp, rsize, bp, bsize, &karactx ) < 0) + goto enomem; + } + + xsize = rsize + bsize; + if( xsize > msize ) { + mpihelp_divrem(xp + msize, 0, xp, xsize, mp, msize); + xsize = msize; + } + + tp = rp; rp = xp; xp = tp; + rsize = xsize; + } + e <<= 1; + c--; + } + + i--; + if( i < 0 ) + break; + e = ep[i]; + c = BITS_PER_MPI_LIMB; + } + + /* We shifted MOD, the modulo reduction argument, left MOD_SHIFT_CNT + * steps. Adjust the result by reducing it with the original MOD. + * + * Also make sure the result is put in RES->d (where it already + * might be, see above). + */ + if( mod_shift_cnt ) { + carry_limb = mpihelp_lshift( res->d, rp, rsize, mod_shift_cnt); + rp = res->d; + if( carry_limb ) { + rp[rsize] = carry_limb; + rsize++; + } + } + else { + MPN_COPY( res->d, rp, rsize); + rp = res->d; + } + + if( rsize >= msize ) { + mpihelp_divrem(rp + msize, 0, rp, rsize, mp, msize); + rsize = msize; + } + + /* Remove any leading zero words from the result. */ + if( mod_shift_cnt ) + mpihelp_rshift( rp, rp, rsize, mod_shift_cnt); + MPN_NORMALIZE (rp, rsize); + + mpihelp_release_karatsuba_ctx( &karactx ); + } + + if( negative_result && rsize ) { + if( mod_shift_cnt ) + mpihelp_rshift( mp, mp, msize, mod_shift_cnt); + mpihelp_sub( rp, mp, msize, rp, rsize); + rsize = msize; + rsign = msign; + MPN_NORMALIZE(rp, rsize); + } + res->nlimbs = rsize; + res->sign = rsign; + + leave: + rc = 0; + enomem: + if( assign_rp ) mpi_assign_limb_space( res, rp, size ); + if( mp_marker ) mpi_free_limb_space( mp_marker ); + if( bp_marker ) mpi_free_limb_space( bp_marker ); + if( ep_marker ) mpi_free_limb_space( ep_marker ); + if( xp_marker ) mpi_free_limb_space( xp_marker ); + if( tspace ) mpi_free_limb_space( tspace ); + return rc; +} + diff -uprN linux-2.6.32/crypto/mpi/mpi-scan.c linux-2.6.32.ovz/crypto/mpi/mpi-scan.c --- linux-2.6.32/crypto/mpi/mpi-scan.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/mpi-scan.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,129 @@ +/* mpi-scan.c - MPI functions + * Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include "mpi-internal.h" +#include "longlong.h" + +/**************** + * Scan through an mpi and return byte for byte. a -1 is returned to indicate + * the end of the mpi. Scanning is done from the lsb to the msb, returned + * values are in the range of 0 .. 255. + * + * FIXME: This code is VERY ugly! + */ +int +mpi_getbyte( const MPI a, unsigned idx ) +{ + int i, j; + unsigned n; + mpi_ptr_t ap; + mpi_limb_t limb; + + ap = a->d; + for(n=0,i=0; i < a->nlimbs; i++ ) { + limb = ap[i]; + for( j=0; j < BYTES_PER_MPI_LIMB; j++, n++ ) + if( n == idx ) + return (limb >> j*8) & 0xff; + } + return -1; +} + + +/**************** + * Put a value at position IDX into A. idx counts from lsb to msb + */ +void +mpi_putbyte( MPI a, unsigned idx, int xc ) +{ + int i, j; + unsigned n; + mpi_ptr_t ap; + mpi_limb_t limb, c; + + c = xc & 0xff; + ap = a->d; + for(n=0,i=0; i < a->alloced; i++ ) { + limb = ap[i]; + for( j=0; j < BYTES_PER_MPI_LIMB; j++, n++ ) + if( n == idx ) { + #if BYTES_PER_MPI_LIMB == 4 + if( j == 0 ) + limb = (limb & 0xffffff00) | c; + else if( j == 1 ) + limb = (limb & 0xffff00ff) | (c<<8); + else if( j == 2 ) + limb = (limb & 0xff00ffff) | (c<<16); + else + limb = (limb & 0x00ffffff) | (c<<24); + #elif BYTES_PER_MPI_LIMB == 8 + if( j == 0 ) + limb = (limb & 0xffffffffffffff00) | c; + else if( j == 1 ) + limb = (limb & 0xffffffffffff00ff) | (c<<8); + else if( j == 2 ) + limb = (limb & 0xffffffffff00ffff) | (c<<16); + else if( j == 3 ) + limb = (limb & 0xffffffff00ffffff) | (c<<24); + else if( j == 4 ) + limb = (limb & 0xffffff00ffffffff) | (c<<32); + else if( j == 5 ) + limb = (limb & 0xffff00ffffffffff) | (c<<40); + else if( j == 6 ) + limb = (limb & 0xff00ffffffffffff) | (c<<48); + else + limb = (limb & 0x00ffffffffffffff) | (c<<56); + #else + #error please enhance this function, its ugly - i know. + #endif + if( a->nlimbs <= i ) + a->nlimbs = i+1; + ap[i] = limb; + return; + } + } + log_bug("index out of range\n"); +} + + +/**************** + * Count the number of zerobits at the low end of A + */ +unsigned +mpi_trailing_zeros( const MPI a ) +{ + unsigned n, count = 0; + + for(n=0; n < a->nlimbs; n++ ) { + if( a->d[n] ) { + unsigned nn; + mpi_limb_t alimb = a->d[n]; + + count_trailing_zeros( nn, alimb ); + count += nn; + break; + } + count += BITS_PER_MPI_LIMB; + } + return count; + +} + + diff -uprN linux-2.6.32/crypto/mpi/mpiutil.c linux-2.6.32.ovz/crypto/mpi/mpiutil.c --- linux-2.6.32/crypto/mpi/mpiutil.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/mpi/mpiutil.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,213 @@ +/* mpiutil.ac - Utility functions for MPI + * Copyright (C) 1998, 1999 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include "mpi-internal.h" + + +/**************** + * Note: It was a bad idea to use the number of limbs to allocate + * because on a alpha the limbs are large but we normally need + * integers of n bits - So we should chnage this to bits (or bytes). + * + * But mpi_alloc is used in a lot of places :-) + */ +MPI mpi_alloc(unsigned nlimbs) +{ + MPI a; + + a = (MPI) kmalloc(sizeof *a,GFP_KERNEL); + if (!a) + return a; + + if (nlimbs) { + a->d = mpi_alloc_limb_space(nlimbs); + if (!a->d) { + kfree(a); + return NULL; + } + } + else { + a->d = NULL; + } + + a->alloced = nlimbs; + a->nlimbs = 0; + a->sign = 0; + a->flags = 0; + a->nbits = 0; + return a; +} + +mpi_ptr_t mpi_alloc_limb_space( unsigned nlimbs ) +{ + size_t len = nlimbs * sizeof(mpi_limb_t); + + return kmalloc(len,GFP_KERNEL); +} + +void mpi_free_limb_space( mpi_ptr_t a ) +{ + if( !a ) + return; + + kfree(a); +} + + +void mpi_assign_limb_space(MPI a, mpi_ptr_t ap, unsigned nlimbs) +{ + mpi_free_limb_space(a->d); + a->d = ap; + a->alloced = nlimbs; +} + +/**************** + * Resize the array of A to NLIMBS. the additional space is cleared + * (set to 0) [done by m_realloc()] + */ +int mpi_resize(MPI a, unsigned nlimbs) +{ + void *p; + + if (nlimbs <= a->alloced) + return 0; /* no need to do it */ + + if (a->d) { + p = kmalloc(nlimbs * sizeof(mpi_limb_t), GFP_KERNEL); + if (!p) + return -ENOMEM; + memcpy(p, a->d, a->alloced * sizeof(mpi_limb_t)); + kfree(a->d); + a->d = p; + } else { + a->d = kzalloc( nlimbs * sizeof(mpi_limb_t), GFP_KERNEL); + if (!a->d) + return -ENOMEM; + } + a->alloced = nlimbs; + return 0; +} + +void mpi_clear(MPI a) +{ + a->nlimbs = 0; + a->nbits = 0; + a->flags = 0; +} + +void mpi_free(MPI a) +{ + if (!a) + return; + + if (a->flags & 4) + kfree(a->d); + else { + mpi_free_limb_space(a->d); + } + + if (a->flags & ~7 ) + printk("invalid flag value in mpi\n"); + kfree(a); +} + + +/**************** + * Note: This copy function should not interpret the MPI + * but copy it transparently. + */ +int mpi_copy(MPI *copied, const MPI a ) +{ + size_t i; + MPI b; + + *copied = MPI_NULL; + + if ( a ) { + b = mpi_alloc( a->nlimbs ); + if (!b) + return -ENOMEM; + + b->nlimbs = a->nlimbs; + b->sign = a->sign; + b->flags = a->flags; + b->nbits = a->nbits; + + for (i = 0; i < b->nlimbs; i++ ) + b->d[i] = a->d[i]; + + *copied = b; + } + + return 0; +} + + +int mpi_set(MPI w, const MPI u) +{ + mpi_ptr_t wp, up; + mpi_size_t usize = u->nlimbs; + int usign = u->sign; + + if (RESIZE_IF_NEEDED(w, (size_t) usize) < 0) + return -ENOMEM; + + wp = w->d; + up = u->d; + MPN_COPY(wp, up, usize); + w->nlimbs = usize; + w->nbits = u->nbits; + w->flags = u->flags; + w->sign = usign; + return 0; +} + + +int mpi_set_ui(MPI w, unsigned long u) +{ + if (RESIZE_IF_NEEDED(w, 1) < 0) + return -ENOMEM; + w->d[0] = u; + w->nlimbs = u? 1:0; + w->sign = 0; + w->nbits = 0; + w->flags = 0; + return 0; +} + +MPI mpi_alloc_set_ui(unsigned long u) +{ + MPI w = mpi_alloc(1); + if (!w) + return w; + w->d[0] = u; + w->nlimbs = u? 1:0; + w->sign = 0; + return w; +} + + +void mpi_swap(MPI a, MPI b) +{ + struct gcry_mpi tmp; + + tmp = *a; *a = *b; *b = tmp; +} + diff -uprN linux-2.6.32/crypto/sha1_generic.c linux-2.6.32.ovz/crypto/sha1_generic.c --- linux-2.6.32/crypto/sha1_generic.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/sha1_generic.c 2015-06-23 04:16:16.000000000 -0700 @@ -36,7 +36,7 @@ static int sha1_init(struct shash_desc * return 0; } -static int sha1_update(struct shash_desc *desc, const u8 *data, +int crypto_sha1_update(struct shash_desc *desc, const u8 *data, unsigned int len) { struct sha1_state *sctx = shash_desc_ctx(desc); @@ -70,6 +70,7 @@ static int sha1_update(struct shash_desc return 0; } +EXPORT_SYMBOL(crypto_sha1_update); /* Add padding and return the message digest. */ @@ -86,10 +87,10 @@ static int sha1_final(struct shash_desc /* Pad out to 56 mod 64 */ index = sctx->count & 0x3f; padlen = (index < 56) ? (56 - index) : ((64+56) - index); - sha1_update(desc, padding, padlen); + crypto_sha1_update(desc, padding, padlen); /* Append length */ - sha1_update(desc, (const u8 *)&bits, sizeof(bits)); + crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits)); /* Store state in digest */ for (i = 0; i < 5; i++) @@ -120,7 +121,7 @@ static int sha1_import(struct shash_desc static struct shash_alg alg = { .digestsize = SHA1_DIGEST_SIZE, .init = sha1_init, - .update = sha1_update, + .update = crypto_sha1_update, .final = sha1_final, .export = sha1_export, .import = sha1_import, diff -uprN linux-2.6.32/crypto/sha512_generic.c linux-2.6.32.ovz/crypto/sha512_generic.c --- linux-2.6.32/crypto/sha512_generic.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/sha512_generic.c 2015-03-10 13:23:16.000000000 -0700 @@ -177,7 +177,7 @@ sha512_update(struct shash_desc *desc, c index = sctx->count[0] & 0x7f; /* Update number of bytes */ - if (!(sctx->count[0] += len)) + if ((sctx->count[0] += len) < len) sctx->count[1]++; part_len = 128 - index; diff -uprN linux-2.6.32/crypto/shash.c linux-2.6.32.ovz/crypto/shash.c --- linux-2.6.32/crypto/shash.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/shash.c 2015-03-10 13:24:13.000000000 -0700 @@ -310,7 +310,13 @@ static int shash_async_export(struct aha static int shash_async_import(struct ahash_request *req, const void *in) { - return crypto_shash_import(ahash_request_ctx(req), in); + struct crypto_shash **ctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); + struct shash_desc *desc = ahash_request_ctx(req); + + desc->tfm = *ctx; + desc->flags = req->base.flags; + + return crypto_shash_import(desc, in); } static void crypto_exit_shash_ops_async(struct crypto_tfm *tfm) diff -uprN linux-2.6.32/crypto/signature/dsa.c linux-2.6.32.ovz/crypto/signature/dsa.c --- linux-2.6.32/crypto/signature/dsa.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/signature/dsa.c 2015-06-23 04:16:16.000000000 -0700 @@ -0,0 +1,97 @@ +/* dsa.c - DSA signature algorithm + * Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include +#include +#include +#include "local.h" + +/* + * perform DSA algorithm signature verification + */ +int DSA_verify(const MPI datahash, const MPI sig[], const MPI pkey[]) +{ + MPI p, q, g, y, r, s; + MPI w = NULL, u1 = NULL, u2 = NULL, v = NULL; + MPI base[3]; + MPI exp[3]; + int rc; + + if (!datahash || + !sig[0] || !sig[1] || + !pkey[0] || !pkey[1] || !pkey[2] || !pkey[3]) + return -EINVAL; + + p = pkey[0]; /* prime */ + q = pkey[1]; /* group order */ + g = pkey[2]; /* group generator */ + y = pkey[3]; /* g^x mod p */ + r = sig[0]; + s = sig[1]; + + if (!(mpi_cmp_ui(r, 0) > 0 && mpi_cmp(r, q) < 0)) { + printk("DSA_verify assertion failed [0 < r < q]\n"); + return -EKEYREJECTED; + } + + if (!(mpi_cmp_ui(s, 0) > 0 && mpi_cmp(s, q) < 0)) { + printk("DSA_verify assertion failed [0 < s < q]\n"); + return -EKEYREJECTED; + } + + rc = -ENOMEM; + w = mpi_alloc(mpi_get_nlimbs(q)); if (!w ) goto cleanup; + u1 = mpi_alloc(mpi_get_nlimbs(q)); if (!u1) goto cleanup; + u2 = mpi_alloc(mpi_get_nlimbs(q)); if (!u2) goto cleanup; + v = mpi_alloc(mpi_get_nlimbs(p)); if (!v ) goto cleanup; + + /* w = s^(-1) mod q */ + if (mpi_invm(w, s, q) < 0) + goto cleanup; + + /* u1 = (datahash * w) mod q */ + if (mpi_mulm(u1, datahash, w, q) < 0) + goto cleanup; + + /* u2 = r * w mod q */ + if (mpi_mulm(u2, r, w, q) < 0) + goto cleanup; + + /* v = g^u1 * y^u2 mod p mod q */ + base[0] = g; exp[0] = u1; + base[1] = y; exp[1] = u2; + base[2] = NULL; exp[2] = NULL; + + if (mpi_mulpowm(v, base, exp, p) < 0) + goto cleanup; + + if (mpi_fdiv_r(v, v, q) < 0) + goto cleanup; + + rc = (mpi_cmp(v, r) == 0) ? 0 : -EKEYREJECTED; + +cleanup: + mpi_free(w); + mpi_free(u1); + mpi_free(u2); + mpi_free(v); + return rc; +} +EXPORT_SYMBOL(DSA_verify); diff -uprN linux-2.6.32/crypto/signature/key.h linux-2.6.32.ovz/crypto/signature/key.h --- linux-2.6.32/crypto/signature/key.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/signature/key.h 2015-06-23 04:18:01.000000000 -0700 @@ -0,0 +1,200 @@ +const char ksign_def_public_key[] __initdata= + "\x99\x03\x2e\x04\x55\x89\x40\x05\x11\x08\x00\xe7\x71\x1d\x2a\x27" + "\xd0\xe7\xb8\x5d\xe4\x31\x5d\x45\x9e\x9e\x63\x89\xdc\x05\xd9\x47" + "\x9e\xe8\x04\x2d\xc6\x6a\xfd\xe1\xe8\x2c\x41\xd0\xed\x0c\x65\x68" + "\x61\xbd\x3e\xac\x9a\x46\xae\xe0\xdf\x76\xe2\x4d\x53\x50\xcf\x84" + "\x93\xb0\x05\x36\x18\x30\xb6\xfd\xa1\x88\x3c\xd2\x38\xd6\x2c\xe9" + "\x7f\x77\x67\x4e\xc0\x82\xb7\xac\x25\x3a\xa2\xbf\xdf\x68\x13\x01" + "\x64\x98\x20\xa7\x45\x17\x70\xe5\xe8\x0d\xf5\x73\x06\x10\xd8\xb8" + "\x6e\xb6\x4c\xb3\x07\x36\x48\x96\x19\x1d\xde\x8d\xf5\xf7\xf3\x30" + "\x0e\xb0\x55\x45\x10\xfb\xc6\x5f\x97\x07\x2e\x9f\x9e\x09\xff\xe9" + "\x9f\x17\x57\x88\x46\xa6\x88\xa3\x07\x7b\x4e\xa5\xc2\x5f\x83\x94" + "\x9d\x1e\x38\xcb\x82\x7d\x7d\x46\x0d\xcc\xce\x22\x67\xaa\x3a\x84" + "\xd7\x0b\x5b\x65\xf7\xc2\x88\xac\x75\x01\x7f\xa6\x1e\x75\x2b\x53" + "\x8e\xb2\xfc\xe4\xed\x6d\x2a\x2b\xc1\x7e\xaf\x97\x4a\x5d\xd2\x2a" + "\x69\x8c\x73\x99\xb8\xd5\xe4\xfa\x7d\x45\x5f\x46\x84\x14\xa7\x37" + "\x71\xd9\xc4\x16\x34\x8a\xee\x66\xd9\x67\xd0\xd6\x5e\x5a\x99\x54" + "\xfd\xda\x1c\x17\xa1\x55\x92\xe3\x4c\x25\xd2\x7d\xc3\x82\xd1\x8d" + "\x71\x8c\x76\xd4\x64\xba\x99\xc1\xf1\x1f\xa7\x01\x00\x97\xff\x0f" + "\x2d\x2e\xa1\xdc\x38\x6c\x0e\xaa\x0d\xd8\xb1\xfa\x86\xa5\x19\x5a" + "\x37\x18\x79\xab\x82\x47\x27\x7f\xfa\x22\xa8\x49\xed\x07\xfc\x0e" + "\xdd\x9d\x3d\x43\xa5\x03\xa7\xa4\x33\x06\xa3\x7f\xf5\xb4\x6f\x0b" + "\xec\x12\xf6\xb4\xc1\xb2\xdd\x9f\x03\x93\x75\xb2\x5d\x58\x5e\x3d" + "\x9e\x73\xaa\xde\xea\x25\x58\x8f\x54\xe6\xf6\x0e\xdf\x30\xfe\x28" + "\xf4\x2c\x7b\x80\x2a\x45\x03\x41\x4f\x0a\x6b\xd9\x4c\x39\xcb\x6d" + "\x39\x5f\xcd\x0a\xba\x00\x78\x44\x79\xba\xae\x83\x54\x52\xb7\x5c" + "\x43\x67\x95\xfa\xe1\xd5\x5d\x07\x9b\xa5\xde\xbe\xa7\xf0\x27\xc2" + "\xaf\xb5\x60\xc7\x13\xab\x08\x0f\xa0\xdb\xed\x65\x36\xc6\x29\x96" + "\x4d\x21\x31\xa4\x74\xf3\x9d\x96\x21\x45\x23\x9a\x19\x4e\x1b\xe0" + "\x62\x56\x7a\xd9\xc5\xbf\xb1\xcf\x3c\x9b\x2f\x09\xc2\x81\x18\x4e" + "\xbf\x6b\xdd\xbf\xb6\x14\x41\xc2\xba\xb3\xa2\x6c\x5a\x01\x2a\x13" + "\xb3\xea\x2f\x6b\x23\xab\x4d\x51\xdc\x26\x7b\x7d\x61\x8b\x11\x72" + "\x40\x4e\x99\x90\x1e\xdc\x38\xc5\x98\x4d\xef\x0e\xb5\xcd\x59\x2e" + "\x2c\x98\xda\xf5\x53\x9f\x84\x4a\x5b\x14\xcd\xd1\x7f\xcf\x96\xfa" + "\x4a\x47\x21\xb9\x18\x65\xdd\x0e\xbb\xda\x24\xe5\x2e\x35\x90\xbc" + "\x82\x5c\x46\x17\x79\xcd\xf0\xcd\x8c\x39\x55\x7a\x28\x50\x01\x8d" + "\x63\x9c\x38\x77\x57\x3d\x0b\xf2\x5b\x41\x4a\x30\x9a\x47\x3f\x08" + "\x00\xa7\xd1\xd3\xa8\x5d\xe0\x47\x55\xb3\x0b\xfc\xda\xdb\xcc\x78" + "\x6a\xb2\x18\x1d\x9e\x11\x7e\x20\xaa\x0c\xb9\x14\xd4\xe7\x7a\xb5" + "\x05\xeb\x91\x61\x57\x88\x36\xbd\xe6\x10\x65\x1d\x87\x10\x94\x28" + "\xb4\xae\x89\x2b\x27\x05\xcd\x26\x1d\x9a\xb5\xf7\xde\xe2\x7f\xdf" + "\xec\x61\x45\x65\x24\xce\x50\x11\xc6\x16\xab\xa6\xe9\x37\x7f\xbf" + "\x1c\x91\xa1\x21\x92\xcf\x8a\x8b\x8b\xc0\x9b\xa0\x97\x93\x12\x81" + "\xaf\xe2\xf0\xc2\x12\x3d\x14\x58\xca\x73\xb2\x29\x34\x00\xd0\x95" + "\xdd\x7c\x6a\x12\x31\x6a\x17\xd2\xd2\xf3\x3a\x9a\xdc\x34\xcf\x43" + "\xb2\x52\x0c\x02\xd0\xa9\xa4\x7c\x8e\xec\x75\xf0\x39\xa0\x92\xc0" + "\x7f\xa6\xa9\xf6\xad\xaf\xad\x93\xc2\x43\x32\x3f\x96\xd7\x15\xf4" + "\xa7\x08\x01\xe5\x83\x7f\x6e\x8f\x2e\xaa\x6e\xbf\xda\xdb\x53\x3a" + "\x7f\x47\xfa\x32\x58\x3c\x00\x8f\x1d\x21\x2a\x32\xfe\x3c\xea\xf1" + "\x22\x55\x07\x10\x49\xac\x5e\xbd\xa7\x7c\x2e\x16\xae\x38\xb9\xea" + "\x9e\xa2\x0d\xfa\x13\x81\xd2\x7e\xd8\xb4\x2f\xba\xe4\xe6\xb8\x87" + "\xb8\x4e\xb4\xee\x27\x40\x13\xe1\x6f\xc8\x45\x0a\x68\x7e\x18\xde" + "\x45\xc0\x76\xd5\x31\xe5\x25\x79\xae\xff\x5c\xb3\x8d\xcd\x5f\x27" + "\x42\xb4\x25\x52\x65\x64\x20\x48\x61\x74\x2c\x20\x49\x6e\x63\x2e" + "\x20\x28\x4b\x65\x72\x6e\x65\x6c\x20\x4d\x6f\x64\x75\x6c\x65\x20" + "\x47\x50\x47\x20\x6b\x65\x79\x29\x88\x6a\x04\x13\x11\x08\x00\x12" + "\x05\x02\x55\x89\x40\x05\x02\x1b\x23\x02\x15\x08\x02\x1e\x01\x02" + "\x17\x80\x00\x0a\x09\x10\xea\x69\xfb\x40\xe9\x03\xc4\x2b\x72\x90" + "\x00\xfe\x20\x89\x93\x0d\x77\xef\x15\xe4\xa2\xef\x08\x51\xd5\x19" + "\x50\x2f\x9e\x08\x6f\x93\xf7\x6e\x15\x5f\x39\x2d\x52\xac\xe6\xf2" + "\xcd\x30\x00\xfd\x18\x17\x8e\x89\x61\xc3\xc9\xcb\x00\x08\x76\x92" + "\x93\x6e\x42\x78\x82\x24\x32\x38\x05\x73\x34\x2c\xcd\x81\xf0\x04" + "\xe2\x57\xba\x83\x99\x01\xa2\x04\x4b\x9f\x60\x5c\x11\x04\x00\xe8" + "\x9d\xf7\xee\xa1\x42\x33\x6c\x33\xfa\x4e\x08\xca\x23\xd0\xd1\x60" + "\x16\xb1\x6a\x7f\x61\xad\xe1\xc7\x63\x81\xe3\xc8\xfc\xac\xe9\xa3" + "\x38\x7e\x16\x3a\x33\x7f\xfd\x04\x10\x76\xd8\x83\x77\x8d\x37\x53" + "\x95\x61\x1d\xaf\x3c\xc9\xc3\x30\xff\x69\x83\xb0\x70\xa9\x44\xfb" + "\xee\xab\x8a\x04\xdb\x2a\xfd\xee\xe2\x89\x2c\xab\x3a\xf9\x7a\x7b" + "\x04\xd8\x4a\x14\x1f\x78\x4c\xc2\xf9\xf3\xc8\xe2\x7a\x0f\xde\xef" + "\x96\xe0\x61\xb1\xbc\x4c\xec\x08\xf4\x5a\x94\x6f\x26\x31\xf1\xea" + "\x73\x99\xae\x34\xd9\xb1\x23\x20\x64\x53\x03\xf2\xeb\x68\x13\x00" + "\xa0\x9d\xe8\xbb\x78\x16\xfd\x70\xff\x87\x95\xf5\x8e\xe0\xcd\x6c" + "\x74\x82\x09\x57\x2d\x04\x00\x9e\x25\x32\x03\x39\x27\xd0\x12\x11" + "\xb9\x3a\x13\x23\x7d\xb4\xb0\x50\x8d\x23\xfd\xf4\x00\x98\x68\x93" + "\xa9\xa1\x06\x0d\x6e\x2f\x3b\xca\x42\xbd\xb7\xac\xcc\xea\x26\x7e" + "\x8e\xc2\x73\xfc\x44\xc4\xfc\xd0\xdb\x9a\xcc\xed\xe3\x97\x47\x24" + "\xbd\x59\xc1\x39\x6f\xac\x8b\xf4\xd8\xd3\xfb\x57\xfb\x65\x9e\x1a" + "\x7d\x69\x19\x2a\xcd\x77\x87\xdd\xb1\x49\x4b\x94\x76\x71\x4c\xc5" + "\x5d\x35\xe8\x5c\xc7\x4d\x99\xf8\x23\x19\x4f\x77\x3e\x63\xab\xee" + "\x02\x32\xc5\x78\x61\x0d\x14\x6d\x13\x7a\x69\x51\xf4\xd3\x1b\xd5" + "\x34\x17\x11\x15\xe8\x0d\xb0\x03\xfe\x30\xf1\x34\x14\x74\xbf\x53" + "\xef\x20\xac\x6e\xc4\x8e\x82\xc1\x3a\x1a\x55\x65\x6a\x97\x6a\x3b" + "\x0f\x39\x00\x53\x15\x29\xa0\xf0\xe2\xe1\xf8\x3a\x9a\xb7\x71\xce" + "\x8b\x13\xa9\xc1\xfb\xb7\xa1\x13\xb8\x4f\xac\xfd\x91\xd8\x57\xa0" + "\x02\x85\x76\x88\xf8\x37\xc9\x79\x7f\xe9\x69\xe5\x85\x01\x7f\xe1" + "\x83\xf4\x0b\xcc\xcf\xf8\xc5\x20\x4a\xb5\x05\x07\x54\xd0\x1f\x64" + "\x15\x35\xe9\xb3\xdc\xf4\xe7\x36\x44\xe0\xc0\x68\x9c\xa4\x02\xa3" + "\x79\xe7\xe2\xb9\x01\xa0\x98\x0d\xc2\xf5\x3b\x4a\x08\x4a\x8e\xb3" + "\x1b\xe9\xa4\x9e\x85\x33\x04\xd3\x98\xb4\x44\x52\x65\x64\x20\x48" + "\x61\x74\x20\x45\x6e\x74\x65\x72\x70\x72\x69\x73\x65\x20\x4c\x69" + "\x6e\x75\x78\x20\x44\x72\x69\x76\x65\x72\x20\x55\x70\x64\x61\x74" + "\x65\x20\x50\x72\x6f\x67\x72\x61\x6d\x20\x3c\x73\x65\x63\x61\x6c" + "\x65\x72\x74\x40\x72\x65\x64\x68\x61\x74\x2e\x63\x6f\x6d\x3e\x88" + "\x60\x04\x13\x11\x02\x00\x20\x05\x02\x4b\xe9\x39\x1c\x02\x1b\x03" + "\x06\x0b\x09\x08\x07\x03\x02\x04\x15\x02\x08\x03\x04\x16\x02\x03" + "\x01\x02\x1e\x01\x02\x17\x80\x00\x0a\x09\x10\xd4\xa2\x6c\x9c\xcd" + "\x09\xbe\xda\xc6\xa6\x00\x9e\x33\x63\x02\x64\x37\xe3\x19\x36\x54" + "\xf4\xcb\x60\xb1\xd8\xe9\x71\x35\x7a\xb8\xd1\x00\x9e\x3f\x4d\xca" + "\xbf\xf6\x1f\x15\x34\x4e\x04\x5d\x62\xcd\x5c\x8a\x11\x53\xe3\x01" + "\x1a\x99\x03\x2e\x04\x53\xe0\xb0\x0c\x11\x08\x00\xec\xc2\xb1\xad" + "\x56\x41\xfa\xa6\xe1\xde\xfc\xae\xc0\xac\xc0\xf2\x64\x87\xa9\x45" + "\x7b\x7f\x51\x23\xf6\x2a\x7c\x6e\x69\x51\x92\x24\xc9\xcb\xfb\x31" + "\x83\xfc\x0e\x98\x8e\xbe\x38\xd3\x6b\xc8\x85\x09\x44\x79\x78\x12" + "\xc0\x92\x69\xec\x53\x1f\x3a\x4e\x1f\x43\xc6\x60\x8b\x4d\x3b\x25" + "\x81\xa2\x2f\x7b\xca\xc5\xe3\xc5\xd9\x26\x20\x1e\xf5\x9e\x0f\x85" + "\x80\x84\x57\x8e\x29\x37\x4b\xcf\xf5\x6c\x4f\xe0\xf6\x92\x7d\xee" + "\xe7\xdd\x9c\xdd\xa6\x49\x10\x92\x05\xb1\x9e\x99\x44\x2d\xf8\x19" + "\xa0\xf1\x2a\xe1\x85\x8d\x1a\x45\x04\x70\x8b\x54\x79\x95\x59\xff" + "\x1b\x7b\x6e\x13\x63\x72\x86\x75\x75\x66\xf6\x3b\xdf\xbd\x17\xeb" + "\xd4\x7d\x7f\x06\x6a\xe9\x54\x68\xe2\x6d\x72\x24\xa1\xec\x9c\x6a" + "\x98\x4f\x87\xe9\xe0\xf9\x97\xc7\x91\xa7\xae\x86\x91\xac\x50\x8c" + "\xab\xbe\x3d\xd7\x7e\xf9\x2b\xae\x0c\x07\x93\x10\xa5\x36\xd3\xd7" + "\x74\xf6\x0e\x28\xe4\x0a\xcc\xd2\x21\xa3\x53\x44\xf0\x46\xa4\xfb" + "\x27\x43\x1c\xeb\x93\x49\x7c\xa2\x80\xe7\x70\x7a\xcf\xc7\x8a\x53" + "\x6f\x3e\x60\x1a\x9a\xfb\xed\x83\x0f\x3e\x11\x87\x1b\xfa\x11\x44" + "\x1d\x56\x5c\x7a\x83\xcd\xa8\x7a\x16\xcd\x42\xe7\x01\x00\xda\xf7" + "\x6a\x85\xd0\xa5\xa8\x08\xa0\xc5\x2b\x9c\x9b\xd6\xcc\x70\x01\x00" + "\x1c\xbc\x33\x86\xa6\xe6\x68\xa1\x41\x1e\x5a\xb6\xfb\xb3\x08\x00" + "\xe0\x60\x89\xa3\xd1\x17\x80\x0c\x53\x88\xcf\x79\xa2\x40\x83\x7c" + "\xe7\x3b\x9f\xe6\xf8\x28\xe8\x3c\x18\x30\x40\xea\xce\x80\x35\xf6" + "\x33\x62\xf6\x58\x5b\x4f\x6f\x24\x3c\x0e\x8c\x03\x1e\xdd\xf6\xee" + "\x6a\x70\xa2\x66\x7e\x69\x00\xbe\xc9\xe0\x7c\x34\x8f\xca\x74\xcf" + "\x05\x10\x7d\x88\xd3\x8a\x00\x94\x11\x1c\x37\x77\x71\x7b\x85\xe9" + "\xf9\x97\xbe\x59\x54\x39\xf7\xa9\x58\x19\xbc\xf0\x36\xec\x30\x4b" + "\x53\x06\x72\x7b\xc1\xb7\xa4\x64\x2d\x2d\xfd\xfe\xb8\xfc\xb6\x73" + "\xed\x13\x10\xd3\xe8\x45\x87\x49\x0d\xac\x17\x0f\x82\xef\x2e\x04" + "\x9d\x5e\x59\xf2\x3d\x3a\x36\xe0\xc1\x23\xcf\x2e\x6b\x28\x7b\x82" + "\x4e\x4c\x70\xa4\xd0\x00\xbc\xa3\x74\xa9\x71\x2a\x7c\xf4\x0f\xdf" + "\xaa\x15\x43\xd6\x73\xcf\x3e\x6c\xa0\xa4\xad\xc5\xa2\xa7\xa0\x8f" + "\xd8\x84\xf6\x1a\x9a\x61\xfe\x37\x4b\x78\x64\x1a\x80\x75\xae\x66" + "\x41\xb6\x9f\x53\x97\xe6\x91\x06\xb7\xb2\x6f\x05\xde\xcc\xc6\xd3" + "\x1d\xef\x43\x1e\x53\xe0\xcf\xce\x43\xff\x08\x7e\x92\x06\xc1\x3e" + "\xfe\x21\x4b\x89\x05\xc1\x0e\x59\xb3\xbb\x70\x9c\xaf\xb0\xd7\x26" + "\x82\xa9\x61\xad\x6d\x48\x7a\x4b\x20\x8a\xdf\xd0\x56\x3b\x64\xa6" + "\x08\x00\xaa\x18\x0e\xbd\x93\xa3\xf9\x82\x7f\x33\x17\x84\x16\x27" + "\x18\xe0\xe3\x67\x68\x28\xe6\x27\x5f\x47\x39\x20\x16\x94\xed\x41" + "\xd5\x4e\xac\x9e\x7a\xd0\xe0\x67\xef\x23\xdc\x37\xc2\xfe\x9f\x21" + "\x20\x87\x79\xf4\xe0\xe8\x03\x33\xe8\xad\x28\xbc\x74\x14\x6f\x2f" + "\x84\x53\x08\x9a\x44\x6d\x22\x40\xfc\xa3\x82\xf3\xd2\xdc\x7a\xed" + "\x40\x15\x3b\x1b\xc8\x46\x67\x53\x22\x7b\x71\xc6\xed\x72\x65\x6a" + "\x46\xa1\xbf\x18\xa2\xdf\x37\x49\x02\x12\x8f\x95\x65\xfc\x84\x8e" + "\x7d\x51\x25\xfb\xc7\xe9\x10\xfc\x1e\x81\x28\xe8\x91\xaf\x17\x3c" + "\x60\x6f\xdc\x7c\x4a\xf0\x0d\x44\x62\xbd\x79\xf2\xe5\x8c\xa0\x61" + "\x27\xde\x27\xb3\x1f\x40\x1e\x81\x0d\xff\x96\xd4\x36\x17\x5b\xe0" + "\xc3\xd8\x71\xc7\x3b\x60\x62\x94\x4b\x79\x19\x2c\xcf\x38\x1a\x2d" + "\x65\x57\xf4\x6f\xc3\x09\x71\x6d\x6a\xfe\xfc\xe0\x79\xd8\xbd\x27" + "\x5c\x30\x57\x98\x0a\x06\xdc\xb5\x1a\x36\xac\x5b\x1a\x6d\x52\xd8" + "\x1c\x6a\xc4\x1e\xd7\xf9\x7a\xf7\x51\x86\xde\x59\x16\x8c\xb3\x04" + "\xae\x95\x1e\x31\x5d\x21\x24\xa6\x70\x6d\x5e\xdf\x97\xd7\x4e\xd0" + "\x51\xa5\xe5\xb2\xe0\x75\xdd\x91\x08\xf6\x8b\x00\x45\x36\xde\xc1" + "\xc6\xae\xb4\x4c\x52\x65\x64\x20\x48\x61\x74\x20\x45\x6e\x74\x65" + "\x72\x70\x72\x69\x73\x65\x20\x4c\x69\x6e\x75\x78\x20\x44\x72\x69" + "\x76\x65\x72\x20\x55\x70\x64\x61\x74\x65\x20\x50\x72\x6f\x67\x72" + "\x61\x6d\x20\x28\x6b\x65\x79\x20\x34\x29\x20\x3c\x73\x65\x63\x61" + "\x6c\x65\x72\x74\x40\x72\x65\x64\x68\x61\x74\x2e\x63\x6f\x6d\x3e" + "\x88\x79\x04\x13\x11\x08\x00\x21\x05\x02\x53\xe0\xb0\x0c\x02\x1b" + "\x03\x06\x0b\x09\x08\x07\x03\x02\x06\x15\x08\x02\x09\x0a\x0b\x03" + "\x16\x02\x01\x02\x1e\x01\x02\x17\x80\x00\x0a\x09\x10\xcb\x44\x5e" + "\x0d\xa3\xea\x1f\x65\x7d\x0e\x00\xfe\x2e\x0f\xda\x7e\x62\x15\xd8" + "\x87\xac\x4a\xb7\x40\xc6\x36\x25\xf0\x67\xa5\xe7\x8c\x81\xab\x21" + "\xa1\x52\x76\x8d\x51\xa6\xdf\x9b\xc2\x01\x00\x81\xef\x30\xf5\x5b" + "\x92\xf3\x6a\x97\x2c\x50\x6b\x16\xb5\x3b\x44\x56\x61\x2d\x94\x2c" + "\x51\x9f\xf9\xe1\x64\xf6\xa0\x92\x7e\x19\xe3\x99\x01\xa2\x04\x55" + "\x81\x9d\xd0\x11\x04\x00\x90\x62\xa0\x7e\x2c\x2b\x87\x07\xb8\x8b" + "\x1f\xcd\xfc\x67\x27\x52\xa5\xdc\x1e\xe5\xdf\xcd\xaf\xd5\x1c\xde" + "\x45\x2d\x25\xdd\xcf\x63\x44\x17\x98\x4e\xad\xb3\x5e\x18\x18\xc5" + "\xe3\x21\xfd\x4b\xd7\xdf\xd2\x96\x33\xca\x14\x84\x08\x76\xb5\x55" + "\x20\x54\x72\x08\xf3\x3c\x94\x16\xcf\xc9\xad\x34\xc1\x4c\x31\x7a" + "\xe2\xe3\x67\x8d\x8b\x49\xef\x87\xeb\x19\x15\xf4\x05\x83\x7c\x31" + "\xec\x6c\xff\x38\x94\xef\x7f\xaa\x85\x90\x1f\xf5\x3e\x1c\x85\x93" + "\xdf\xc6\xec\xcb\xd0\x46\xe9\xfa\xa8\x4f\x81\x90\x50\xe5\x34\x34" + "\x76\x76\xf5\x1e\xda\x2f\x00\xa0\xec\xa9\x04\xfe\x64\x2c\xe1\x69" + "\xce\x08\x8e\xd0\x80\x9d\x2b\xbe\x30\x65\x8c\xa3\x03\xfe\x3a\xab" + "\x60\xd1\xa3\xc1\x83\xbf\xe5\x75\xce\x00\x3d\x68\xd6\xbd\xcd\x82" + "\x8c\x05\x17\xcd\x5b\xcc\x53\x8c\x11\x17\xa2\x14\x1a\x79\x83\x1e" + "\x1f\x66\xae\xd8\xa2\x82\xa0\xa8\xb1\x8e\x99\xe7\xb3\xe9\xe7\x49" + "\x07\x28\x25\x3c\x7a\x57\xfb\xba\x01\x3c\xb7\x68\x22\x28\x77\x8c" + "\xc6\xdb\x0c\x9f\x6c\xe6\x0c\x0f\x80\x3f\x1b\x61\x1e\x39\x89\x19" + "\x5c\x3e\x9f\x74\x7b\x83\x1e\xcd\xb6\xd8\xd3\x0f\x67\x6a\xff\xac" + "\x0e\x3b\x1f\x3b\xb6\x1d\xbc\x3e\x10\x7b\x1e\x92\x1a\xac\x69\x62" + "\x15\xeb\x73\x83\x57\xa0\xa3\x8e\xd5\x44\x02\x9f\x02\xee\x03\xff" + "\x71\x62\xf1\x1b\x58\xdf\xe8\x5e\x0e\x9a\x20\xed\x6f\xe4\x9c\xc0" + "\xea\xcd\x04\xb5\x15\x5d\xba\xcc\x15\x48\xa1\x8a\x24\xab\xd5\xf6" + "\x95\xe7\x2a\x61\x9e\x18\x53\x95\x52\x96\x60\x4e\x10\x30\x99\x65" + "\x99\x1c\xc4\x1a\x31\xe6\x80\x26\x8b\x8d\x57\xe8\xa7\x1c\xdd\x41" + "\x44\x53\xd9\xb2\x14\xb8\xea\x6f\x41\x67\x3c\x4b\x52\x6d\x77\x27" + "\x0d\x97\x10\x4a\x84\x33\x2f\x37\xe6\x06\x72\xd4\xb6\x51\xe6\x45" + "\x46\x8c\x62\x58\xa1\x8e\x38\xda\xee\xca\x68\xff\xa7\xbb\x14\x49" + "\x3d\xa9\xbf\xe8\x30\xec\x5e\xd4\x36\x09\xa4\x4e\xdc\x54\xf5\x86" + "\xb4\x27\x50\x61\x72\x61\x6c\x6c\x65\x6c\x73\x2c\x20\x49\x6e\x63" + "\x2e\x20\x28\x4b\x65\x72\x6e\x65\x6c\x20\x4d\x6f\x64\x75\x6c\x65" + "\x20\x47\x50\x47\x20\x6b\x65\x79\x29\x88\x62\x04\x13\x11\x02\x00" + "\x22\x05\x02\x55\x81\x9d\xd0\x02\x1b\x23\x06\x0b\x09\x08\x07\x03" + "\x02\x06\x15\x08\x02\x09\x0a\x0b\x04\x16\x02\x03\x01\x02\x1e\x01" + "\x02\x17\x80\x00\x0a\x09\x10\x50\x18\x09\xdc\xcb\x3f\xc8\x68\x42" + "\xc6\x00\xa0\x82\x91\xaf\x50\xd5\x1c\x40\x68\xd1\x6b\xe0\x93\xbf" + "\x84\x14\x36\x24\x4f\x4c\x06\x00\x9d\x17\x67\x9d\xd6\x93\x4e\x34" + "\xf3\x33\x31\x74\x44\x06\x03\x44\x49\x47\x7a\xc9\x56" + ; + +const int ksign_def_public_key_size = 3133; diff -uprN linux-2.6.32/crypto/signature/ksign.c linux-2.6.32.ovz/crypto/signature/ksign.c --- linux-2.6.32/crypto/signature/ksign.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/signature/ksign.c 2015-03-10 13:24:07.000000000 -0700 @@ -0,0 +1,270 @@ +/* ksign.c: signature checker + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include "local.h" + +int ksign_debug; +core_param(ksign_debug, ksign_debug, bool, 0644); + +#define _debug(FMT, ...) \ + do { \ + if (unlikely(ksign_debug)) \ + printk(KERN_DEBUG FMT, ##__VA_ARGS__); \ + } while(0) + +const char ksign_hash_algo_names[DIGEST_ALGO__LAST][7] = { + [DIGEST_ALGO_SHA1] = "sha1", + [DIGEST_ALGO_SHA256] = "sha256", + [DIGEST_ALGO_SHA384] = "sha384", + [DIGEST_ALGO_SHA512] = "sha512", + [DIGEST_ALGO_SHA224] = "sha224", +}; + +const u16 ksign_hash_algo_sizes[DIGEST_ALGO__LAST] = { + [DIGEST_ALGO_SHA1] = 160 / 8, + [DIGEST_ALGO_SHA256] = 256 / 8, + [DIGEST_ALGO_SHA384] = 384 / 8, + [DIGEST_ALGO_SHA512] = 512 / 8, + [DIGEST_ALGO_SHA224] = 224 / 8, +}; + +/* + * check the signature which is contained in SIG. + */ +static int ksign_signature_check(const struct ksign_signature *sig, + struct shash_desc *digest) +{ + struct ksign_public_key *pk; + uint8_t digest_buf[SHA_MAX_DIGEST_SIZE]; + MPI result = NULL; + int rc = 0; + + pk = ksign_get_public_key(sig->keyid); + if (!pk) { + printk("ksign: module signed with unknown public key\n"); + printk("- signature keyid: %08x%08x ver=%u\n", + sig->keyid[0], sig->keyid[1], sig->version); + return -ENOKEY; + } + + if (pk->timestamp > sig->timestamp) + printk("ksign:" + " public key is %lu seconds newer than the signature" + " [%lx < %lx]\n", + pk->timestamp - sig->timestamp, + pk->timestamp, sig->timestamp); + + /* complete the digest */ + if (sig->version >= 4) + digest_putc(digest, sig->version); + digest_putc(digest, sig->sig_class); + + if (sig->version < 4) { + u32 a = sig->timestamp; + digest_putc(digest, (a >> 24) & 0xff); + digest_putc(digest, (a >> 16) & 0xff); + digest_putc(digest, (a >> 8) & 0xff); + digest_putc(digest, (a >> 0) & 0xff); + } + else { + uint8_t buf[6]; + size_t n; + digest_putc(digest, PUBKEY_ALGO_DSA); + digest_putc(digest, sig->digest_algo); + if (sig->hashed_data) { + n = (sig->hashed_data[0] << 8) | sig->hashed_data[1]; + digest_write(digest, sig->hashed_data, n + 2); + n += 6; + } + else { + n = 6; + } + + /* add some magic */ + buf[0] = sig->version; + buf[1] = 0xff; + buf[2] = n >> 24; + buf[3] = n >> 16; + buf[4] = n >> 8; + buf[5] = n; + digest_write(digest, buf, 6); + } + + crypto_shash_final(digest, digest_buf); + + rc = -ENOMEM; + result = mpi_alloc((ksign_hash_algo_sizes[sig->digest_algo] + + BYTES_PER_MPI_LIMB - 1) / + BYTES_PER_MPI_LIMB); + if (!result) + goto cleanup; + + rc = mpi_set_buffer(result, digest_buf, ksign_hash_algo_sizes[sig->digest_algo], 0); + if (rc < 0) + goto cleanup; + + rc = DSA_verify(result, sig->data, pk->pkey); + + cleanup: + mpi_free(result); + ksign_put_public_key(pk); + + return rc; +} + +/* + * examine the signatures that are parsed out of the signature data - we keep + * the first one that's appropriate and ignore the rest + * - return 0 if signature of interest (sig not freed by caller) + * - return 1 if no interest (caller frees) + */ +static int ksign_grab_signature(struct ksign_signature *sig, void *fnxdata) +{ + struct ksign_signature **_sig = fnxdata; + + if (sig->sig_class != 0x00) { + _debug("ksign: standalone signature of class 0x%02x\n", + sig->sig_class); + return 1; + } + + if (*_sig) + return 1; + + *_sig = sig; + return 0; +} + +/* + * verify the signature of some data with one of the kernel's known public keys + * - the digest supplied should have the data to be checked already loaded + * in to it + */ +int ksign_verify_signature(const char *sigdata, unsigned sig_size, + struct shash_desc *partial_digest) +{ + struct ksign_signature *sig = NULL; + struct shash_desc *digest = NULL; + uint8_t digest_buf[SHA_MAX_DIGEST_SIZE]; + void *export_buf = NULL; + int retval, loop; + + /* copy the current state of the digest, something that we have to do + * by exporting the old state and importing into the new state + */ + export_buf = kmalloc(crypto_shash_statesize(partial_digest->tfm), + GFP_KERNEL); + if (!export_buf) + return -ENOMEM; + + retval = crypto_shash_export(partial_digest, export_buf); + if (retval < 0) + goto cleanup; + + retval = -ENOMEM; + digest = kmalloc(sizeof(*partial_digest) + + crypto_shash_descsize(partial_digest->tfm), + GFP_KERNEL); + if (!digest) + goto cleanup; + + digest->tfm = partial_digest->tfm; + digest->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + + if (ksign_debug) { + /* print the partial digest for debugging purposes */ + retval = crypto_shash_import(digest, export_buf); + if (retval < 0) + goto cleanup; + + crypto_shash_final(digest, digest_buf); + printk(KERN_WARNING "Modsign digest: "); + for (loop = 0; loop < sizeof(digest_buf); loop++) + printk("%02x", digest_buf[loop]); + printk("\n"); + } + + retval = crypto_shash_import(digest, export_buf); + if (retval < 0) + goto cleanup; + + kfree(export_buf); + export_buf = NULL; + + /* parse the signature data to get the actual signature */ + retval = ksign_parse_packets(sigdata, sig_size, + &ksign_grab_signature, NULL, NULL, + &sig); + if (retval < 0) + goto cleanup; + + if (!sig) { + printk(KERN_NOTICE + "Couldn't find valid DSA signature in module\n"); + retval = -ENOENT; + goto cleanup; + } + + _debug("signature keyid: %08x%08x ver=%u\n", + sig->keyid[0], sig->keyid[1], sig->version); + + /* check the data SHA256 transformation against the public key */ + retval = ksign_signature_check(sig, digest); + switch (retval) { + case 0: + _debug("ksign: Signature check succeeded\n"); + break; + case -ENOMEM: + _debug("ksign: Signature check ENOMEM\n"); + break; + default: + _debug("ksign: Signature check failed: %d\n", retval); + if (retval != -ENOKEY) + retval = -EKEYREJECTED; + break; + } + +cleanup: + if (sig) + ksign_free_signature(sig); + kfree(export_buf); + kfree(digest); + return retval; +} + +/* + * preparse the signature and extract the digest algorithm. + */ +int ksign_get_signature_digest_algo(const char *sigdata, unsigned sig_size, + const char **digest_name) +{ + struct ksign_signature *sig = NULL; + int ret; + + ret = ksign_parse_packets(sigdata, sig_size, + &ksign_grab_signature, NULL, NULL, + &sig); + if (ret < 0) + return ret; + + if (!sig) { + printk(KERN_NOTICE + "Couldn't find valid DSA signature in module\n"); + return -ENOENT; + } + + *digest_name = ksign_hash_algo_names[sig->digest_algo]; + + ksign_free_signature(sig); + return ret; +} diff -uprN linux-2.6.32/crypto/signature/ksign-keyring.c linux-2.6.32.ovz/crypto/signature/ksign-keyring.c --- linux-2.6.32/crypto/signature/ksign-keyring.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/signature/ksign-keyring.c 2015-03-10 13:23:18.000000000 -0700 @@ -0,0 +1,116 @@ +/* ksign-keyring.c: public key cache + * + * Copyright (C) 2001 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This file is derived from part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include +#include "local.h" + +static LIST_HEAD(keyring); +static DECLARE_RWSEM(keyring_sem); + +/* + * handle a public key element parsed from the keyring blob + */ +static int add_keyblock_key(struct ksign_public_key *pk, void *data) +{ + printk("- Added public key %X%X\n", pk->keyid[0], pk->keyid[1]); + + if (pk->expiredate && pk->expiredate < get_seconds()) + printk(" - public key has expired\n"); + + if (pk->timestamp > get_seconds()) + printk(" - key was been created %lu seconds in future\n", + pk->timestamp - get_seconds()); + + atomic_inc(&pk->count); + + down_write(&keyring_sem); + list_add_tail(&pk->link, &keyring); + up_write(&keyring_sem); + + return 0; +} + +/* + * handle a user ID element parsed from the keyring blob + */ +static int add_keyblock_uid(struct ksign_user_id *uid, void *data) +{ + printk("- User ID: %s\n", uid->name); + return 1; +} + +/* + * add the keys from a ASN.1 encoded blob into the keyring + */ +int ksign_load_keyring_from_buffer(const void *buffer, size_t size) +{ + printk("Loading keyring\n"); + + return ksign_parse_packets((const uint8_t *) buffer, + size, + NULL, + add_keyblock_key, + add_keyblock_uid, + NULL); +} + +/* + * find a public key by ID + */ +struct ksign_public_key *ksign_get_public_key(const uint32_t *keyid) +{ + struct ksign_public_key *pk; + + down_read(&keyring_sem); + + list_for_each_entry(pk, &keyring, link) { + if (memcmp(pk->keyid, keyid, sizeof(pk->keyid)) == 0) { + atomic_inc(&pk->count); + goto found; + } + } + + pk = NULL; + +found: + up_read(&keyring_sem); + return pk; +} + +/* + * clear the public-key keyring + */ +void ksign_clear_keyring(void) +{ + struct ksign_public_key *pk; + + down_write(&keyring_sem); + + while (!list_empty(&keyring)) { + pk = list_entry(keyring.next, struct ksign_public_key, link); + list_del(&pk->link); + + ksign_put_public_key(pk); + } + + up_write(&keyring_sem); +} diff -uprN linux-2.6.32/crypto/signature/ksign-parse.c linux-2.6.32.ovz/crypto/signature/ksign-parse.c --- linux-2.6.32/crypto/signature/ksign-parse.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/signature/ksign-parse.c 2015-03-10 13:24:14.000000000 -0700 @@ -0,0 +1,642 @@ +/* parse packet data + * Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc. + * + * This file is part of GnuPG. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include +#include +#include +#include +#include "local.h" + +static inline uint32_t buffer_to_u32(const uint8_t *buffer) +{ + uint32_t a; + a = *buffer << 24; + a |= buffer[1] << 16; + a |= buffer[2] << 8; + a |= buffer[3]; + return a; +} + +static inline uint16_t read_16(const uint8_t **datap) +{ + uint16_t a; + a = *(*datap)++ << 8; + a |= *(*datap)++; + return a; +} + +static inline uint32_t read_32(const uint8_t **datap) +{ + uint32_t a; + a = *(*datap)++ << 24; + a |= *(*datap)++ << 16; + a |= *(*datap)++ << 8; + a |= *(*datap)++; + return a; +} + +void ksign_free_signature(struct ksign_signature *sig) +{ + int i; + + if (sig) { + for (i = 0; i < DSA_NSIG; i++) + mpi_free(sig->data[i]); + kfree(sig->hashed_data); + kfree(sig->unhashed_data); + kfree(sig); + } +} + +void ksign_free_public_key(struct ksign_public_key *pk) +{ + int i; + + if (pk) { + for (i = 0; i < DSA_NPKEY; i++) + mpi_free(pk->pkey[i]); + kfree(pk); + } +} + +void ksign_free_user_id(struct ksign_user_id *uid) +{ + kfree(uid); +} + +/* + * + */ +static void ksign_calc_pk_keyid(struct shash_desc *digest, + struct ksign_public_key *pk) +{ + unsigned n; + unsigned nb[DSA_NPKEY]; + unsigned nn[DSA_NPKEY]; + uint8_t *pp[DSA_NPKEY]; + uint32_t a32; + int i; + int npkey = DSA_NPKEY; + + n = pk->version < 4 ? 8 : 6; + for (i = 0; i < npkey; i++) { + nb[i] = mpi_get_nbits(pk->pkey[i]); + pp[i] = mpi_get_buffer( pk->pkey[i], nn + i, NULL); + n += 2 + nn[i]; + } + + digest_putc(digest, 0x99); /* ctb */ + digest_putc(digest, n >> 8); /* 2 uint8_t length header */ + digest_putc(digest, n); + + if (pk->version < 4) + digest_putc(digest, 3); + else + digest_putc(digest, 4); + + a32 = pk->timestamp; + digest_putc(digest, a32 >> 24 ); + digest_putc(digest, a32 >> 16 ); + digest_putc(digest, a32 >> 8 ); + digest_putc(digest, a32 >> 0 ); + + if (pk->version < 4) { + uint16_t a16; + + if( pk->expiredate ) + a16 = (uint16_t) + ((pk->expiredate - pk->timestamp) / 86400L); + else + a16 = 0; + digest_putc(digest, a16 >> 8); + digest_putc(digest, a16 >> 0); + } + + digest_putc(digest, PUBKEY_ALGO_DSA); + + for (i = 0; i < npkey; i++) { + digest_putc(digest, nb[i] >> 8); + digest_putc(digest, nb[i]); + digest_write(digest, pp[i], nn[i]); + kfree(pp[i]); + } +} + +/* + * parse a user ID embedded in a signature + */ +static int ksign_parse_user_id(const uint8_t *datap, const uint8_t *endp, + ksign_user_id_actor_t uidfnx, void *fnxdata) +{ + struct ksign_user_id *uid; + int rc = 0; + int n; + + if (!uidfnx) + return 0; + + n = endp - datap; + uid = kmalloc(sizeof(*uid) + n + 1, GFP_KERNEL); + if (!uid) + return -ENOMEM; + uid->len = n; + + memcpy(uid->name, datap, n); + uid->name[n] = 0; + + rc = uidfnx(uid, fnxdata); + if (rc == 0) + return rc; /* uidfnx keeps the record */ + if (rc == 1) + rc = 0; + + ksign_free_user_id(uid); + return rc; +} + +/* + * extract a public key embedded in a signature + */ +static int ksign_parse_key(const uint8_t *datap, const uint8_t *endp, + uint8_t *hdr, int hdrlen, + ksign_public_key_actor_t pkfnx, void *fnxdata) +{ + struct ksign_public_key *pk; + struct crypto_shash *tfm; + struct shash_desc *digest; + unsigned long timestamp, expiredate; + uint8_t digest_buf[SHA1_DIGEST_SIZE]; + int i, version; + int is_v4 = 0; + int rc = 0; + + if (endp - datap < 12) { + printk("ksign: public key packet too short\n"); + return -EBADMSG; + } + + version = *datap++; + switch (version) { + case 4: + is_v4 = 1; + case 2: + case 3: + break; + default: + printk("ksign: public key packet with unknown version %d\n", + version); + return -EBADMSG; + } + + timestamp = read_32(&datap); + if (is_v4) { + expiredate = 0; /* have to get it from the selfsignature */ + } else { + unsigned short ndays; + ndays = read_16(&datap); + if (ndays) + expiredate = timestamp + ndays * 86400L; + else + expiredate = 0; + } + + if (*datap++ != PUBKEY_ALGO_DSA) { + printk("ksign: public key packet with unknown version %d\n", + version); + return 0; + } + + /* extract the stuff from the DSA public key */ + pk = kzalloc(sizeof(struct ksign_public_key), GFP_KERNEL); + if (!pk) + return -ENOMEM; + + atomic_set(&pk->count, 1); + pk->timestamp = timestamp; + pk->expiredate = expiredate; + pk->hdrbytes = hdrlen; + pk->version = version; + + for (i = 0; i < DSA_NPKEY; i++) { + unsigned int remaining = endp - datap; + pk->pkey[i] = mpi_read_from_buffer(datap, &remaining); + datap += remaining; + } + + tfm = crypto_alloc_shash("sha1", 0, 0); + if (IS_ERR(tfm)) { + rc = PTR_ERR(tfm); + goto cleanup_pubkey; + } + + rc = -ENOMEM; + digest = kmalloc(sizeof(*digest) + crypto_shash_descsize(tfm), + GFP_KERNEL); + if (!digest) + goto cleanup_tfm; + + digest->tfm = tfm; + digest->flags = CRYPTO_TFM_REQ_MAY_SLEEP; + rc = crypto_shash_init(digest); + if (rc < 0) + goto cleanup_digest; + + ksign_calc_pk_keyid(digest, pk); + + rc = crypto_shash_final(digest, digest_buf); + if (rc < 0) + goto cleanup_digest; + + pk->keyid[0] = (digest_buf[12] << 24 | digest_buf[13] << 16 | + digest_buf[14] << 8 | digest_buf[15]); + pk->keyid[1] = (digest_buf[16] << 24 | digest_buf[17] << 16 | + digest_buf[18] << 8 | digest_buf[19]); + + rc = 0; + if (pkfnx) + rc = pkfnx(pk, fnxdata); + +cleanup_digest: + kfree(digest); +cleanup_tfm: + crypto_free_shash(tfm); +cleanup_pubkey: + ksign_put_public_key(pk); + return rc; +} + +/* + * parse v4 subpacket data + * - RFC 2440: 5.2.3.1 + */ +static int ksign_parse_v4_subpackets(struct ksign_signature *sig, + const uint8_t *datap, + size_t n) +{ + size_t m; + uint8_t type; + + while (n > 0) { + /* extract the length */ + m = *datap++; + n--; + if (m == 0) + continue; + if (m > 192 && m < 255) { + if (n < 192) + goto too_short; + m -= 192; + m <<= 8; + m += *datap++; + m += 192; + n--; + } else if (m == 255) { + if (n < 4) + goto too_short; + m = read_32(&datap); + n -= 4; + if (m == 0) + goto too_short; + } + if (m > n) + return -EBADMSG; + n -= m; + + type = *datap++; + m--; + + switch (type & 0x7f) { + case SIGSUBPKT_SIG_CREATED: + if (m < 4) + goto too_short; + sig->timestamp = read_32(&datap); + sig->have |= KSIGN_HAVE_TIMESTAMP; + break; + + case SIGSUBPKT_ISSUER: + if (m < 8) + goto too_short; + sig->keyid[0] = read_32(&datap); + sig->keyid[1] = read_32(&datap); + sig->have |= KSIGN_HAVE_KEYID; + break; + + default: + datap += m; + break; + } + } + + BUG_ON(n != 0); + return 0; + +too_short: + printk("ksign: signature subpkt too short\n"); + return -EBADMSG; +} + +/* + * extract signature data embedded in a signature + */ +static int ksign_parse_signature(const uint8_t *datap, const uint8_t *endp, + ksign_signature_actor_t sigfnx, void *fnxdata) +{ + struct ksign_signature *sig; + size_t n; + int version, is_v4 = 0; + int rc; + int i; + + if (endp - datap < 16) { + printk("ksign: signature packet too short\n"); + return -EBADMSG; + } + + version = *datap++; + switch (version) { + case 4: + is_v4 = 1; + case 3: + case 2: + break; + default: + printk("ksign: signature packet with unknown version %d\n", + version); + return 0; + } + + /* store information */ + sig = kzalloc(sizeof(*sig), GFP_KERNEL); + if (!sig) + return -ENOMEM; + + sig->version = version; + + if (!is_v4) + datap++; /* ignore md5 length */ + + sig->sig_class = *datap++; + if (!is_v4) { + sig->timestamp = read_32(&datap); + sig->keyid[0] = read_32(&datap); + sig->keyid[1] = read_32(&datap); + sig->have |= KSIGN_HAVE_KEYID | KSIGN_HAVE_TIMESTAMP; + } + + rc = 0; + if (*datap++ != PUBKEY_ALGO_DSA) { + printk("ksign: ignoring non-DSA signature\n"); + goto leave; + } + + sig->digest_algo = *datap++; + if (sig->digest_algo <= 0 || sig->digest_algo >= DIGEST_ALGO__LAST || + !ksign_hash_algo_names[sig->digest_algo]) { + printk("ksign: ignoring unsupported signature hash (%u)\n", + sig->digest_algo); + goto leave; + } + + rc = -EBADMSG; + if (is_v4) { + /* read subpackets */ + n = read_16(&datap); /* length of hashed data */ + if (n > 10000) { + printk("ksign: signature packet:" + " hashed data too long\n"); + goto leave; + } + if (n) { + if ((size_t)(endp - datap) < n) { + printk("ksign: signature packet:" + " available data too short\n"); + goto leave; + } + sig->hashed_data = kmalloc(n + 2, GFP_KERNEL); + if (!sig->hashed_data) { + rc = -ENOMEM; + goto leave; + } + sig->hashed_data[0] = n >> 8; + sig->hashed_data[1] = n; + memcpy(sig->hashed_data + 2, datap, n); + + rc = ksign_parse_v4_subpackets(sig, datap, n); + if (rc < 0) + goto leave; + datap += n; + } + + n = read_16(&datap); /* length of unhashed data */ + if (n > 10000) { + printk("ksign: signature packet:" + " unhashed data too long\n"); + goto leave; + } + if (n) { + if ((size_t) (endp - datap) < n) { + printk("ksign: signature packet:" + " available data too short\n"); + goto leave; + } + sig->unhashed_data = kmalloc(n + 2, GFP_KERNEL); + if (!sig->unhashed_data) { + rc = -ENOMEM; + goto leave; + } + sig->unhashed_data[0] = n >> 8; + sig->unhashed_data[1] = n; + memcpy(sig->unhashed_data + 2, datap, n); + + rc = ksign_parse_v4_subpackets(sig, datap, n); + if (rc < 0) + goto leave; + datap += n; + } + } + + if (endp - datap < 5) { /* sanity check */ + printk("ksign: signature packet too short\n"); + goto leave; + } + + sig->digest_start[0] = *datap++; + sig->digest_start[1] = *datap++; + + if (!(sig->have & KSIGN_HAVE_KEYID)) + printk("ksign: signature packet without issuer\n"); + + for (i = 0; i < DSA_NSIG; i++) { + unsigned remaining = endp - datap; + sig->data[i] = mpi_read_from_buffer(datap, &remaining); + datap += remaining; + } + + rc = 0; + if (sigfnx) { + rc = sigfnx(sig, fnxdata); + if (rc == 0) + return rc; /* sigfnx keeps the signature */ + if (rc == 1) + rc = 0; + } + +leave: + ksign_free_signature(sig); + return rc; +} + +/* + * parse the next packet and call appropriate handler function for known types + * - returns: + * 0 on EOF + * 1 if there might be more packets + * -EBADMSG if the packet is in an invalid format + * -ve on other error + */ +static int ksign_parse_one_packet(const uint8_t **datap, + const uint8_t *endp, + ksign_signature_actor_t sigfnx, + ksign_public_key_actor_t pkfnx, + ksign_user_id_actor_t uidfnx, + void *data) +{ + int rc, c, ctb, pkttype, lenbytes; + unsigned long pktlen; + uint8_t hdr[8]; + int hdrlen; + + /* extract the next packet and dispatch it */ + rc = 0; + if (*datap >= endp) + goto leave; + ctb = *(*datap)++; + + rc = -EBADMSG; + + hdrlen = 0; + hdr[hdrlen++] = ctb; + if (!(ctb & 0x80)) { + printk("ksign: invalid packet (ctb=%02x)\n", ctb); + goto leave; + } + + pktlen = 0; + if (ctb & 0x40) { + pkttype = ctb & 0x3f; + if (*datap >= endp) { + printk("ksign: 1st length byte missing\n"); + goto leave; + } + c = *(*datap)++; + hdr[hdrlen++] = c; + + if (c < 192) { + pktlen = c; + } else if (c < 224) { + pktlen = (c - 192) * 256; + if (*datap >= endp) { + printk("ksign: 2nd length byte missing\n"); + goto leave; + } + c = *(*datap)++; + hdr[hdrlen++] = c; + pktlen += c + 192; + } else if (c == 255) { + if (*datap + 3 >= endp) { + printk("ksign: 4 uint8_t length invalid\n"); + goto leave; + } + pktlen = (hdr[hdrlen++] = *(*datap)++ << 24); + pktlen |= (hdr[hdrlen++] = *(*datap)++ << 16); + pktlen |= (hdr[hdrlen++] = *(*datap)++ << 8); + pktlen |= (hdr[hdrlen++] = *(*datap)++ << 0); + } else { + pktlen = 0;/* to indicate partial length */ + } + } else { + pkttype = (ctb >> 2) & 0xf; + lenbytes = ((ctb & 3) == 3) ? 0 : (1 << (ctb & 3)); + if( !lenbytes ) { + pktlen = 0; /* don't know the value */ + } else { + if (*datap + lenbytes > endp) { + printk("ksign: length bytes missing\n"); + goto leave; + } + for( ; lenbytes; lenbytes-- ) { + pktlen <<= 8; + pktlen |= hdr[hdrlen++] = *(*datap)++; + } + } + } + + if (*datap + pktlen > endp) { + printk("ksign: packet length longer than available data\n"); + goto leave; + } + + /* deal with the next packet appropriately */ + switch (pkttype) { + case PKT_PUBLIC_KEY: + rc = ksign_parse_key(*datap, *datap + pktlen, hdr, hdrlen, + pkfnx, data); + break; + case PKT_SIGNATURE: + rc = ksign_parse_signature(*datap, *datap + pktlen, + sigfnx, data); + break; + case PKT_USER_ID: + rc = ksign_parse_user_id(*datap, *datap + pktlen, + uidfnx, data); + break; + default: + rc = 0; /* unknown packet */ + break; + } + + *datap += pktlen; +leave: + return rc; +} + +/* + * parse the contents of a packet buffer, passing the signature, public key and + * user ID to the caller's callback functions + */ +int ksign_parse_packets(const uint8_t *buf, + size_t size, + ksign_signature_actor_t sigfnx, + ksign_public_key_actor_t pkfnx, + ksign_user_id_actor_t uidfnx, + void *data) +{ + const uint8_t *datap, *endp; + int rc; + + datap = buf; + endp = buf + size; + do { + rc = ksign_parse_one_packet(&datap, endp, + sigfnx, pkfnx, uidfnx, data); + } while (rc == 0 && datap < endp); + + return rc; +} diff -uprN linux-2.6.32/crypto/signature/ksign-publickey.c linux-2.6.32.ovz/crypto/signature/ksign-publickey.c --- linux-2.6.32/crypto/signature/ksign-publickey.c 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/signature/ksign-publickey.c 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,18 @@ +#include "local.h" +#include "key.h" + +static int __init ksign_init(void) +{ + int rc; + + printk("ksign: Installing public key data\n"); + + rc = ksign_load_keyring_from_buffer(ksign_def_public_key, + ksign_def_public_key_size); + if (rc < 0) + printk("Unable to load default keyring: error=%d\n", -rc); + + return rc; +} + +module_init(ksign_init) diff -uprN linux-2.6.32/crypto/signature/local.h linux-2.6.32.ovz/crypto/signature/local.h --- linux-2.6.32/crypto/signature/local.h 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/signature/local.h 2015-03-10 13:24:07.000000000 -0700 @@ -0,0 +1,176 @@ +/* local.h: kernel signature checker internal defs + * + * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * - Derived from GnuPG packet.h - packet definitions + * - Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc. + * + * GnuPG is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GnuPG is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include +#include +#include +#include +#include +#include + +#define SHA1_DIGEST_SIZE (160 / 8) +#define SHA_MAX_DIGEST_SIZE (512 / 8) + +#define PUBKEY_USAGE_SIG 1 /* key is good for signatures */ +#define PUBKEY_USAGE_ENC 2 /* key is good for encryption */ + +#define PUBKEY_ALGO_DSA 17 +#define DSA_NPKEY 4 /* number of MPI's in DSA public key */ +#define DSA_NSIG 2 /* number of MPI's in DSA signature */ + +enum digest_algo { + DIGEST_ALGO_SHA1 = 2, + DIGEST_ALGO_SHA256 = 8, + DIGEST_ALGO_SHA384 = 9, + DIGEST_ALGO_SHA512 = 10, + DIGEST_ALGO_SHA224 = 11, + DIGEST_ALGO__LAST +}; + +extern const char ksign_hash_algo_names[DIGEST_ALGO__LAST][7]; +extern const u16 ksign_hash_algo_sizes[DIGEST_ALGO__LAST]; + +typedef enum { + PKT_NONE = 0, + PKT_SIGNATURE = 2, /* secret key encrypted packet */ + PKT_PUBLIC_KEY = 6, /* public key */ + PKT_USER_ID = 13, /* user id packet */ +} pkttype_t; + +typedef enum { + SIGSUBPKT_TEST_CRITICAL = -3, + SIGSUBPKT_NONE = 0, + SIGSUBPKT_SIG_CREATED = 2, /* signature creation time */ + SIGSUBPKT_SIG_EXPIRE = 3, /* signature expiration time */ + SIGSUBPKT_EXPORTABLE = 4, /* exportable */ + SIGSUBPKT_TRUST = 5, /* trust signature */ + SIGSUBPKT_REGEXP = 6, /* regular expression */ + SIGSUBPKT_REVOCABLE = 7, /* revocable */ + SIGSUBPKT_KEY_EXPIRE = 9, /* key expiration time */ + SIGSUBPKT_ARR = 10, /* additional recipient request */ + SIGSUBPKT_PREF_SYM = 11, /* preferred symmetric algorithms */ + SIGSUBPKT_REV_KEY = 12, /* revocation key */ + SIGSUBPKT_ISSUER = 16, /* issuer key ID */ + SIGSUBPKT_NOTATION = 20, /* notation data */ + SIGSUBPKT_PREF_HASH = 21, /* preferred hash algorithms */ + SIGSUBPKT_PREF_COMPR = 22, /* preferred compression algorithms */ + SIGSUBPKT_KS_FLAGS = 23, /* key server preferences */ + SIGSUBPKT_PREF_KS = 24, /* preferred key server */ + SIGSUBPKT_PRIMARY_UID = 25, /* primary user id */ + SIGSUBPKT_POLICY = 26, /* policy URL */ + SIGSUBPKT_KEY_FLAGS = 27, /* key flags */ + SIGSUBPKT_SIGNERS_UID = 28, /* signer's user id */ + SIGSUBPKT_REVOC_REASON = 29, /* reason for revocation */ + SIGSUBPKT_PRIV_VERIFY_CACHE = 101, /* cache verification result */ + + SIGSUBPKT_FLAG_CRITICAL = 128 +} sigsubpkttype_t; + +/* + * signature record + */ +struct ksign_signature { + uint32_t have; /* list of bits found */ +#define KSIGN_HAVE_KEYID 0x01 +#define KSIGN_HAVE_TIMESTAMP 0x02 + uint32_t keyid[2]; /* 64 bit keyid */ + time_t timestamp; /* signature made */ + uint8_t version; + uint8_t sig_class; /* sig classification, append for MD calculation*/ + enum digest_algo digest_algo : 8; /* digest algorithm */ + uint8_t *hashed_data; /* all subpackets with hashed data (v4 only) */ + uint8_t *unhashed_data; /* ditto for unhashed data */ + uint8_t digest_start[2]; /* first 2 uint8_ts of the digest */ + MPI data[DSA_NSIG]; +}; + +extern void ksign_free_signature(struct ksign_signature *sig); + +/* + * public key record + */ +struct ksign_public_key { + struct list_head link; + atomic_t count; /* ref count */ + time_t timestamp; /* key made */ + time_t expiredate; /* expires at this date or 0 if not at all */ + uint8_t hdrbytes; /* number of header bytes */ + uint8_t version; + int is_valid; /* key (especially subkey) is valid */ + unsigned long local_id; /* internal use, valid if > 0 */ + uint32_t main_keyid[2]; /* keyid of the primary key */ + uint32_t keyid[2]; /* calculated by keyid_from_pk() */ + MPI pkey[DSA_NPKEY]; +}; + +extern void ksign_free_public_key(struct ksign_public_key *pk); + +static inline void ksign_put_public_key(struct ksign_public_key *pk) +{ + if (atomic_dec_and_test(&pk->count)) + ksign_free_public_key(pk); +} + +extern int ksign_load_keyring_from_buffer(const void *buffer, size_t size); + +extern struct ksign_public_key *ksign_get_public_key(const uint32_t *keyid); + +/* + * user ID record + */ +struct ksign_user_id { + int len; /* length of the name */ + char name[0]; +}; + +extern void ksign_free_user_id(struct ksign_user_id *uid); + +/* + * + */ +typedef int (*ksign_signature_actor_t)(struct ksign_signature *, void *fnxdata); +typedef int (*ksign_public_key_actor_t)(struct ksign_public_key *, void *fnxdata); +typedef int (*ksign_user_id_actor_t)(struct ksign_user_id *, void *fnxdata); + +extern int ksign_parse_packets(const uint8_t *buf, + size_t size, + ksign_signature_actor_t sigfnx, + ksign_public_key_actor_t pkfnx, + ksign_user_id_actor_t uidfnx, + void *data); + +extern int DSA_verify(const MPI datahash, const MPI sig[], const MPI pkey[]); + +/* + * fast access to the digest + * - we _know_ the data is locked into kernel memory, so we don't want to have + * to kmap() it + */ +static inline void digest_putc(struct shash_desc *digest, uint8_t ch) +{ + crypto_shash_update(digest, &ch, 1); +} + +static inline void digest_write(struct shash_desc *digest, const void *s, size_t n) +{ + crypto_shash_update(digest, s, n); +} diff -uprN linux-2.6.32/crypto/signature/Makefile linux-2.6.32.ovz/crypto/signature/Makefile --- linux-2.6.32/crypto/signature/Makefile 1969-12-31 16:00:00.000000000 -0800 +++ linux-2.6.32.ovz/crypto/signature/Makefile 2015-03-10 13:20:58.000000000 -0700 @@ -0,0 +1,10 @@ +# +# Makefile for the signature checker +# + +obj-y := \ + ksign.o \ + ksign-parse.o \ + ksign-keyring.o \ + ksign-publickey.o \ + dsa.o diff -uprN linux-2.6.32/crypto/tcrypt.c linux-2.6.32.ovz/crypto/tcrypt.c --- linux-2.6.32/crypto/tcrypt.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/tcrypt.c 2015-03-10 13:24:15.000000000 -0700 @@ -8,6 +8,13 @@ * Copyright (c) 2002 Jean-Francois Dive * Copyright (c) 2007 Nokia Siemens Networks * + * Updated RFC4106 AES-GCM testing. + * Authors: Aidan O'Mahony (aidan.o.mahony@intel.com) + * Adrian Hoban + * Gabriele Paoloni + * Tadeusz Struk (tadeusz.struk@intel.com) + * Copyright (c) 2010, Intel Corporation. + * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) @@ -683,6 +690,10 @@ static int do_test(int m) ret += tcrypt_test("rfc4309(ccm(aes))"); break; + case 47: + ret += tcrypt_test("crct10dif"); + break; + case 100: ret += tcrypt_test("hmac(md5)"); break; @@ -727,6 +738,10 @@ static int do_test(int m) ret += tcrypt_test("ansi_cprng"); break; + case 151: + ret += tcrypt_test("rfc4106(gcm(aes))"); + break; + case 200: test_cipher_speed("ecb(aes)", ENCRYPT, sec, NULL, 0, speed_template_16_24_32); @@ -881,6 +896,14 @@ static int do_test(int m) test_hash_speed("rmd320", sec, generic_hash_speed_template); if (mode > 300 && mode < 400) break; + case 320: + test_hash_speed("crct10dif", sec, generic_hash_speed_template); + if (mode > 300 && mode < 400) break; + + case 319: + test_hash_speed("crc32c", sec, generic_hash_speed_template); + if (mode > 300 && mode < 400) break; + case 399: break; diff -uprN linux-2.6.32/crypto/testmgr.c linux-2.6.32.ovz/crypto/testmgr.c --- linux-2.6.32/crypto/testmgr.c 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/testmgr.c 2015-03-10 13:24:15.000000000 -0700 @@ -6,6 +6,13 @@ * Copyright (c) 2007 Nokia Siemens Networks * Copyright (c) 2008 Herbert Xu * + * Updated RFC4106 AES-GCM testing. + * Authors: Aidan O'Mahony (aidan.o.mahony@intel.com) + * Adrian Hoban + * Gabriele Paoloni + * Tadeusz Struk (tadeusz.struk@intel.com) + * Copyright (c) 2010, Intel Corporation. + * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) @@ -20,6 +27,7 @@ #include #include #include +#include #include "internal.h" #include "testmgr.h" @@ -90,6 +98,11 @@ struct cprng_test_suite { unsigned int count; }; +struct drbg_test_suite { + struct drbg_testvec *vecs; + unsigned int count; +}; + struct alg_test_desc { const char *alg; int (*test)(const struct alg_test_desc *desc, const char *driver, @@ -103,6 +116,7 @@ struct alg_test_desc { struct pcomp_test_suite pcomp; struct hash_test_suite hash; struct cprng_test_suite cprng; + struct drbg_test_suite drbg; } suite; }; @@ -1201,7 +1215,7 @@ static int test_cprng(struct crypto_rng unsigned int tcount) { const char *algo = crypto_tfm_alg_driver_name(crypto_rng_tfm(tfm)); - int err, i, j, seedsize; + int err = 0, i, j, seedsize; u8 *seed; char result[32]; @@ -1477,9 +1491,151 @@ static int alg_test_cprng(const struct a return err; } + +static int drbg_cavs_test(struct drbg_testvec *test, int pr, + const char *driver, u32 type, u32 mask) +{ + int ret = -EAGAIN; + struct crypto_rng *drng; + struct drbg_test_data test_data; + struct drbg_string addtl, pers, testentropy; + unsigned char *buf = kzalloc(test->expectedlen, GFP_KERNEL); + + if (!buf) + return -ENOMEM; + + drng = crypto_alloc_rng(driver, type, mask); + if (IS_ERR(drng)) { + printk(KERN_ERR "alg: drbg: could not allocate DRNG handle for" + "%s\n", driver); + kzfree(buf); + return -ENOMEM; + } + + test_data.testentropy = &testentropy; + drbg_string_fill(&testentropy, test->entropy, test->entropylen); + drbg_string_fill(&pers, test->pers, test->perslen); + ret = crypto_drbg_reset_test(drng, &pers, &test_data); + if (ret) { + printk(KERN_ERR "alg: drbg: Failed to reset rng\n"); + goto outbuf; + } + + drbg_string_fill(&addtl, test->addtla, test->addtllen); + if (pr) { + drbg_string_fill(&testentropy, test->entpra, test->entprlen); + ret = crypto_drbg_get_bytes_addtl_test(drng, + buf, test->expectedlen, &addtl, &test_data); + } else { + ret = crypto_drbg_get_bytes_addtl(drng, + buf, test->expectedlen, &addtl); + } + if (ret <= 0) { + printk(KERN_ERR "alg: drbg: could not obtain random data for" + "driver %s\n", driver); + goto outbuf; + } + + drbg_string_fill(&addtl, test->addtlb, test->addtllen); + if (pr) { + drbg_string_fill(&testentropy, test->entprb, test->entprlen); + ret = crypto_drbg_get_bytes_addtl_test(drng, + buf, test->expectedlen, &addtl, &test_data); + } else { + ret = crypto_drbg_get_bytes_addtl(drng, + buf, test->expectedlen, &addtl); + } + if (ret <= 0) { + printk(KERN_ERR "alg: drbg: could not obtain random data for" + "driver %s\n", driver); + goto outbuf; + } + + ret = memcmp(test->expected, buf, test->expectedlen); + +outbuf: + crypto_free_rng(drng); + kzfree(buf); + return ret; +} + + +static int alg_test_drbg(const struct alg_test_desc *desc, const char *driver, + u32 type, u32 mask) +{ + int err = 0; + int pr = 0; + int i = 0; + struct drbg_testvec *template = desc->suite.drbg.vecs; + unsigned int tcount = desc->suite.drbg.count; + + if (0 == memcmp(driver, "drbg_pr_", 8)) + pr = 1; + + for (i = 0; i < tcount; i++) { + err = drbg_cavs_test(&template[i], pr, driver, type, mask); + if (err) { + printk(KERN_ERR "alg: drbg: Test %d failed for %s\n", + i, driver); + err = -EINVAL; + break; + } + } + return err; + +} + +static int alg_test_null(const struct alg_test_desc *desc, + const char *driver, u32 type, u32 mask) +{ + return 0; +} + /* Please keep this list sorted by algorithm name. */ static const struct alg_test_desc alg_test_descs[] = { { + .alg = "__driver-cbc-aes-aesni", + .test = alg_test_null, + .fips_allowed = 1, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } + }, { + .alg = "__driver-ecb-aes-aesni", + .test = alg_test_null, + .fips_allowed = 1, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } + }, { + .alg = "__ghash-pclmulqdqni", + .test = alg_test_null, + .fips_allowed = 1, + .suite = { + .hash = { + .vecs = NULL, + .count = 0 + } + } + }, { .alg = "ansi_cprng", .test = alg_test_cprng, .fips_allowed = 1, @@ -1623,6 +1779,74 @@ static const struct alg_test_desc alg_te } } }, { + .alg = "crct10dif", + .test = alg_test_hash, + .fips_allowed = 1, + .suite = { + .hash = { + .vecs = crct10dif_tv_template, + .count = CRCT10DIF_TEST_VECTORS + } + } + }, { + .alg = "cryptd(__driver-cbc-aes-aesni)", + .test = alg_test_null, + .fips_allowed = 1, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } + }, { + .alg = "cryptd(__driver-ecb-aes-aesni)", + .test = alg_test_null, + .fips_allowed = 1, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } + }, { + .alg = "cryptd(__driver-gcm-aes-aesni)", + .test = alg_test_null, + .fips_allowed = 1, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } + }, { + .alg = "cryptd(__ghash-pclmulqdqni)", + .test = alg_test_null, + .fips_allowed = 1, + .suite = { + .hash = { + .vecs = NULL, + .count = 0 + } + } + }, { .alg = "ctr(aes)", .test = alg_test_skcipher, .fips_allowed = 1, @@ -1669,6 +1893,168 @@ static const struct alg_test_desc alg_te } } }, { + .alg = "drbg_nopr_ctr_aes128", + .test = alg_test_drbg, + .fips_allowed = 1, + .suite = { + .drbg = { + .vecs = drbg_nopr_ctr_aes128_tv_template, + .count = ARRAY_SIZE(drbg_nopr_ctr_aes128_tv_template) + } + } + }, { + .alg = "drbg_nopr_ctr_aes192", + .test = alg_test_drbg, + .fips_allowed = 1, + .suite = { + .drbg = { + .vecs = drbg_nopr_ctr_aes192_tv_template, + .count = ARRAY_SIZE(drbg_nopr_ctr_aes192_tv_template) + } + } + }, { + .alg = "drbg_nopr_ctr_aes256", + .test = alg_test_drbg, + .fips_allowed = 1, + .suite = { + .drbg = { + .vecs = drbg_nopr_ctr_aes256_tv_template, + .count = ARRAY_SIZE(drbg_nopr_ctr_aes256_tv_template) + } + } + }, { + /* + * There is no need to specifically test the DRBG with every + * backend cipher -- covered by drbg_nopr_hmac_sha256 test + */ + .alg = "drbg_nopr_hmac_sha1", + .fips_allowed = 1, + .test = alg_test_null, + }, { + .alg = "drbg_nopr_hmac_sha256", + .test = alg_test_drbg, + .fips_allowed = 1, + .suite = { + .drbg = { + .vecs = drbg_nopr_hmac_sha256_tv_template, + .count = + ARRAY_SIZE(drbg_nopr_hmac_sha256_tv_template) + } + } + }, { + /* covered by drbg_nopr_hmac_sha256 test */ + .alg = "drbg_nopr_hmac_sha384", + .fips_allowed = 1, + .test = alg_test_null, + }, { + .alg = "drbg_nopr_hmac_sha512", + .test = alg_test_null, + .fips_allowed = 1, + }, { + .alg = "drbg_nopr_sha1", + .fips_allowed = 1, + .test = alg_test_null, + }, { + .alg = "drbg_nopr_sha256", + .test = alg_test_drbg, + .fips_allowed = 1, + .suite = { + .drbg = { + .vecs = drbg_nopr_sha256_tv_template, + .count = ARRAY_SIZE(drbg_nopr_sha256_tv_template) + } + } + }, { + /* covered by drbg_nopr_sha256 test */ + .alg = "drbg_nopr_sha384", + .fips_allowed = 1, + .test = alg_test_null, + }, { + .alg = "drbg_nopr_sha512", + .fips_allowed = 1, + .test = alg_test_null, + }, { + .alg = "drbg_pr_ctr_aes128", + .test = alg_test_drbg, + .fips_allowed = 1, + .suite = { + .drbg = { + .vecs = drbg_pr_ctr_aes128_tv_template, + .count = ARRAY_SIZE(drbg_pr_ctr_aes128_tv_template) + } + } + }, { + /* covered by drbg_pr_ctr_aes128 test */ + .alg = "drbg_pr_ctr_aes192", + .fips_allowed = 1, + .test = alg_test_null, + }, { + .alg = "drbg_pr_ctr_aes256", + .fips_allowed = 1, + .test = alg_test_null, + }, { + .alg = "drbg_pr_hmac_sha1", + .fips_allowed = 1, + .test = alg_test_null, + }, { + .alg = "drbg_pr_hmac_sha256", + .test = alg_test_drbg, + .fips_allowed = 1, + .suite = { + .drbg = { + .vecs = drbg_pr_hmac_sha256_tv_template, + .count = ARRAY_SIZE(drbg_pr_hmac_sha256_tv_template) + } + } + }, { + /* covered by drbg_pr_hmac_sha256 test */ + .alg = "drbg_pr_hmac_sha384", + .fips_allowed = 1, + .test = alg_test_null, + }, { + .alg = "drbg_pr_hmac_sha512", + .test = alg_test_null, + .fips_allowed = 1, + }, { + .alg = "drbg_pr_sha1", + .fips_allowed = 1, + .test = alg_test_null, + }, { + .alg = "drbg_pr_sha256", + .test = alg_test_drbg, + .fips_allowed = 1, + .suite = { + .drbg = { + .vecs = drbg_pr_sha256_tv_template, + .count = ARRAY_SIZE(drbg_pr_sha256_tv_template) + } + } + }, { + /* covered by drbg_pr_sha256 test */ + .alg = "drbg_pr_sha384", + .fips_allowed = 1, + .test = alg_test_null, + }, { + .alg = "drbg_pr_sha512", + .fips_allowed = 1, + .test = alg_test_null, + }, { + .alg = "ecb(__aes-aesni)", + .test = alg_test_null, + .fips_allowed = 1, + .suite = { + .cipher = { + .enc = { + .vecs = NULL, + .count = 0 + }, + .dec = { + .vecs = NULL, + .count = 0 + } + } + } + }, { .alg = "ecb(aes)", .test = alg_test_skcipher, .fips_allowed = 1, @@ -1943,6 +2329,16 @@ static const struct alg_test_desc alg_te } } }, { + .alg = "ghash", + .test = alg_test_hash, + .fips_allowed = 1, + .suite = { + .hash = { + .vecs = ghash_tv_template, + .count = GHASH_TEST_VECTORS + } + } + }, { .alg = "hmac(md5)", .test = alg_test_hash, .suite = { @@ -2108,6 +2504,24 @@ static const struct alg_test_desc alg_te } } }, { + .alg = "rfc4106(gcm(aes))", + .test = alg_test_aead, + .fips_allowed = 1, + .suite = { + .aead = { + .enc = { + .vecs = aes_gcm_rfc4106_enc_tv_template, + .count = AES_GCM_4106_ENC_TEST_VECTORS + }, + .dec = { + .vecs = aes_gcm_rfc4106_dec_tv_template, + .count = AES_GCM_4106_DEC_TEST_VECTORS + } + } + } + }, { + + .alg = "rfc4309(ccm(aes))", .test = alg_test_aead, .fips_allowed = 1, @@ -2295,6 +2709,7 @@ static const struct alg_test_desc alg_te }, { .alg = "xts(aes)", .test = alg_test_skcipher, + .fips_allowed = 1, .suite = { .cipher = { .enc = { diff -uprN linux-2.6.32/crypto/testmgr.h linux-2.6.32.ovz/crypto/testmgr.h --- linux-2.6.32/crypto/testmgr.h 2009-12-02 19:51:21.000000000 -0800 +++ linux-2.6.32.ovz/crypto/testmgr.h 2015-03-10 13:24:15.000000000 -0700 @@ -6,6 +6,15 @@ * Copyright (c) 2007 Nokia Siemens Networks * Copyright (c) 2008 Herbert Xu * + * Updated RFC4106 AES-GCM testing. Some test vectors were taken from + * http://csrc.nist.gov/groups/ST/toolkit/BCM/documents/proposedmodes/ + * gcm/gcm-test-vectors.tar.gz + * Authors: Aidan O'Mahony (aidan.o.mahony@intel.com) + * Adrian Hoban + * Gabriele Paoloni + * Tadeusz Struk (tadeusz.struk@intel.com) + * Copyright (c) 2010, Intel Corporation. + * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free * Software Foundation; either version 2 of the License, or (at your option) @@ -32,7 +41,7 @@ struct hash_testvec { char *plaintext; char *digest; unsigned char tap[MAX_TAP]; - unsigned char psize; + unsigned short psize; unsigned char np; unsigned char ksize; }; @@ -82,6 +91,21 @@ struct cprng_testvec { unsigned short loops; }; +struct drbg_testvec { + unsigned char *entropy; + size_t entropylen; + unsigned char *entpra; + unsigned char *entprb; + size_t entprlen; + unsigned char *addtla; + unsigned char *addtlb; + size_t addtllen; + unsigned char *pers; + size_t perslen; + unsigned char *expected; + size_t expectedlen; +}; + static char zeroed_string[48]; /* @@ -440,6 +464,39 @@ static struct hash_testvec rmd320_tv_tem } }; +#define CRCT10DIF_TEST_VECTORS 3 +static struct hash_testvec crct10dif_tv_template[] = { + { + .plaintext = "abc", + .psize = 3, +#ifdef __LITTLE_ENDIAN + .digest = "\x3b\x44", +#else + .digest = "\x44\x3b", +#endif + }, { + .plaintext = "1234567890123456789012345678901234567890" + "123456789012345678901234567890123456789", + .psize = 79, +#ifdef __LITTLE_ENDIAN + .digest = "\x70\x4b", +#else + .digest = "\x4b\x70", +#endif + }, { + .plaintext = + "abcddddddddddddddddddddddddddddddddddddddddddddddddddddd", + .psize = 56, +#ifdef __LITTLE_ENDIAN + .digest = "\xe3\x9c", +#else + .digest = "\x9c\xe3", +#endif + .np = 2, + .tap = { 28, 28 } + } +}; + /* * SHA1 test vectors from from FIPS PUB 180-1 */ @@ -1003,6 +1060,21 @@ static struct hash_testvec tgr128_tv_tem }, }; +#define GHASH_TEST_VECTORS 1 + +static struct hash_testvec ghash_tv_template[] = +{ + { + + .key = "\xdf\xa6\xbf\x4d\xed\x81\xdb\x03\xff\xca\xff\x95\xf8\x30\xf0\x61", + .ksize = 16, + .plaintext = "\x95\x2b\x2a\x56\xa5\x60\x04a\xc0\xb3\x2b\x66\x56\xa0\x5b\x40\xb6", + .psize = 16, + .digest = "\xda\x53\xeb\x0a\xd2\xc5\x5b\xb6" + "\x4f\xc4\x80\x2c\xc3\xfe\xda\x60", + }, +}; + /* * HMAC-MD5 test vectors from RFC2202 * (These need to be fixed to not use strlen). @@ -1654,17 +1726,73 @@ static struct hash_testvec aes_xcbc128_t } }; -#define VMAC_AES_TEST_VECTORS 1 -static char vmac_string[128] = {'\x01', '\x01', '\x01', '\x01', +#define VMAC_AES_TEST_VECTORS 8 +static char vmac_string1[128] = {'\x01', '\x01', '\x01', '\x01', '\x02', '\x03', '\x02', '\x02', '\x02', '\x04', '\x01', '\x07', '\x04', '\x01', '\x04', '\x03',}; +static char vmac_string2[128] = {'a', 'b', 'c',}; +static char vmac_string3[128] = {'a', 'b', 'c', 'a', 'b', 'c', + 'a', 'b', 'c', 'a', 'b', 'c', + 'a', 'b', 'c', 'a', 'b', 'c', + 'a', 'b', 'c', 'a', 'b', 'c', + 'a', 'b', 'c', 'a', 'b', 'c', + 'a', 'b', 'c', 'a', 'b', 'c', + 'a', 'b', 'c', 'a', 'b', 'c', + 'a', 'b', 'c', 'a', 'b', 'c', + }; + static struct hash_testvec aes_vmac128_tv_template[] = { { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + .plaintext = NULL, + .digest = "\x07\x58\x80\x35\x77\xa4\x7b\x54", + .psize = 0, + .ksize = 16, + }, { .key = "\x00\x01\x02\x03\x04\x05\x06\x07" "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", - .plaintext = vmac_string, - .digest = "\xcb\xd7\x8a\xfd\xb7\x33\x79\xe7", + .plaintext = vmac_string1, + .digest = "\xce\xf5\x3c\xd3\xae\x68\x8c\xa1", + .psize = 128, + .ksize = 16, + }, { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + .plaintext = vmac_string2, + .digest = "\xc9\x27\xb0\x73\x81\xbd\x14\x2d", + .psize = 128, + .ksize = 16, + }, { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + .plaintext = vmac_string3, + .digest = "\x8d\x1a\x95\x8c\x98\x47\x0b\x19", + .psize = 128, + .ksize = 16, + }, { + .key = "abcdefghijklmnop", + .plaintext = NULL, + .digest = "\x3b\x89\xa1\x26\x9e\x55\x8f\x84", + .psize = 0, + .ksize = 16, + }, { + .key = "abcdefghijklmnop", + .plaintext = vmac_string1, + .digest = "\xab\x5e\xab\xb0\xf6\x8d\x74\xc2", + .psize = 128, + .ksize = 16, + }, { + .key = "abcdefghijklmnop", + .plaintext = vmac_string2, + .digest = "\x11\x15\x68\x42\x3d\x7b\x09\xdf", + .psize = 128, + .ksize = 16, + }, { + .key = "abcdefghijklmnop", + .plaintext = vmac_string3, + .digest = "\x8b\x32\x8f\xe1\xed\x8f\xfa\xd4", .psize = 128, .ksize = 16, }, @@ -2868,14 +2996,16 @@ static struct cipher_testvec cast6_dec_t #define AES_CBC_DEC_TEST_VECTORS 4 #define AES_LRW_ENC_TEST_VECTORS 8 #define AES_LRW_DEC_TEST_VECTORS 8 -#define AES_XTS_ENC_TEST_VECTORS 4 -#define AES_XTS_DEC_TEST_VECTORS 4 +#define AES_XTS_ENC_TEST_VECTORS 5 +#define AES_XTS_DEC_TEST_VECTORS 5 #define AES_CTR_ENC_TEST_VECTORS 3 #define AES_CTR_DEC_TEST_VECTORS 3 #define AES_CTR_3686_ENC_TEST_VECTORS 7 #define AES_CTR_3686_DEC_TEST_VECTORS 6 #define AES_GCM_ENC_TEST_VECTORS 9 #define AES_GCM_DEC_TEST_VECTORS 8 +#define AES_GCM_4106_ENC_TEST_VECTORS 7 +#define AES_GCM_4106_DEC_TEST_VECTORS 7 #define AES_CCM_ENC_TEST_VECTORS 7 #define AES_CCM_DEC_TEST_VECTORS 7 #define AES_CCM_4309_ENC_TEST_VECTORS 7 @@ -3814,6 +3944,150 @@ static struct cipher_testvec aes_xts_enc "\x0a\x28\x2d\xf9\x20\x14\x7b\xea" "\xbe\x42\x1e\xe5\x31\x9d\x05\x68", .rlen = 512, + }, { /* XTS-AES 10, XTS-AES-256, data unit 512 bytes */ + .key = "\x27\x18\x28\x18\x28\x45\x90\x45" + "\x23\x53\x60\x28\x74\x71\x35\x26" + "\x62\x49\x77\x57\x24\x70\x93\x69" + "\x99\x59\x57\x49\x66\x96\x76\x27" + "\x31\x41\x59\x26\x53\x58\x97\x93" + "\x23\x84\x62\x64\x33\x83\x27\x95" + "\x02\x88\x41\x97\x16\x93\x99\x37" + "\x51\x05\x82\x09\x74\x94\x45\x92", + .klen = 64, + .iv = "\xff\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00", + .input = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17" + "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x21\x22\x23\x24\x25\x26\x27" + "\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" + "\x30\x31\x32\x33\x34\x35\x36\x37" + "\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" + "\x40\x41\x42\x43\x44\x45\x46\x47" + "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f" + "\x50\x51\x52\x53\x54\x55\x56\x57" + "\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f" + "\x60\x61\x62\x63\x64\x65\x66\x67" + "\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" + "\x70\x71\x72\x73\x74\x75\x76\x77" + "\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" + "\x80\x81\x82\x83\x84\x85\x86\x87" + "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97" + "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" + "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7" + "\xa8\xa9\xaa\xab\xac\xad\xae\xaf" + "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7" + "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" + "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7" + "\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" + "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7" + "\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" + "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7" + "\xe8\xe9\xea\xeb\xec\xed\xee\xef" + "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7" + "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff" + "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17" + "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x21\x22\x23\x24\x25\x26\x27" + "\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" + "\x30\x31\x32\x33\x34\x35\x36\x37" + "\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" + "\x40\x41\x42\x43\x44\x45\x46\x47" + "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f" + "\x50\x51\x52\x53\x54\x55\x56\x57" + "\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f" + "\x60\x61\x62\x63\x64\x65\x66\x67" + "\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" + "\x70\x71\x72\x73\x74\x75\x76\x77" + "\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" + "\x80\x81\x82\x83\x84\x85\x86\x87" + "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97" + "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" + "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7" + "\xa8\xa9\xaa\xab\xac\xad\xae\xaf" + "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7" + "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" + "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7" + "\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" + "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7" + "\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" + "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7" + "\xe8\xe9\xea\xeb\xec\xed\xee\xef" + "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7" + "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", + .ilen = 512, + .result = "\x1c\x3b\x3a\x10\x2f\x77\x03\x86" + "\xe4\x83\x6c\x99\xe3\x70\xcf\x9b" + "\xea\x00\x80\x3f\x5e\x48\x23\x57" + "\xa4\xae\x12\xd4\x14\xa3\xe6\x3b" + "\x5d\x31\xe2\x76\xf8\xfe\x4a\x8d" + "\x66\xb3\x17\xf9\xac\x68\x3f\x44" + "\x68\x0a\x86\xac\x35\xad\xfc\x33" + "\x45\xbe\xfe\xcb\x4b\xb1\x88\xfd" + "\x57\x76\x92\x6c\x49\xa3\x09\x5e" + "\xb1\x08\xfd\x10\x98\xba\xec\x70" + "\xaa\xa6\x69\x99\xa7\x2a\x82\xf2" + "\x7d\x84\x8b\x21\xd4\xa7\x41\xb0" + "\xc5\xcd\x4d\x5f\xff\x9d\xac\x89" + "\xae\xba\x12\x29\x61\xd0\x3a\x75" + "\x71\x23\xe9\x87\x0f\x8a\xcf\x10" + "\x00\x02\x08\x87\x89\x14\x29\xca" + "\x2a\x3e\x7a\x7d\x7d\xf7\xb1\x03" + "\x55\x16\x5c\x8b\x9a\x6d\x0a\x7d" + "\xe8\xb0\x62\xc4\x50\x0d\xc4\xcd" + "\x12\x0c\x0f\x74\x18\xda\xe3\xd0" + "\xb5\x78\x1c\x34\x80\x3f\xa7\x54" + "\x21\xc7\x90\xdf\xe1\xde\x18\x34" + "\xf2\x80\xd7\x66\x7b\x32\x7f\x6c" + "\x8c\xd7\x55\x7e\x12\xac\x3a\x0f" + "\x93\xec\x05\xc5\x2e\x04\x93\xef" + "\x31\xa1\x2d\x3d\x92\x60\xf7\x9a" + "\x28\x9d\x6a\x37\x9b\xc7\x0c\x50" + "\x84\x14\x73\xd1\xa8\xcc\x81\xec" + "\x58\x3e\x96\x45\xe0\x7b\x8d\x96" + "\x70\x65\x5b\xa5\xbb\xcf\xec\xc6" + "\xdc\x39\x66\x38\x0a\xd8\xfe\xcb" + "\x17\xb6\xba\x02\x46\x9a\x02\x0a" + "\x84\xe1\x8e\x8f\x84\x25\x20\x70" + "\xc1\x3e\x9f\x1f\x28\x9b\xe5\x4f" + "\xbc\x48\x14\x57\x77\x8f\x61\x60" + "\x15\xe1\x32\x7a\x02\xb1\x40\xf1" + "\x50\x5e\xb3\x09\x32\x6d\x68\x37" + "\x8f\x83\x74\x59\x5c\x84\x9d\x84" + "\xf4\xc3\x33\xec\x44\x23\x88\x51" + "\x43\xcb\x47\xbd\x71\xc5\xed\xae" + "\x9b\xe6\x9a\x2f\xfe\xce\xb1\xbe" + "\xc9\xde\x24\x4f\xbe\x15\x99\x2b" + "\x11\xb7\x7c\x04\x0f\x12\xbd\x8f" + "\x6a\x97\x5a\x44\xa0\xf9\x0c\x29" + "\xa9\xab\xc3\xd4\xd8\x93\x92\x72" + "\x84\xc5\x87\x54\xcc\xe2\x94\x52" + "\x9f\x86\x14\xdc\xd2\xab\xa9\x91" + "\x92\x5f\xed\xc4\xae\x74\xff\xac" + "\x6e\x33\x3b\x93\xeb\x4a\xff\x04" + "\x79\xda\x9a\x41\x0e\x44\x50\xe0" + "\xdd\x7a\xe4\xc6\xe2\x91\x09\x00" + "\x57\x5d\xa4\x01\xfc\x07\x05\x9f" + "\x64\x5e\x8b\x7e\x9b\xfd\xef\x33" + "\x94\x30\x54\xff\x84\x01\x14\x93" + "\xc2\x7b\x34\x29\xea\xed\xb4\xed" + "\x53\x76\x44\x1a\x77\xed\x43\x85" + "\x1a\xd7\x7f\x16\xf5\x41\xdf\xd2" + "\x69\xd5\x0d\x6a\x5f\x14\xfb\x0a" + "\xab\x1c\xbb\x4c\x15\x50\xbe\x97" + "\xf7\xab\x40\x66\x19\x3c\x4c\xaa" + "\x77\x3d\xad\x38\x01\x4b\xd2\x09" + "\x2f\xa7\x55\xc8\x24\xbb\x5e\x54" + "\xc4\xf3\x6f\xfd\xa9\xfc\xea\x70" + "\xb9\xc6\xe6\x93\xe1\x48\xc1\x51", + .rlen = 512, } }; @@ -4011,6 +4285,151 @@ static struct cipher_testvec aes_xts_dec "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7" "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", .rlen = 512, + }, { /* XTS-AES 10, XTS-AES-256, data unit 512 bytes */ + .key = "\x27\x18\x28\x18\x28\x45\x90\x45" + "\x23\x53\x60\x28\x74\x71\x35\x26" + "\x62\x49\x77\x57\x24\x70\x93\x69" + "\x99\x59\x57\x49\x66\x96\x76\x27" + "\x31\x41\x59\x26\x53\x58\x97\x93" + "\x23\x84\x62\x64\x33\x83\x27\x95" + "\x02\x88\x41\x97\x16\x93\x99\x37" + "\x51\x05\x82\x09\x74\x94\x45\x92", + .klen = 64, + .iv = "\xff\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00", + .input = "\x1c\x3b\x3a\x10\x2f\x77\x03\x86" + "\xe4\x83\x6c\x99\xe3\x70\xcf\x9b" + "\xea\x00\x80\x3f\x5e\x48\x23\x57" + "\xa4\xae\x12\xd4\x14\xa3\xe6\x3b" + "\x5d\x31\xe2\x76\xf8\xfe\x4a\x8d" + "\x66\xb3\x17\xf9\xac\x68\x3f\x44" + "\x68\x0a\x86\xac\x35\xad\xfc\x33" + "\x45\xbe\xfe\xcb\x4b\xb1\x88\xfd" + "\x57\x76\x92\x6c\x49\xa3\x09\x5e" + "\xb1\x08\xfd\x10\x98\xba\xec\x70" + "\xaa\xa6\x69\x99\xa7\x2a\x82\xf2" + "\x7d\x84\x8b\x21\xd4\xa7\x41\xb0" + "\xc5\xcd\x4d\x5f\xff\x9d\xac\x89" + "\xae\xba\x12\x29\x61\xd0\x3a\x75" + "\x71\x23\xe9\x87\x0f\x8a\xcf\x10" + "\x00\x02\x08\x87\x89\x14\x29\xca" + "\x2a\x3e\x7a\x7d\x7d\xf7\xb1\x03" + "\x55\x16\x5c\x8b\x9a\x6d\x0a\x7d" + "\xe8\xb0\x62\xc4\x50\x0d\xc4\xcd" + "\x12\x0c\x0f\x74\x18\xda\xe3\xd0" + "\xb5\x78\x1c\x34\x80\x3f\xa7\x54" + "\x21\xc7\x90\xdf\xe1\xde\x18\x34" + "\xf2\x80\xd7\x66\x7b\x32\x7f\x6c" + "\x8c\xd7\x55\x7e\x12\xac\x3a\x0f" + "\x93\xec\x05\xc5\x2e\x04\x93\xef" + "\x31\xa1\x2d\x3d\x92\x60\xf7\x9a" + "\x28\x9d\x6a\x37\x9b\xc7\x0c\x50" + "\x84\x14\x73\xd1\xa8\xcc\x81\xec" + "\x58\x3e\x96\x45\xe0\x7b\x8d\x96" + "\x70\x65\x5b\xa5\xbb\xcf\xec\xc6" + "\xdc\x39\x66\x38\x0a\xd8\xfe\xcb" + "\x17\xb6\xba\x02\x46\x9a\x02\x0a" + "\x84\xe1\x8e\x8f\x84\x25\x20\x70" + "\xc1\x3e\x9f\x1f\x28\x9b\xe5\x4f" + "\xbc\x48\x14\x57\x77\x8f\x61\x60" + "\x15\xe1\x32\x7a\x02\xb1\x40\xf1" + "\x50\x5e\xb3\x09\x32\x6d\x68\x37" + "\x8f\x83\x74\x59\x5c\x84\x9d\x84" + "\xf4\xc3\x33\xec\x44\x23\x88\x51" + "\x43\xcb\x47\xbd\x71\xc5\xed\xae" + "\x9b\xe6\x9a\x2f\xfe\xce\xb1\xbe" + "\xc9\xde\x24\x4f\xbe\x15\x99\x2b" + "\x11\xb7\x7c\x04\x0f\x12\xbd\x8f" + "\x6a\x97\x5a\x44\xa0\xf9\x0c\x29" + "\xa9\xab\xc3\xd4\xd8\x93\x92\x72" + "\x84\xc5\x87\x54\xcc\xe2\x94\x52" + "\x9f\x86\x14\xdc\xd2\xab\xa9\x91" + "\x92\x5f\xed\xc4\xae\x74\xff\xac" + "\x6e\x33\x3b\x93\xeb\x4a\xff\x04" + "\x79\xda\x9a\x41\x0e\x44\x50\xe0" + "\xdd\x7a\xe4\xc6\xe2\x91\x09\x00" + "\x57\x5d\xa4\x01\xfc\x07\x05\x9f" + "\x64\x5e\x8b\x7e\x9b\xfd\xef\x33" + "\x94\x30\x54\xff\x84\x01\x14\x93" + "\xc2\x7b\x34\x29\xea\xed\xb4\xed" + "\x53\x76\x44\x1a\x77\xed\x43\x85" + "\x1a\xd7\x7f\x16\xf5\x41\xdf\xd2" + "\x69\xd5\x0d\x6a\x5f\x14\xfb\x0a" + "\xab\x1c\xbb\x4c\x15\x50\xbe\x97" + "\xf7\xab\x40\x66\x19\x3c\x4c\xaa" + "\x77\x3d\xad\x38\x01\x4b\xd2\x09" + "\x2f\xa7\x55\xc8\x24\xbb\x5e\x54" + "\xc4\xf3\x6f\xfd\xa9\xfc\xea\x70" + "\xb9\xc6\xe6\x93\xe1\x48\xc1\x51", + .ilen = 512, + .result = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17" + "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x21\x22\x23\x24\x25\x26\x27" + "\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" + "\x30\x31\x32\x33\x34\x35\x36\x37" + "\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" + "\x40\x41\x42\x43\x44\x45\x46\x47" + "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f" + "\x50\x51\x52\x53\x54\x55\x56\x57" + "\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f" + "\x60\x61\x62\x63\x64\x65\x66\x67" + "\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" + "\x70\x71\x72\x73\x74\x75\x76\x77" + "\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" + "\x80\x81\x82\x83\x84\x85\x86\x87" + "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97" + "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" + "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7" + "\xa8\xa9\xaa\xab\xac\xad\xae\xaf" + "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7" + "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" + "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7" + "\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" + "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7" + "\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" + "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7" + "\xe8\xe9\xea\xeb\xec\xed\xee\xef" + "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7" + "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff" + "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x10\x11\x12\x13\x14\x15\x16\x17" + "\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" + "\x20\x21\x22\x23\x24\x25\x26\x27" + "\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f" + "\x30\x31\x32\x33\x34\x35\x36\x37" + "\x38\x39\x3a\x3b\x3c\x3d\x3e\x3f" + "\x40\x41\x42\x43\x44\x45\x46\x47" + "\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f" + "\x50\x51\x52\x53\x54\x55\x56\x57" + "\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f" + "\x60\x61\x62\x63\x64\x65\x66\x67" + "\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f" + "\x70\x71\x72\x73\x74\x75\x76\x77" + "\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f" + "\x80\x81\x82\x83\x84\x85\x86\x87" + "\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f" + "\x90\x91\x92\x93\x94\x95\x96\x97" + "\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f" + "\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7" + "\xa8\xa9\xaa\xab\xac\xad\xae\xaf" + "\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7" + "\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf" + "\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7" + "\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf" + "\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7" + "\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf" + "\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7" + "\xe8\xe9\xea\xeb\xec\xed\xee\xef" + "\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7" + "\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff", + .rlen = 512, + } }; @@ -5758,6 +6177,356 @@ static struct aead_testvec aes_gcm_dec_t } }; +static struct aead_testvec aes_gcm_rfc4106_enc_tv_template[] = { + { /* Generated using Crypto++ */ + .key = zeroed_string, + .klen = 20, + .iv = zeroed_string, + .input = zeroed_string, + .ilen = 16, + .assoc = zeroed_string, + .alen = 8, + .result = "\x03\x88\xDA\xCE\x60\xB6\xA3\x92" + "\xF3\x28\xC2\xB9\x71\xB2\xFE\x78" + "\x97\xFE\x4C\x23\x37\x42\x01\xE0" + "\x81\x9F\x8D\xC5\xD7\x41\xA0\x1B", + .rlen = 32, + },{ + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\x00\x00\x00\x00", + .klen = 20, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x01" + "\x00\x00\x00\x00", + .input = zeroed_string, + .ilen = 16, + .assoc = zeroed_string, + .alen = 8, + .result = "\xC0\x0D\x8B\x42\x0F\x8F\x34\x18" + "\x88\xB1\xC5\xBC\xC5\xB6\xD6\x28" + "\x6A\x9D\xDF\x11\x5E\xFE\x5E\x9D" + "\x2F\x70\x44\x92\xF7\xF2\xE3\xEF", + .rlen = 32, + + }, { + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\x00\x00\x00\x00", + .klen = 20, + .iv = zeroed_string, + .input = "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01", + .ilen = 16, + .assoc = zeroed_string, + .alen = 8, + .result = "\x4B\xB1\xB5\xE3\x25\x71\x70\xDE" + "\x7F\xC9\x9C\xA5\x14\x19\xF2\xAC" + "\x0B\x8F\x88\x69\x17\xE6\xB4\x3C" + "\xB1\x68\xFD\x14\x52\x64\x61\xB2", + .rlen = 32, + }, { + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\x00\x00\x00\x00", + .klen = 20, + .iv = zeroed_string, + .input = "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01", + .ilen = 16, + .assoc = "\x01\x01\x01\x01\x01\x01\x01\x01", + .alen = 8, + .result = "\x4B\xB1\xB5\xE3\x25\x71\x70\xDE" + "\x7F\xC9\x9C\xA5\x14\x19\xF2\xAC" + "\x90\x92\xB7\xE3\x5F\xA3\x9A\x63" + "\x7E\xD7\x1F\xD8\xD3\x7C\x4B\xF5", + .rlen = 32, + }, { + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\x00\x00\x00\x00", + .klen = 20, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x01" + "\x00\x00\x00\x00", + .input = "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01", + .ilen = 16, + .assoc = "\x01\x01\x01\x01\x01\x01\x01\x01", + .alen = 8, + .result = "\xC1\x0C\x8A\x43\x0E\x8E\x35\x19" + "\x89\xB0\xC4\xBD\xC4\xB7\xD7\x29" + "\x64\x50\xF9\x32\x13\xFB\x74\x61" + "\xF4\xED\x52\xD3\xC5\x10\x55\x3C", + .rlen = 32, + }, { + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\x00\x00\x00\x00", + .klen = 20, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x01" + "\x00\x00\x00\x00", + .input = "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01", + .ilen = 64, + .assoc = "\x01\x01\x01\x01\x01\x01\x01\x01", + .alen = 8, + .result = "\xC1\x0C\x8A\x43\x0E\x8E\x35\x19" + "\x89\xB0\xC4\xBD\xC4\xB7\xD7\x29" + "\x98\x14\xA1\x42\x37\x80\xFD\x90" + "\x68\x12\x01\xA8\x91\x89\xB9\x83" + "\x5B\x11\x77\x12\x9B\xFF\x24\x89" + "\x94\x5F\x18\x12\xBA\x27\x09\x39" + "\x99\x96\x76\x42\x15\x1C\xCD\xCB" + "\xDC\xD3\xDA\x65\x73\xAF\x80\xCD" + "\xD2\xB6\xC2\x4A\x76\xC2\x92\x85" + "\xBD\xCF\x62\x98\x58\x14\xE5\xBD", + .rlen = 80, + }, { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x00\x00\x00\x00", + .klen = 20, + .iv = "\x00\x00\x45\x67\x89\xab\xcd\xef" + "\x00\x00\x00\x00", + .input = "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff", + .ilen = 192, + .assoc = "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa" + "\xaa\xaa\xaa\xaa", + .alen = 12, + .result = "\xC1\x76\x33\x85\xE2\x9B\x5F\xDE" + "\xDE\x89\x3D\x42\xE7\xC9\x69\x8A" + "\x44\x6D\xC3\x88\x46\x2E\xC2\x01" + "\x5E\xF6\x0C\x39\xF0\xC4\xA5\x82" + "\xCD\xE8\x31\xCC\x0A\x4C\xE4\x44" + "\x41\xA9\x82\x6F\x22\xA1\x23\x1A" + "\xA8\xE3\x16\xFD\x31\x5C\x27\x31" + "\xF1\x7F\x01\x63\xA3\xAF\x70\xA1" + "\xCF\x07\x57\x41\x67\xD0\xC4\x42" + "\xDB\x18\xC6\x4C\x4C\xE0\x3D\x9F" + "\x05\x07\xFB\x13\x7D\x4A\xCA\x5B" + "\xF0\xBF\x64\x7E\x05\xB1\x72\xEE" + "\x7C\x3B\xD4\xCD\x14\x03\xB2\x2C" + "\xD3\xA9\xEE\xFA\x17\xFC\x9C\xDF" + "\xC7\x75\x40\xFF\xAE\xAD\x1E\x59" + "\x2F\x30\x24\xFB\xAD\x6B\x10\xFA" + "\x6C\x9F\x5B\xE7\x25\xD5\xD0\x25" + "\xAC\x4A\x4B\xDA\xFC\x7A\x85\x1B" + "\x7E\x13\x06\x82\x08\x17\xA4\x35" + "\xEC\xC5\x8D\x63\x96\x81\x0A\x8F" + "\xA3\x05\x38\x95\x20\x1A\x47\x04" + "\x6F\x6D\xDA\x8F\xEF\xC1\x76\x35" + "\x6B\xC7\x4D\x0F\x94\x12\xCA\x3E" + "\x2E\xD5\x03\x2E\x86\x7E\xAA\x3B" + "\x37\x08\x1C\xCF\xBA\x5D\x71\x46" + "\x80\x72\xB0\x4C\x82\x0D\x60\x3C", + .rlen = 208, + } +}; + +static struct aead_testvec aes_gcm_rfc4106_dec_tv_template[] = { + { /* Generated using Crypto++ */ + .key = zeroed_string, + .klen = 20, + .iv = zeroed_string, + .input = "\x03\x88\xDA\xCE\x60\xB6\xA3\x92" + "\xF3\x28\xC2\xB9\x71\xB2\xFE\x78" + "\x97\xFE\x4C\x23\x37\x42\x01\xE0" + "\x81\x9F\x8D\xC5\xD7\x41\xA0\x1B", + .ilen = 32, + .assoc = zeroed_string, + .alen = 8, + .result = zeroed_string, + .rlen = 16, + + },{ + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\x00\x00\x00\x00", + .klen = 20, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x01" + "\x00\x00\x00\x00", + .input = "\xC0\x0D\x8B\x42\x0F\x8F\x34\x18" + "\x88\xB1\xC5\xBC\xC5\xB6\xD6\x28" + "\x6A\x9D\xDF\x11\x5E\xFE\x5E\x9D" + "\x2F\x70\x44\x92\xF7\xF2\xE3\xEF", + .ilen = 32, + .assoc = zeroed_string, + .alen = 8, + .result = zeroed_string, + .rlen = 16, + }, { + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\x00\x00\x00\x00", + .klen = 20, + .iv = zeroed_string, + .input = "\x4B\xB1\xB5\xE3\x25\x71\x70\xDE" + "\x7F\xC9\x9C\xA5\x14\x19\xF2\xAC" + "\x0B\x8F\x88\x69\x17\xE6\xB4\x3C" + "\xB1\x68\xFD\x14\x52\x64\x61\xB2", + .ilen = 32, + .assoc = zeroed_string, + .alen = 8, + .result = "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01", + .rlen = 16, + }, { + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\x00\x00\x00\x00", + .klen = 20, + .iv = zeroed_string, + .input = "\x4B\xB1\xB5\xE3\x25\x71\x70\xDE" + "\x7F\xC9\x9C\xA5\x14\x19\xF2\xAC" + "\x90\x92\xB7\xE3\x5F\xA3\x9A\x63" + "\x7E\xD7\x1F\xD8\xD3\x7C\x4B\xF5", + .ilen = 32, + .assoc = "\x01\x01\x01\x01\x01\x01\x01\x01", + .alen = 8, + .result = "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01", + .rlen = 16, + + }, { + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\x00\x00\x00\x00", + .klen = 20, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x01" + "\x00\x00\x00\x00", + .input = "\xC1\x0C\x8A\x43\x0E\x8E\x35\x19" + "\x89\xB0\xC4\xBD\xC4\xB7\xD7\x29" + "\x64\x50\xF9\x32\x13\xFB\x74\x61" + "\xF4\xED\x52\xD3\xC5\x10\x55\x3C", + .ilen = 32, + .assoc = "\x01\x01\x01\x01\x01\x01\x01\x01", + .alen = 8, + .result = "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01", + .rlen = 16, + }, { + .key = "\xfe\xff\xe9\x92\x86\x65\x73\x1c" + "\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\x00\x00\x00\x00", + .klen = 20, + .iv = "\x00\x00\x00\x00\x00\x00\x00\x01" + "\x00\x00\x00\x00", + .input = "\xC1\x0C\x8A\x43\x0E\x8E\x35\x19" + "\x89\xB0\xC4\xBD\xC4\xB7\xD7\x29" + "\x98\x14\xA1\x42\x37\x80\xFD\x90" + "\x68\x12\x01\xA8\x91\x89\xB9\x83" + "\x5B\x11\x77\x12\x9B\xFF\x24\x89" + "\x94\x5F\x18\x12\xBA\x27\x09\x39" + "\x99\x96\x76\x42\x15\x1C\xCD\xCB" + "\xDC\xD3\xDA\x65\x73\xAF\x80\xCD" + "\xD2\xB6\xC2\x4A\x76\xC2\x92\x85" + "\xBD\xCF\x62\x98\x58\x14\xE5\xBD", + .ilen = 80, + .assoc = "\x01\x01\x01\x01\x01\x01\x01\x01", + .alen = 8, + .result = "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01" + "\x01\x01\x01\x01\x01\x01\x01\x01", + .rlen = 64, + }, { + .key = "\x00\x01\x02\x03\x04\x05\x06\x07" + "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" + "\x00\x00\x00\x00", + .klen = 20, + .iv = "\x00\x00\x45\x67\x89\xab\xcd\xef" + "\x00\x00\x00\x00", + .input = "\xC1\x76\x33\x85\xE2\x9B\x5F\xDE" + "\xDE\x89\x3D\x42\xE7\xC9\x69\x8A" + "\x44\x6D\xC3\x88\x46\x2E\xC2\x01" + "\x5E\xF6\x0C\x39\xF0\xC4\xA5\x82" + "\xCD\xE8\x31\xCC\x0A\x4C\xE4\x44" + "\x41\xA9\x82\x6F\x22\xA1\x23\x1A" + "\xA8\xE3\x16\xFD\x31\x5C\x27\x31" + "\xF1\x7F\x01\x63\xA3\xAF\x70\xA1" + "\xCF\x07\x57\x41\x67\xD0\xC4\x42" + "\xDB\x18\xC6\x4C\x4C\xE0\x3D\x9F" + "\x05\x07\xFB\x13\x7D\x4A\xCA\x5B" + "\xF0\xBF\x64\x7E\x05\xB1\x72\xEE" + "\x7C\x3B\xD4\xCD\x14\x03\xB2\x2C" + "\xD3\xA9\xEE\xFA\x17\xFC\x9C\xDF" + "\xC7\x75\x40\xFF\xAE\xAD\x1E\x59" + "\x2F\x30\x24\xFB\xAD\x6B\x10\xFA" + "\x6C\x9F\x5B\xE7\x25\xD5\xD0\x25" + "\xAC\x4A\x4B\xDA\xFC\x7A\x85\x1B" + "\x7E\x13\x06\x82\x08\x17\xA4\x35" + "\xEC\xC5\x8D\x63\x96\x81\x0A\x8F" + "\xA3\x05\x38\x95\x20\x1A\x47\x04" + "\x6F\x6D\xDA\x8F\xEF\xC1\x76\x35" + "\x6B\xC7\x4D\x0F\x94\x12\xCA\x3E" + "\x2E\xD5\x03\x2E\x86\x7E\xAA\x3B" + "\x37\x08\x1C\xCF\xBA\x5D\x71\x46" + "\x80\x72\xB0\x4C\x82\x0D\x60\x3C", + .ilen = 208, + .assoc = "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa" + "\xaa\xaa\xaa\xaa", + .alen = 12, + .resul